In [None]:

import pandas as pd

# Read the dataset from disk into a DataFrame
df = pd.read_csv('data.csv')

# Print, in this order, the first rows (head), the last rows (tail) and the
# summary statistics (describe). Each accessor is called right before its
# result is printed, so the output order matches the call order.
for accessor in ('head', 'tail', 'describe'):
    print(getattr(df, accessor)())

# dtypes is an attribute rather than a method, so it is printed separately
print(df.dtypes)


In [None]:

import pandas as pd
import numpy as np

# Sample data: three cereal records, one list per column
data = {
    'name': ['100% Bran', '100% Natural Bran', 'All-Bran'],
    'mfr': ['N', 'Q', 'K'],
    'type': ['C', 'C', 'C'],
    'calories': [70, 120, 70],
    'protein': [4, 3, 4],
    'fat': [1, 5, 1],
    'sodium': [130, 15, 260],
    'fiber': [10.0, 2.0, 9.0],
    'carbo': [5.0, 8.0, 7.0],
    'sugars': [6, 8, 5],
    'potass': [280, 135, 320],
    'vitamins': [25, 0, 25],
    'shelf': [3, 3, 3],
    'weight': [1.0, 1.0, 1.0],
    'cups': [0.33, 1.0, 0.33],
    'rating': [68.402973, 33.983679, 59.425505]
}

# Columns eligible for imputation. `float_cols` is the subset of
# `numeric_cols` that is filled with the median instead of the mean.
numeric_cols = ['calories', 'protein', 'fat', 'sodium', 'fiber', 'carbo', 'sugars', 'potass', 'vitamins', 'shelf', 'weight', 'cups', 'rating']
float_cols = ['fiber', 'carbo', 'weight', 'cups']


def clean_cereals(raw, numeric_cols, float_cols):
    """Return a cleaned copy of `raw`; the input frame is not modified.

    Steps, in order:
      1. Blank strings in ``mfr`` are converted to NaN and rows without a
         manufacturer are dropped.
      2. Missing values in `float_cols` are filled with the column median.
      3. Missing values in the remaining `numeric_cols` are filled with the
         column mean.
      4. Rows that still contain a missing value (i.e. in columns that are
         not imputed) are dropped.

    The imputation runs *before* the blanket ``dropna()``. In the previous
    ordering every incomplete row was dropped first, so neither ``fillna``
    could ever fill anything.

    NOTE(review): using the median for `float_cols` and the mean for the
    other numeric columns follows the comments of the original cell; confirm
    that this split is the intended imputation policy.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame containing an ``mfr`` column and every column named in
        `numeric_cols` / `float_cols`.
    numeric_cols : list of str
        Numeric columns to impute.
    float_cols : list of str
        Columns imputed with the median rather than the mean.

    Returns
    -------
    pandas.DataFrame
        New frame without missing values; surviving rows keep their
        original index labels.
    """
    mean_cols = [col for col in numeric_cols if col not in float_cols]
    return (
        raw
        # Assign the replaced column back instead of calling
        # `df['mfr'].replace(..., inplace=True)`: an in-place method on a
        # column selection is chained assignment, which warns in pandas 2.2
        # and does not update the parent frame under Copy-on-Write.
        .assign(mfr=raw['mfr'].replace('', np.nan))
        .dropna(subset=['mfr'])
        # fillna with a Series fills each column named in the Series index
        # with its own statistic; other columns are left untouched.
        .pipe(lambda frame: frame.fillna(frame[float_cols].median()))
        .pipe(lambda frame: frame.fillna(frame[mean_cols].mean()))
        .dropna()
    )


# Create DataFrame and clean it
df = clean_cereals(pd.DataFrame(data), numeric_cols, float_cols)

# Print the cleaned DataFrame
print(df)


In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Sample Data: per-column values for three cereals, listed in row order
_column_values = {
    'name': ['100% Bran', '100% Natural Bran', 'All-Bran'],
    'mfr': ['N', 'Q', 'K'],
    'type': ['C', 'C', 'C'],
    'calories': [70, 120, 70],
    'protein': [4, 3, 4],
    'fat': [1, 5, 1],
    'sodium': [130, 15, 260],
    'fiber': [10.0, 2.0, 9.0],
    'carbo': [5.0, 8.0, 7.0],
    'sugars': [6, 8, 5],
    'potass': [280, 135, 320],
    'vitamins': [25, 0, 25],
    'shelf': [3, 3, 3],
    'weight': [1.0, 1.0, 1.0],
    'cups': [0.33, 1.0, 0.33],
    'rating': [68.402973, 33.983679, 59.425505],
}

# Key every value by its row position as a string ('0', '1', '2'), giving a
# column -> {row label -> value} mapping.
data = {
    column: {str(position): value for position, value in enumerate(values)}
    for column, values in _column_values.items()
}

# Build the DataFrame; the string keys of the inner dicts become the row index
df = pd.DataFrame(data)

# Exploratory Data Analysis: print each summary behind its label
_summaries = (
    ("Data Shape:", df.shape),
    ("Data Columns:", df.columns),
    ("Data Types:", df.dtypes),
    ("Data Summary:\n", df.describe()),
)
for _label, _value in _summaries:
    print(_label, _value)

# Data Visualization
def plot_against_rating(frame, column, xlabel, title, plotter):
    """Draw one 10x6 figure of ``rating`` against `column` and show it.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to plot; must contain `column` and ``rating``.
    column : str
        Column placed on the x axis.
    xlabel : str
        Label for the x axis.
    title : str
        Figure title.
    plotter : callable
        Seaborn axes-level function (``sns.barplot`` or ``sns.scatterplot``)
        called with ``x``, ``y``, ``data`` and ``ax``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    plotter(x=column, y='rating', data=frame, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Rating')
    plt.show()
    # Release the figure explicitly so figures do not accumulate on backends
    # where show() does not close them.
    plt.close(fig)


# Categorical columns -> bar plots: (column, x-axis label, title)
bar_plots = [
    ('name', 'Product Name', 'Product Rating'),
    ('mfr', 'Manufacturer', 'Manufacturer Rating'),
    ('type', 'Product Type', 'Product Type Rating'),
]

# Numeric columns -> scatter plots: (column, x-axis label);
# the title is "<label> vs Rating"
scatter_plots = [
    ('calories', 'Calories'),
    ('protein', 'Protein'),
    ('fat', 'Fat'),
    ('sodium', 'Sodium'),
    ('fiber', 'Fiber'),
    ('carbo', 'Carbohydrates'),
    ('sugars', 'Sugars'),
    ('potass', 'Potassium'),
    ('vitamins', 'Vitamins'),
    ('shelf', 'Shelf'),
    ('weight', 'Weight'),
    ('cups', 'Cups'),
]

for column, xlabel, title in bar_plots:
    plot_against_rating(df, column, xlabel, title, sns.barplot)

for column, xlabel in scatter_plots:
    plot_against_rating(df, column, xlabel, xlabel + ' vs Rating', sns.scatterplot)
