# 📘 Advanced Internship Assignment: Data Exploration & Analysis in Python

---

## 📦 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(style='whitegrid')

## 🧾 2. Load the Dataset (Big Mart Sales)

# Load the dataset
file_path = "dataset/big_mart_sales.csv"
df = pd.read_csv(file_path)
df.head()

In [None]:
# Load the dataset
df = pd.read_csv('big_mart_sales.csv')

### ✅ Task:
- Display the dataset info
- Count missing values
- Understand the basic structure

In [None]:
df.info()
df.isnull().sum()

## 🔧 3. Data Cleaning & Imputation

# Fill missing weights with mean
if 'Item_Weight' in df.columns:
    df['Item_Weight'].fillna(df['Item_Weight'].mean(), inplace=True)

# Fill outlet size with mode
if 'Outlet_Size' in df.columns:
    df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0], inplace=True)

In [None]:
# --- Data Cleaning & Imputation ---
# Fill missing Item_Weight with mean
if 'Item_Weight' in df.columns:
    df['Item_Weight'].fillna(df['Item_Weight'].mean(), inplace=True)

# Fill missing Outlet_Size with mode (most common value)
if 'Outlet_Size' in df.columns:
    df['Outlet_Size'].fillna(df['Outlet_Size'].mode()[0], inplace=True)

## 📊 4. Feature Engineering

# Create new features based on domain knowledge
if 'Item_Visibility' in df.columns:
    df['Item_Visibility_Bin'] = pd.cut(df['Item_Visibility'], bins=[-1, 0.02, 0.07, 0.2, 1.0], labels=['Low', 'Medium', 'High', 'Very High'])

# Normalize Item_MRP
if 'Item_MRP' in df.columns:
    df['Item_MRP_Normalized'] = (df['Item_MRP'] - df['Item_MRP'].min()) / (df['Item_MRP'].max() - df['Item_MRP'].min())

In [None]:
# --- Feature Engineering ---
# Create visibility bins (Low, Medium, High, Very High)
if 'Item_Visibility' in df.columns:
    df['Item_Visibility_Bin'] = pd.cut(df['Item_Visibility'],
                                       bins=[-1, 0.02, 0.07, 0.2, 1.0],
                                       labels=['Low', 'Medium', 'High', 'Very High'])

# Normalize Item_MRP (scale between 0 and 1)
if 'Item_MRP' in df.columns:
    df['Item_MRP_Normalized'] = (df['Item_MRP'] - df['Item_MRP'].min()) / (df['Item_MRP'].max() - df['Item_MRP'].min())

## 📈 5. Exploratory Data Analysis (EDA)

# Distribution of target variable
plt.figure(figsize=(8, 4))
sns.histplot(df['Item_Outlet_Sales'], kde=True)
plt.title('Distribution of Sales')
plt.show()

In [None]:
# 1. Distribution of Sales
plt.figure(figsize=(8, 4))
sns.histplot(df['Item_Outlet_Sales'], kde=True, color='blue')
plt.title('Distribution of Item Outlet Sales')
plt.xlabel('Sales')
plt.ylabel('Count')
plt.savefig('sales_distribution.png')  # Save chart for dashboard
plt.show()

# Sales by Outlet Type
plt.figure(figsize=(8, 4))
sns.boxplot(data=df, x='Outlet_Type', y='Item_Outlet_Sales')
plt.title('Sales by Outlet Type')
plt.xticks(rotation=45)
plt.show()

In [None]:
# 2. Sales by Outlet Type
plt.figure(figsize=(8, 4))
sns.boxplot(data=df, x='Outlet_Type', y='Item_Outlet_Sales', palette='Set2')
plt.title('Sales by Outlet Type')
plt.xlabel('Outlet Type')
plt.ylabel('Sales')
plt.xticks(rotation=45)
plt.savefig('sales_by_outlet_type.png')  # Save chart for dashboard
plt.show()

## 📐 6. Advanced Subsetting with NumPy & Conditions

# Filter high selling products
high_sales = df['Item_Outlet_Sales'] > df['Item_Outlet_Sales'].quantile(0.90)
df_high_sales = df[high_sales]
df_high_sales[['Item_Identifier', 'Item_Outlet_Sales']].head()

In [None]:
# Filter high-selling products (top 10% of sales)
high_sales = df['Item_Outlet_Sales'] > df['Item_Outlet_Sales'].quantile(0.90)
df_high_sales = df[high_sales]
print("Top 5 High-Selling Products:")
print(df_high_sales[['Item_Identifier', 'Item_Outlet_Sales']].head())

# Use NumPy to apply conditional logic
df['High_Price'] = np.where(df['Item_MRP'] > 200, 1, 0)
df[['Item_MRP', 'High_Price']].head()

In [None]:
# Create High_Price column (1 if Item_MRP > 200, else 0)
df['High_Price'] = np.where(df['Item_MRP'] > 200, 1, 0)
print("\nHigh Price Indicator (First 5 Rows):")
print(df[['Item_MRP', 'High_Price']].head())

## 📉 7. Correlation Heatmap

In [None]:
plt.figure(figsize=(10, 6))
corr = df.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Feature Correlation Heatmap')
plt.show()

## 📌 Final Project Task

'''Create a business dashboard answering:
1. What product categories perform best?
2. Which outlet types generate more revenue?
3. Are high MRP products really driving more sales?

Deliverables:
- Python notebook with visualizations
- Insight summary (2 paragraphs)
- Exported charts/images if needed
'''


## --- Business Dashboard ---

In [None]:
# 1. Best Performing Product Categories
sales_by_type = df.groupby('Item_Type')['Item_Outlet_Sales'].sum().sort_values(ascending=False)
plt.figure(figsize=(8, 5))
sales_by_type.head(5).plot(kind='bar', color='lightgreen')
plt.title('Top 5 Product Categories by Sales')
plt.xlabel('Product Category')
plt.ylabel('Total Sales')
plt.xticks(rotation=45)
plt.savefig('top_product_categories.png')  # Save chart for dashboard
plt.show()

# 2. Revenue by Outlet Type
revenue_by_outlet = df.groupby('Outlet_Type')['Item_Outlet_Sales'].sum().sort_values(ascending=False)
plt.figure(figsize=(8, 5))
revenue_by_outlet.plot(kind='bar', color='lightcoral')
plt.title('Revenue by Outlet Type')
plt.xlabel('Outlet Type')
plt.ylabel('Total Revenue')
plt.xticks(rotation=45)
plt.savefig('revenue_by_outlet_type.png')  # Save chart for dashboard
plt.show()

# 3. High MRP vs Sales
plt.figure(figsize=(8, 5))
plt.scatter(df['Item_MRP'], df['Item_Outlet_Sales'], c=df['High_Price'], cmap='viridis', alpha=0.5)
plt.title('Item MRP vs Sales (High Price Highlighted)')
plt.xlabel('Item MRP')
plt.ylabel('Sales')
plt.axvline(x=200, color='red', linestyle='--', label='High Price Threshold (200)')
plt.legend()
plt.savefig('mrp_vs_sales.png')  # Save chart for dashboard
plt.show()

# Business Dashboard Summary

The analysis reveals that product categories like Fruits and Vegetables, Snack Foods, and Household items generate the most sales, contributing significantly to total revenue. Supermarket Type1 and Type3 outlets outperform Grocery Stores, with Type3 showing the highest revenue due to larger sales volumes. This suggests focusing inventory and marketing efforts on these high-performing categories and outlet types to maximize profits. The sales distribution is skewed, with most products having low sales, indicating potential for optimizing stock for low-performing items.

High-MRP products (above 200) don’t consistently drive higher sales, as shown in the scatter plot, where many expensive items have low sales. The correlation heatmap confirms that Item_MRP has the strongest positive correlation with sales (around 0.5), while other features like Item_Weight show little impact. This suggests pricing strategies should be reviewed for high-MRP underperformers, and promotions could boost visibility for key categories. These insights can guide inventory management, pricing adjustments, and targeted marketing to enhance overall performance.