# Supermarket Sales Analysis (Advanced Python Project)
Put the dataset CSV at `SupermarketSalesAnalysis/data/supermarket_sales.csv`, then run the cells in order.


In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
# The project root is taken to be the parent of the current working
# directory; the CSV is read from <ROOT>/data/supermarket_sales.csv.
ROOT = Path.cwd().resolve().parents[0]
DATA = ROOT.joinpath('data', 'supermarket_sales.csv')
df = pd.read_csv(DATA)
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics for every column; include='all' also covers the
# non-numeric columns, not just the numeric ones.
df.describe(include='all')

## Cleaning


In [None]:
# Parse the date column; values that cannot be parsed become NaT.
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Impute missing values column by column:
#   * numeric columns  -> column median
#   * other columns    -> most frequent value ('Unknown' if there is none)
#   * datetime columns -> left untouched (see below)
for col in df.columns:
    series = df[col]
    if not series.isna().any():
        continue
    if pd.api.types.is_datetime64_any_dtype(series):
        # Do not invent timestamps: filling NaT with the most frequent date
        # would assign those rows to a real day and skew every per-date
        # aggregate, and the 'Unknown' fallback would put a string into a
        # datetime column. groupby() drops NaT keys by default, so leaving
        # them in place keeps date-grouped results honest.
        continue
    if pd.api.types.is_numeric_dtype(series):
        df[col] = series.fillna(series.median())
    else:
        modes = series.mode(dropna=True)
        df[col] = series.fillna(modes.iloc[0] if len(modes) else 'Unknown')

# Remaining missing-value count per column (first 15 columns shown).
df.isna().sum().head(15)

## Frequency bars (Branch, Customer type, Gender, Payment)


In [None]:
# One bar chart of value frequencies per categorical column; columns that
# are not present in the frame are skipped.
for col in ('Branch', 'Customer type', 'Gender', 'Payment'):
    if col not in df.columns:
        continue
    counts = df[col].value_counts()
    plt.figure()
    counts.plot(kind='bar')
    plt.title(f'Frequency of {col}')
    plt.tight_layout()
    plt.show()

## Sales over time + Rating trend


In [None]:
# Sales over time: sum of 'Total' per distinct 'Date' value, in date order.
if {'Date', 'Total'}.issubset(df.columns):
    daily = (
        df.groupby('Date')['Total']
        .sum()
        .reset_index()
        .sort_values('Date')
    )
    plt.figure()
    plt.plot(daily['Date'], daily['Total'])
    plt.xticks(rotation=45, ha='right')
    plt.title('Sales Over Time')
    plt.tight_layout()
    plt.show()

# Rating trend: mean 'Rating' per distinct 'Date' value, in date order.
if {'Date', 'Rating'}.issubset(df.columns):
    daily_r = (
        df.groupby('Date')['Rating']
        .mean()
        .reset_index()
        .sort_values('Date')
    )
    plt.figure()
    plt.plot(daily_r['Date'], daily_r['Rating'])
    plt.xticks(rotation=45, ha='right')
    plt.title('Rating Trend')
    plt.tight_layout()
    plt.show()

## Scatter + Heatmap + Boxplot


In [None]:
# Scatter plot of transaction total against rating.
if 'Total' in df.columns and 'Rating' in df.columns:
    plt.figure()
    sns.scatterplot(x=df['Total'], y=df['Rating'])
    plt.title('Sales vs Rating')
    plt.tight_layout()
    plt.show()

# Pairwise correlation of the numeric columns. `num` is already restricted
# to numeric dtypes, so corr() needs no `numeric_only` flag; that keyword is
# only accepted by pandas >= 1.5 and would raise TypeError on older versions.
num = df.select_dtypes(include=[np.number])
corr = num.corr()
# Guarded like the other plots in this notebook: with no numeric columns the
# correlation matrix is empty and there is nothing to draw.
if not corr.empty:
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.tight_layout()
    plt.show()

# Distribution of gross income for each product line.
if 'gross income' in df.columns and 'Product line' in df.columns:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='Product line', y='gross income', data=df)
    plt.xticks(rotation=25, ha='right')
    plt.title('Gross Income by Product Line')
    plt.tight_layout()
    plt.show()

## Advanced Questions (answers)


In [None]:
# Build one human-readable answer per question, keyed 'Q1'..'Q5'.
# A question is skipped when the columns it needs are not in the frame.
answers = {}

# Q1: branch with the largest summed 'Total'.
if {'Branch', 'Total'}.issubset(df.columns):
    revenue_by_branch = (
        df.groupby('Branch')['Total'].sum().sort_values(ascending=False)
    )
    top_branch = revenue_by_branch.index[0]
    top_revenue = revenue_by_branch.iloc[0]
    answers['Q1'] = f"Highest revenue branch: {top_branch} (Total={top_revenue:.2f})"

# Q2: mean 'Total' per customer type, highest first.
if {'Customer type', 'Total'}.issubset(df.columns):
    spend_by_type = (
        df.groupby('Customer type')['Total'].mean().sort_values(ascending=False)
    )
    answers['Q2'] = f"Avg spend: {spend_by_type.to_dict()}"

# Q3: most frequent payment method (value_counts is ordered by count).
if 'Payment' in df.columns:
    payment_counts = df['Payment'].value_counts()
    top_payment = payment_counts.index[0]
    top_payment_count = payment_counts.iloc[0]
    answers['Q3'] = f"Most used payment: {top_payment} (Count={top_payment_count})"

# Q4: product line with the highest mean rating.
if {'Product line', 'Rating'}.issubset(df.columns):
    rating_by_line = (
        df.groupby('Product line')['Rating'].mean().sort_values(ascending=False)
    )
    best_line = rating_by_line.index[0]
    best_rating = rating_by_line.iloc[0]
    answers['Q4'] = f"Highest avg rating product line: {best_line} (AvgRating={best_rating:.2f})"

# Q5: correlation between unit price and quantity, read from the
# off-diagonal cell of the 2x2 correlation matrix.
if {'Unit price', 'Quantity'}.issubset(df.columns):
    price_qty_corr = df[['Unit price', 'Quantity']].corr()
    corr_value = price_qty_corr.iloc[0, 1]
    answers['Q5'] = f"Correlation(Unit price, Quantity) = {corr_value:.3f}"

answers