# 02 - Exploratory Data Analysis (EDA)

## Purpose
Understand data distributions, patterns, and relationships in the Smart Inventory Manager dataset.

## Sections
1. Dataset Overview
2. Univariate Analysis
3. Bivariate Analysis
4. Time-Series Analysis
5. Geographic Analysis
6. Key Statistics Summary

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib style sheet plus the seaborn colour palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Directory holding the processed CSV tables (relative to the working directory).
DATA_DIR = Path('../..', 'ml', 'data', 'processed')

# One DataFrame per CSV table.
customers = pd.read_csv(DATA_DIR.joinpath('customers.csv'))
products = pd.read_csv(DATA_DIR.joinpath('products.csv'))
inventory = pd.read_csv(DATA_DIR.joinpath('inventory.csv'))
orders = pd.read_csv(DATA_DIR.joinpath('orders.csv'))
order_items = pd.read_csv(DATA_DIR.joinpath('order_items.csv'))
sellers = pd.read_csv(DATA_DIR.joinpath('sellers.csv'))

# Convert OrderDate to datetime; later cells use the `.dt` accessor on it.
orders['OrderDate'] = pd.to_datetime(orders['OrderDate'])

# Join orders -> order items -> products (pandas' default inner join), so the
# analysis frame has one row per order item with order and product columns.
full_orders = (
    orders
    .merge(order_items, on='OrderID')
    .merge(products, on='ProductID')
)

print(f"Full orders dataset: {len(full_orders):,} rows")

## 1. Dataset Overview

In [None]:
# Print headline record counts, the covered date range and the number of
# products in each category.
banner = "=" * 60
print(banner)
print("DATASET OVERVIEW")
print(banner)

print("\nTotal Records:")
print(f"  Orders: {len(orders):,}")
print(f"  Unique Customers: {orders['CustomerID'].nunique():,}")
print(f"  Unique Products: {order_items['ProductID'].nunique():,}")
print(f"  Unique Sellers: {order_items['SellerID'].nunique():,}")

# Earliest and latest order timestamps, computed once and reused below.
first_order = orders['OrderDate'].min()
last_order = orders['OrderDate'].max()

print("\nDate Coverage:")
print(f"  Start: {first_order.strftime('%Y-%m-%d')}")
print(f"  End: {last_order.strftime('%Y-%m-%d')}")
print(f"  Duration: {(last_order - first_order).days} days")

print(f"\nProduct Categories: {products['Category'].nunique()}")
print(products['Category'].value_counts())

## 2. Univariate Analysis

In [None]:
# Univariate histograms on a 2x2 grid: order-item amount, quantity,
# product cost price and profit margin.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
bar_style = {'edgecolor': 'black', 'alpha': 0.7}

# Top-left: TotalAmount from order_items, with its mean and median marked.
amounts = order_items['TotalAmount']
amount_mean = amounts.mean()
amount_median = amounts.median()
ax = axes[0, 0]
ax.hist(amounts, bins=50, **bar_style)
ax.axvline(amount_mean, color='red', linestyle='--', label=f'Mean: ${amount_mean:.2f}')
ax.axvline(amount_median, color='green', linestyle='--', label=f'Median: ${amount_median:.2f}')
ax.set_title('Order Value Distribution')
ax.set_xlabel('Total Amount ($)')
ax.legend()

# The remaining panels share one recipe: (axes, data, bins, title, x-label).
# The quantity panel uses explicit bin edges 1..14 instead of a bin count.
simple_panels = (
    (axes[0, 1], order_items['Quantity'], range(1, 15), 'Order Quantity Distribution', 'Quantity'),
    (axes[1, 0], products['Cost_Price'], 50, 'Product Price Distribution', 'Cost Price ($)'),
    (axes[1, 1], order_items['Profit_Margin'], 50, 'Profit Margin Distribution', 'Profit Margin (%)'),
)
for ax, values, bins, title, xlabel in simple_panels:
    ax.hist(values, bins=bins, **bar_style)
    ax.set_title(title)
    ax.set_xlabel(xlabel)

plt.tight_layout()
plt.show()

In [None]:
# Category distribution: catalogue size and order volume per category.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left: number of products listed in each category.
category_counts = products['Category'].value_counts()
axes[0].barh(category_counts.index, category_counts.values)
axes[0].set_title('Products per Category')
axes[0].set_xlabel('Number of Products')

# Right: number of distinct orders that contain at least one item of the
# category. full_orders has one row per order item, so a plain row count
# would count an order once per matching item; nunique() counts it once.
order_category = (
    full_orders.groupby('Category')['OrderID']
    .nunique()
    .sort_values(ascending=False)
)
axes[1].barh(order_category.index, order_category.values)
axes[1].set_title('Orders per Category')
axes[1].set_xlabel('Number of Orders')

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side pie charts: share of orders by status and by payment method.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

status_counts = orders['OrderStatus'].value_counts()
payment_counts = orders['PaymentMethod'].value_counts()

# (value counts, chart title) for each pie, in left-to-right order.
pie_specs = (
    (status_counts, 'Order Status Distribution'),
    (payment_counts, 'Payment Method Distribution'),
)
for ax, (counts, title) in zip(axes, pie_specs):
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=90)
    ax.set_title(title)

plt.tight_layout()
plt.show()

## 3. Bivariate Analysis

In [None]:
# Revenue, profit, order count and units sold per product category.
# OrderCount uses nunique because full_orders has one row per order item;
# 'count' would count an order once for every item it has in the category.
# This matches how the daily and monthly cells count orders.
category_revenue = (
    full_orders.groupby('Category')
    .agg({
        'TotalAmount': 'sum',
        'Profit': 'sum',
        'OrderID': 'nunique',
        'Quantity': 'sum',
    })
    .rename(columns={'OrderID': 'OrderCount'})
    # Ascending sort puts the largest category at the top of a barh plot.
    .sort_values('TotalAmount', ascending=True)
)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (column, title, x-label) for each horizontal bar chart; values in thousands.
bar_specs = (
    ('TotalAmount', 'Revenue by Category', 'Revenue ($K)'),
    ('Profit', 'Profit by Category', 'Profit ($K)'),
)
for ax, (column, title, xlabel) in zip(axes, bar_specs):
    ax.barh(category_revenue.index, category_revenue[column] / 1000)
    ax.set_title(title)
    ax.set_xlabel(xlabel)

plt.tight_layout()
plt.show()

# Tabular view, highest-revenue category first (display is the notebook built-in).
display(category_revenue.sort_values('TotalAmount', ascending=False))

In [None]:
# Pairwise correlation of the numeric order-item columns, shown as a heatmap.
numeric_cols = ['Quantity', 'UnitPrice', 'Discount', 'Tax', 'TotalAmount', 'Profit', 'Profit_Margin']
correlation_matrix = order_items[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
# center=0 puts the midpoint of the diverging colour map at zero correlation.
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    ax=ax,
)
ax.set_title('Correlation Matrix - Order Metrics')
plt.tight_layout()
plt.show()

In [None]:
# Scatter of unit price against quantity for every order-item row.
fig, ax = plt.subplots(figsize=(10, 6))
# Low alpha so overlapping points show up as darker areas.
ax.scatter(order_items['UnitPrice'], order_items['Quantity'], alpha=0.3)
ax.set_xlabel('Unit Price ($)')
ax.set_ylabel('Quantity')
ax.set_title('Price vs Quantity Relationship')
plt.show()

## 4. Time-Series Analysis

In [None]:
# Daily orders, revenue and profit with a 7-day moving average.
# Grouping with a daily Grouper (rather than by the distinct dates present)
# keeps calendar days that have no orders as zero rows, so rolling(7) below
# spans seven calendar days instead of seven rows that happen to have data.
daily_stats = (
    full_orders
    .groupby(pd.Grouper(key='OrderDate', freq='D'))
    .agg({
        'OrderID': 'nunique',
        'TotalAmount': 'sum',
        'Profit': 'sum',
    })
    .reset_index()
)
daily_stats.columns = ['Date', 'Orders', 'Revenue', 'Profit']

fig, axes = plt.subplots(3, 1, figsize=(14, 12))

# (column, divisor, title, y-label); money columns are plotted in thousands.
panels = (
    ('Orders', 1, 'Daily Orders Over Time', 'Number of Orders'),
    ('Revenue', 1000, 'Daily Revenue Over Time', 'Revenue ($K)'),
    ('Profit', 1000, 'Daily Profit Over Time', 'Profit ($K)'),
)
for ax, (column, divisor, title, ylabel) in zip(axes, panels):
    series = daily_stats[column] / divisor
    ax.plot(daily_stats['Date'], series, alpha=0.7)
    # The first six points of the moving average are NaN (window not yet full).
    ax.plot(daily_stats['Date'], series.rolling(7).mean(), color='red', linewidth=2, label='7-day MA')
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Monthly orders, revenue, profit and active customers as bar charts.
# Adds a YearMonth period column to full_orders (kept on the frame afterwards).
full_orders['YearMonth'] = full_orders['OrderDate'].dt.to_period('M')

monthly_stats = (
    full_orders.groupby('YearMonth')
    .agg({
        'OrderID': 'nunique',
        'TotalAmount': 'sum',
        'Profit': 'sum',
        'CustomerID': 'nunique',
    })
    .reset_index()
)
monthly_stats.columns = ['Month', 'Orders', 'Revenue', 'Profit', 'Customers']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Bar positions and their month labels; only every third month gets a tick.
x = range(len(monthly_stats))
labels = list(map(str, monthly_stats['Month']))
tick_positions = x[::3]
tick_labels = labels[::3]

# (axes, bar heights, title) for each panel; money values in thousands.
panels = (
    (axes[0, 0], monthly_stats['Orders'], 'Monthly Orders'),
    (axes[0, 1], monthly_stats['Revenue'] / 1000, 'Monthly Revenue ($K)'),
    (axes[1, 0], monthly_stats['Profit'] / 1000, 'Monthly Profit ($K)'),
    (axes[1, 1], monthly_stats['Customers'], 'Monthly Active Customers'),
)
for ax, heights, title in panels:
    ax.bar(x, heights)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, rotation=45)
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Order volume and average order value by day of week.
# Adds a DayOfWeek name column to full_orders (kept on the frame afterwards).
full_orders['DayOfWeek'] = full_orders['OrderDate'].dt.day_name()
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# full_orders has one row per order item, so orders are counted with nunique
# and revenue is summed; reindex fixes the Monday..Sunday order (a weekday
# with no rows becomes NaN).
dow_stats = (
    full_orders.groupby('DayOfWeek')
    .agg({
        'OrderID': 'nunique',
        'TotalAmount': 'sum',
    })
    .reindex(day_order)
)
# Average order value = revenue on that weekday / distinct orders on that
# weekday. The column keeps its original name so existing references work.
dow_stats['TotalAmount'] = dow_stats['TotalAmount'] / dow_stats['OrderID']

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (column, title) for each bar chart.
panels = (
    ('OrderID', 'Orders by Day of Week'),
    ('TotalAmount', 'Average Order Value by Day of Week'),
)
for ax, (column, title) in zip(axes, panels):
    ax.bar(dow_stats.index, dow_stats[column])
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 5. Geographic Analysis

In [None]:
# Number of customers per state, limited to the 15 largest states.
state_stats = (
    customers.groupby('State')
    .size()
    .sort_values(ascending=False)
    .head(15)
)

fig, ax = plt.subplots(figsize=(12, 6))
# Reverse the order so the state with the most customers is drawn at the top.
ax.barh(state_stats.index[::-1], state_stats.values[::-1])
ax.set_title('Customers by State (Top 15)')
ax.set_xlabel('Number of Customers')
plt.tight_layout()
plt.show()

In [None]:
# Share of customers in each Customer_Type value, as a pie chart.
customer_types = customers['Customer_Type'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    customer_types,
    labels=customer_types.index,
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('Customer Type Distribution')
plt.show()

## 6. Key Statistics Summary

In [None]:
# Headline revenue, order, customer and product figures.
# Relies on `category_revenue` computed in the "Revenue by category" cell.
banner = "=" * 60
print("\n" + banner)
print("KEY STATISTICS SUMMARY")
print(banner)

print("\n=== Revenue Metrics ===")
print(f"Total Revenue: ${order_items['TotalAmount'].sum():,.2f}")
print(f"Total Profit: ${order_items['Profit'].sum():,.2f}")
print(f"Average Profit Margin: {order_items['Profit_Margin'].mean():.2f}%")

# order_items has one row per order item, so per-order figures must first be
# aggregated by OrderID; taking mean()/median() of the raw rows would describe
# line items, not orders. Orders with no item rows are not represented here.
items_by_order = order_items.groupby('OrderID')
order_totals = items_by_order['TotalAmount'].sum()
units_per_order = items_by_order['Quantity'].sum()
n_orders = orders['OrderID'].nunique()

print("\n=== Order Metrics ===")
print(f"Total Orders: {n_orders:,}")
print(f"Average Order Value: ${order_totals.mean():,.2f}")
print(f"Median Order Value: ${order_totals.median():,.2f}")
print(f"Average Items per Order: {units_per_order.mean():.2f}")

n_customers = customers['CustomerID'].nunique()
# Guard against an empty customers table instead of raising ZeroDivisionError.
orders_per_customer = n_orders / n_customers if n_customers else float('nan')

print("\n=== Customer Metrics ===")
print(f"Total Customers: {n_customers:,}")
print(f"Orders per Customer: {orders_per_customer:.2f}")

print("\n=== Product Metrics ===")
print(f"Total Products: {products['ProductID'].nunique():,}")
print(f"Categories: {products['Category'].nunique()}")
print(f"Brands: {products['Brand'].nunique()}")
print(f"Average Product Price: ${products['Cost_Price'].mean():,.2f}")

print("\n=== Top Performing Categories (by Revenue) ===")
top_categories = category_revenue.sort_values('TotalAmount', ascending=False).head(5)
# Only the revenue column is needed, so iterate that Series directly.
for cat, revenue in top_categories['TotalAmount'].items():
    print(f"  {cat}: ${revenue:,.2f}")