# 01 - Data Cleaning & Validation

## Purpose
Initial data quality assessment and cleaning for the Smart Inventory Manager.

## Sections
1. Load All CSVs
2. Data Quality Checks
3. Data Cleaning
4. Data Integrity Validation
5. Summary Statistics (with an optional save of the cleaned data)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Pandas display: no limit on displayed columns, at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Relative path to the processed CSVs (resolved against the current working directory).
DATA_DIR = Path('../..', 'ml', 'data', 'processed')
print(f"Data directory: {DATA_DIR}")

## 1. Load All CSVs

In [None]:
# Read each processed CSV into its own DataFrame.
customers = pd.read_csv(DATA_DIR.joinpath('customers.csv'))
products = pd.read_csv(DATA_DIR.joinpath('products.csv'))
inventory = pd.read_csv(DATA_DIR.joinpath('inventory.csv'))
orders = pd.read_csv(DATA_DIR.joinpath('orders.csv'))
order_items = pd.read_csv(DATA_DIR.joinpath('order_items.csv'))
sellers = pd.read_csv(DATA_DIR.joinpath('sellers.csv'))

# Report the row count of every table, labels padded so the numbers line up.
print("Dataset Sizes:")
print("\n".join(
    f"  {label:<13}{len(frame):,} rows"
    for label, frame in (
        ("Customers:", customers),
        ("Products:", products),
        ("Inventory:", inventory),
        ("Orders:", orders),
        ("Order Items:", order_items),
        ("Sellers:", sellers),
    )
))

In [None]:
# Customers preview: banner, first five rows, then the dtype of each column.
print("\n=== CUSTOMERS ===")
display(customers.head(5))
print(customers.dtypes)

In [None]:
# Products preview: banner, first five rows, then the dtype of each column.
print("\n=== PRODUCTS ===")
display(products.head(5))
print(products.dtypes)

In [None]:
# Inventory preview: banner, first five rows, then the dtype of each column.
print("\n=== INVENTORY ===")
display(inventory.head(5))
print(inventory.dtypes)

In [None]:
# Orders preview: banner, first five rows, then the dtype of each column.
print("\n=== ORDERS ===")
display(orders.head(5))
print(orders.dtypes)

In [None]:
# Order items preview: banner, first five rows, then the dtype of each column.
print("\n=== ORDER ITEMS ===")
display(order_items.head(5))
print(order_items.dtypes)

## 2. Data Quality Checks

In [None]:
def check_data_quality(df, name):
    """Print a data quality report for a DataFrame and return its missing-value table.

    The report covers the row and column counts, per-column missing values,
    fully duplicated rows and deep memory usage.

    Args:
        df: DataFrame to inspect.
        name: Label shown in the report header.

    Returns:
        DataFrame indexed by column name with 'Missing Count' and 'Missing %'
        columns, restricted to the columns that have at least one missing
        value (empty when nothing is missing).
    """
    # `display` is only injected by IPython/Jupyter; fall back to print so the
    # function also works when called from a plain Python process.
    try:
        show = display
    except NameError:
        show = print

    banner = '=' * 50
    print(f"\n{banner}")
    print(f"Data Quality Report: {name}")
    print(f"{banner}")

    # Basic info
    n_rows = len(df)
    print(f"\nRows: {n_rows:,}")
    print(f"Columns: {len(df.columns)}")

    # An empty DataFrame would otherwise produce 0/0 (NaN) percentages; with a
    # denominator of 1 every count is 0, so the reported percentage is 0.
    denominator = max(n_rows, 1)

    # Missing values
    missing = df.isnull().sum()
    missing_pct = (missing / denominator * 100).round(2)
    missing_df = pd.DataFrame({
        'Missing Count': missing,
        'Missing %': missing_pct
    })
    missing_df = missing_df[missing_df['Missing Count'] > 0]

    if len(missing_df) > 0:
        print("\nMissing Values:")
        show(missing_df)
    else:
        print("\nNo missing values found!")

    # Duplicate rows (rows identical to an earlier row across all columns)
    duplicates = int(df.duplicated().sum())
    print(f"\nDuplicate Rows: {duplicates:,} ({duplicates / denominator * 100:.2f}%)")

    # Memory usage in MiB; deep=True also measures object (e.g. string) contents
    memory = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory Usage: {memory:.2f} MB")

    return missing_df

In [None]:
# Build one quality report per table; each value is the missing-value
# summary returned by check_data_quality (reports print in this order).
quality_reports = {
    'customers': check_data_quality(customers, 'Customers'),
    'products': check_data_quality(products, 'Products'),
    'inventory': check_data_quality(inventory, 'Inventory'),
    'orders': check_data_quality(orders, 'Orders'),
    'order_items': check_data_quality(order_items, 'Order Items'),
    'sellers': check_data_quality(sellers, 'Sellers'),
}

In [None]:
# Count repeated values in each table's ID column.
print("\n=== Duplicate ID Check ===")
print("\n".join(
    f"Duplicate {id_column}s: {table[id_column].duplicated().sum()}"
    for table, id_column in (
        (customers, 'CustomerID'),
        (products, 'ProductID'),
        (orders, 'OrderID'),
        (sellers, 'SellerID'),
    )
))

### Outlier Detection

In [None]:
# Check for outliers in numeric columns
def detect_outliers(df, column, multiplier=1.5):
    """Count outliers in a numeric column using the IQR fence method.

    A value is an outlier when it lies strictly below
    ``Q1 - multiplier * IQR`` or strictly above ``Q3 + multiplier * IQR``.
    Missing values compare false on both sides and are never counted.

    Args:
        df: DataFrame containing the column.
        column: Name of the numeric column to inspect.
        multiplier: Width of the fences in IQR units. Defaults to 1.5,
            the value this function used when it was hard-coded.

    Returns:
        Tuple ``(count, lower, upper)``: the number of outlying rows and
        the lower and upper fence values.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - multiplier * iqr
    upper = q3 + multiplier * iqr
    # Sum the boolean mask directly instead of materialising the filtered rows.
    outlier_mask = (values < lower) | (values > upper)
    return int(outlier_mask.sum()), lower, upper

print("\n=== Outlier Detection ===")

# (table, column, label) for product prices, order amounts and quantities;
# each gets one report line with its outlier count and IQR fences.
for frame, column, label in (
    (products, 'Cost_Price', 'Product Cost_Price'),
    (order_items, 'TotalAmount', 'Order TotalAmount'),
    (order_items, 'Quantity', 'Order Quantity'),
):
    count, lower, upper = detect_outliers(frame, column)
    print(f"{label} outliers: {count} (range: {lower:.2f} - {upper:.2f})")

In [None]:
# Histograms of four key numeric columns on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# One entry per panel, in row-major order: (values, bins, title, x-axis label).
histogram_specs = (
    (products['Cost_Price'], 50, 'Product Cost Price Distribution', 'Cost Price'),
    (order_items['TotalAmount'], 50, 'Order Total Amount Distribution', 'Total Amount'),
    (order_items['Quantity'], 20, 'Order Quantity Distribution', 'Quantity'),
    (inventory['Current_Stock'], 50, 'Current Stock Distribution', 'Stock Level'),
)

# axes.flat walks the grid row by row, matching the order of the specs above.
for ax, (values, bin_count, title, xlabel) in zip(axes.flat, histogram_specs):
    ax.hist(values, bins=bin_count, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)

plt.tight_layout()
plt.show()

## 3. Data Cleaning

In [None]:
# Customers: work on a new DataFrame and replace missing text fields with defaults.
customers_clean = customers.assign(
    CustomerName=customers['CustomerName'].fillna('Unknown Customer'),
    Customer_Type=customers['Customer_Type'].fillna('New'),
    Country=customers['Country'].fillna('USA'),
)

print(f"Customers cleaned: {len(customers_clean):,} rows")

In [None]:
# Products: fill missing descriptive fields with defaults and raise any
# negative cost price to 0, so prices end up non-negative.
products_clean = products.assign(
    ProductName=products['ProductName'].fillna('Unknown Product'),
    Category=products['Category'].fillna('Uncategorized'),
    Brand=products['Brand'].fillna('Generic'),
    Cost_Price=products['Cost_Price'].clip(lower=0),
)

print(f"Products cleaned: {len(products_clean):,} rows")

In [None]:
# Inventory: for every stock-level column treat missing values as 0,
# cast to int, and raise negative values to 0.
inventory_clean = inventory.assign(**{
    stock_column: inventory[stock_column].fillna(0).astype(int).clip(lower=0)
    for stock_column in ('Initial_Stock', 'Current_Stock', 'Reorder_Level', 'Restock_Quantity')
})

print(f"Inventory cleaned: {len(inventory_clean):,} rows")

In [None]:
# Orders: parse both date columns. OrderDate is parsed strictly (invalid values
# raise), while unparseable Delivery_Date values become NaT.
orders_clean = orders.assign(
    OrderDate=pd.to_datetime(orders['OrderDate']),
    Delivery_Date=pd.to_datetime(orders['Delivery_Date'], errors='coerce'),
)

print(f"Orders cleaned: {len(orders_clean):,} rows")
print(f"Date range: {orders_clean['OrderDate'].min()} to {orders_clean['OrderDate'].max()}")

In [None]:
# Order items: raise quantities below 1 up to 1 and treat missing monetary
# adjustments (discount, tax, shipping, profit) as 0.
order_items_clean = order_items.assign(
    Quantity=order_items['Quantity'].clip(lower=1),
    **{
        money_column: order_items[money_column].fillna(0)
        for money_column in ('Discount', 'Tax', 'ShippingCost', 'Profit')
    },
)

print(f"Order items cleaned: {len(order_items_clean):,} rows")

## 4. Data Integrity Validation

In [None]:
# Referential integrity: every key used by a child table must exist in its parent table.
print("\n=== Foreign Key Validation ===")

# Keys that exist in each parent table.
valid_customers = set(customers_clean['CustomerID'].unique())
valid_products = set(products_clean['ProductID'].unique())
valid_sellers = set(sellers['SellerID'].unique())

# Keys referenced by the child tables.
order_customers = set(orders_clean['CustomerID'].unique())
order_products = set(order_items_clean['ProductID'].unique())
order_sellers = set(order_items_clean['SellerID'].unique())
inventory_products = set(inventory_clean['ProductID'].unique())

# Orders -> Customers
invalid_customer_refs = order_customers.difference(valid_customers)
print(f"Invalid CustomerID references in Orders: {len(invalid_customer_refs)}")

# Order Items -> Products
invalid_product_refs = order_products.difference(valid_products)
print(f"Invalid ProductID references in Order Items: {len(invalid_product_refs)}")

# Order Items -> Sellers
invalid_seller_refs = order_sellers.difference(valid_sellers)
print(f"Invalid SellerID references in Order Items: {len(invalid_seller_refs)}")

# Inventory -> Products
invalid_inventory_refs = inventory_products.difference(valid_products)
print(f"Invalid ProductID references in Inventory: {len(invalid_inventory_refs)}")

In [None]:
# Business-rule checks on the cleaned tables.
print("\n=== Business Rule Validation ===")

# NOTE(review): Quantity was clipped to >= 1 and Current_Stock to >= 0 during
# cleaning, so the quantity and stock counts below can only confirm that the
# clipping worked; UnitPrice was not cleaned, so that check is informative.
negative_quantities = order_items_clean['Quantity'].le(0).sum()
print(f"Orders with non-positive quantity: {negative_quantities}")

negative_prices = order_items_clean['UnitPrice'].lt(0).sum()
print(f"Orders with negative unit price: {negative_prices}")

negative_stock = inventory_clean['Current_Stock'].lt(0).sum()
print(f"Products with negative stock: {negative_stock}")

# Recompute each line total as quantity * unit price - discount + tax + shipping
# and compare it with the stored TotalAmount.
calculated_total = (
    order_items_clean['Quantity'] * order_items_clean['UnitPrice']
    - order_items_clean['Discount']
    + order_items_clean['Tax']
    + order_items_clean['ShippingCost']
)
diff = (calculated_total - order_items_clean['TotalAmount']).abs()
large_diff = diff.gt(1).sum()  # differences up to 1 are tolerated as rounding
print(f"Orders with total amount discrepancy > $1: {large_diff}")

## 5. Summary Statistics

In [None]:
# Final summary: table sizes, order date range and headline metrics.
print("\n" + "=" * 60)
print("DATA QUALITY SUMMARY")
print("=" * 60)

# Row counts after cleaning (sellers were not cleaned, so the raw table is used).
print("\nDataset Sizes (after cleaning):")
print("\n".join(
    f"  {label:<13}{len(frame):,}"
    for label, frame in (
        ("Customers:", customers_clean),
        ("Products:", products_clean),
        ("Inventory:", inventory_clean),
        ("Orders:", orders_clean),
        ("Order Items:", order_items_clean),
        ("Sellers:", sellers),
    )
))

print("\nDate Range:")
print(f"  First Order: {orders_clean['OrderDate'].min()}")
print(f"  Last Order:  {orders_clean['OrderDate'].max()}")

# Revenue, profit and average value are computed per order-item row.
print("\nKey Metrics:")
print(f"  Total Revenue: ${order_items_clean['TotalAmount'].sum():,.2f}")
print(f"  Total Profit:  ${order_items_clean['Profit'].sum():,.2f}")
print(f"  Avg Order Value: ${order_items_clean['TotalAmount'].mean():,.2f}")
print(f"  Unique Categories: {products_clean['Category'].nunique()}")

In [None]:
# Save cleaned data (optional - data is already loaded in database)
# Uncomment to save cleaned CSVs

# output_dir = Path('../data/cleaned')
# output_dir.mkdir(exist_ok=True)
# 
# customers_clean.to_csv(output_dir / 'customers_clean.csv', index=False)
# products_clean.to_csv(output_dir / 'products_clean.csv', index=False)
# inventory_clean.to_csv(output_dir / 'inventory_clean.csv', index=False)
# orders_clean.to_csv(output_dir / 'orders_clean.csv', index=False)
# order_items_clean.to_csv(output_dir / 'order_items_clean.csv', index=False)
# 
# print("Cleaned data saved to", output_dir)