# Data Overview and Initial Exploration

This notebook provides an initial exploration of the Amazon orders dataset for the Smart Inventory Manager project.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot appearance: apply matplotlib's 'default' style sheet first, then make
# seaborn's "husl" palette the active colour palette for later figures.
plt.style.use(style='default')
sns.set_palette(palette="husl")

# Configure pandas display
# None lifts the cap on the number of columns shown and lets pandas pick the
# output width itself, so wide frames are not truncated when displayed.
# The API is `pd.set_option` (singular); `pd.set_options` does not exist.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [None]:
# Load the dataset
# Reads the raw orders CSV into `df`. When the file is missing, a synthetic
# 1000-row, 20-column frame is generated instead so the remaining cells can
# still be executed.
data_path = Path('../data/raw/amazon_orders.csv')

if data_path.exists():
    df = pd.read_csv(data_path)
    print(f"Dataset loaded successfully with {len(df)} rows and {len(df.columns)} columns")
else:
    print(f"Dataset not found at {data_path}")
    print("Please ensure the amazon_orders.csv file is placed in the data/raw directory")
    # Create sample data for demonstration
    np.random.seed(42)  # fixed seed: the sample data is identical on every run
    n_rows = 1000

    # NOTE: keep the np.random.* calls in this order. They all consume the
    # same seeded stream, so reordering them changes the generated values.
    df = pd.DataFrame({
        'OrderID': [f'ORD{i:06d}' for i in range(n_rows)],
        # One order per hour. 'h' is the hourly alias; the upper-case 'H'
        # spelling is deprecated in recent pandas releases.
        'OrderDate': pd.date_range('2023-01-01', periods=n_rows, freq='h'),
        # Modulo arithmetic cycles through 100 customers, 50 products and
        # 10 sellers.
        'CustomerID': [f'CUST{i%100:04d}' for i in range(n_rows)],
        'CustomerName': [f'Customer {i%100}' for i in range(n_rows)],
        'ProductID': [f'PROD{i%50:04d}' for i in range(n_rows)],
        'ProductName': [f'Product {i%50}' for i in range(n_rows)],
        'Category': np.random.choice(['Electronics', 'Books', 'Clothing', 'Home', 'Sports'], n_rows),
        'Brand': np.random.choice(['BrandA', 'BrandB', 'BrandC', 'BrandD'], n_rows),
        # randint's upper bound is exclusive: quantities are 1..4.
        'Quantity': np.random.randint(1, 5, n_rows),
        'UnitPrice': np.random.uniform(10, 500, n_rows).round(2),
        'Discount': np.random.uniform(0, 50, n_rows).round(2),
        'Tax': np.random.uniform(0, 20, n_rows).round(2),
        'ShippingCost': np.random.uniform(0, 15, n_rows).round(2),
        # Drawn independently; NOT derived from quantity, price, discount,
        # tax or shipping, so it will not reconcile with those columns.
        'TotalAmount': np.random.uniform(20, 600, n_rows).round(2),
        'PaymentMethod': np.random.choice(['Credit Card', 'Debit Card', 'PayPal', 'Cash'], n_rows),
        'OrderStatus': np.random.choice(['Completed', 'Pending', 'Cancelled'], n_rows, p=[0.8, 0.15, 0.05]),
        'City': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], n_rows),
        # City and State are sampled independently, so pairs may not match.
        'State': np.random.choice(['NY', 'CA', 'IL', 'TX', 'AZ'], n_rows),
        'Country': ['USA'] * n_rows,
        'SellerID': [f'SELL{i%10:03d}' for i in range(n_rows)]
    })

print(f"Dataset shape: {df.shape}")
# deep=True also counts the contents of object (string) columns.
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

In [None]:
# Display basic information
print("=== Dataset Information ===")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
df.info()

print("\n=== First 5 rows ===")
display(df.head())

print("\n=== Data Types ===")
print(df.dtypes)

print("\n=== Summary Statistics ===")
# describe() with default arguments summarises the numeric columns only.
display(df.describe())

In [None]:
# Check for missing values
# Builds a per-column table of missing counts and percentages, then prints
# and plots only the columns that actually contain missing values.
print("=== Missing Values Analysis ===")
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100

missing_summary = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing Percentage': missing_percent
}).sort_values('Missing Count', ascending=False)

columns_with_missing = missing_data[missing_data > 0]

if columns_with_missing.empty:
    # A complete dataset leaves nothing to tabulate, and asking pandas for a
    # bar plot of an empty Series fails, so report the result and skip both.
    print("No missing values found in any column.")
else:
    print(missing_summary[missing_summary['Missing Count'] > 0])

    # Visualize missing values
    plt.figure(figsize=(12, 6))
    columns_with_missing.plot(kind='bar', color='skyblue')
    plt.title('Missing Values by Column')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

In [None]:
# Data visualization
# 2x2 overview figure: revenue per category, order-status share,
# best-selling products and payment-method usage.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
(ax_category, ax_status), (ax_products, ax_payment) = axes

# Sales by category (largest total first)
category_sales = (
    df.groupby('Category')['TotalAmount']
    .sum()
    .sort_values(ascending=False)
)
ax_category.bar(category_sales.index, category_sales.values, color='lightcoral')
ax_category.set(title='Total Sales by Category', ylabel='Total Amount ($)')
ax_category.tick_params(axis='x', rotation=45)

# Orders by status
status_counts = df['OrderStatus'].value_counts()
ax_status.pie(
    status_counts.values,
    labels=status_counts.index,
    autopct='%1.1f%%',
)
ax_status.set(title='Order Status Distribution')

# Top products by quantity sold (ten highest totals)
product_quantity = (
    df.groupby('ProductName')['Quantity']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
ax_products.barh(product_quantity.index, product_quantity.values, color='lightgreen')
ax_products.set(title='Top 10 Products by Quantity Sold', xlabel='Quantity Sold')

# Payment method distribution
payment_counts = df['PaymentMethod'].value_counts()
ax_payment.bar(payment_counts.index, payment_counts.values, color='lightblue')
ax_payment.set(title='Payment Method Distribution', ylabel='Number of Orders')
ax_payment.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Time series analysis
# Adds two helper columns to df ('Date' and 'Hour') and plots revenue
# aggregated per calendar day and per hour of day.
df['OrderDate'] = pd.to_datetime(df['OrderDate'])
df['Date'] = df['OrderDate'].dt.date

# Daily sales trend
daily_sales = (
    df.groupby('Date')['TotalAmount']
    .sum()
    .reset_index()
)

trend_fig, trend_ax = plt.subplots(figsize=(15, 6))
trend_ax.plot(
    daily_sales['Date'],
    daily_sales['TotalAmount'],
    marker='o',
    linestyle='-',
    alpha=0.7,
)
trend_ax.set_title('Daily Sales Trend')
trend_ax.set_xlabel('Date')
trend_ax.set_ylabel('Total Sales Amount ($)')
plt.xticks(rotation=45)
trend_ax.grid(True, alpha=0.3)
trend_fig.tight_layout()
plt.show()

# Sales by hour of day
df['Hour'] = df['OrderDate'].dt.hour
hourly_sales = df.groupby('Hour')['TotalAmount'].sum()

hour_fig, hour_ax = plt.subplots(figsize=(12, 6))
hour_ax.bar(hourly_sales.index, hourly_sales.values, color='orange', alpha=0.7)
hour_ax.set_title('Sales by Hour of Day')
hour_ax.set_xlabel('Hour')
hour_ax.set_ylabel('Total Sales Amount ($)')
# Show a tick for every hour 0-23, even hours that have no orders.
hour_ax.set_xticks(range(24))
hour_ax.grid(True, alpha=0.3)
hour_fig.tight_layout()
plt.show()

In [None]:
# Correlation analysis
# Pairwise correlation of the quantity and monetary columns, drawn as an
# annotated heatmap whose colour scale is centred on zero.
numeric_columns = [
    'Quantity',
    'UnitPrice',
    'Discount',
    'Tax',
    'ShippingCost',
    'TotalAmount',
]
correlation_matrix = df.loc[:, numeric_columns].corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f',
    ax=corr_ax,
)
corr_ax.set_title('Correlation Matrix of Numeric Variables')
corr_fig.tight_layout()
plt.show()

# Key insights
# Headline figures computed over every row of df, regardless of order status.
insight_lines = (
    "=== Key Insights ===",
    f"Total number of orders: {len(df)}",
    f"Total revenue: ${df['TotalAmount'].sum():,.2f}",
    f"Average order value: ${df['TotalAmount'].mean():.2f}",
    # mode() can return several values on a tie; the first one is reported.
    f"Most popular category: {df['Category'].mode().iloc[0]}",
    f"Most common payment method: {df['PaymentMethod'].mode().iloc[0]}",
    f"Order completion rate: {(df['OrderStatus'] == 'Completed').mean()*100:.1f}%",
)
for insight in insight_lines:
    print(insight)

# Save processed data
# Writes df to CSV without the index; df includes the 'Date' and 'Hour'
# helper columns if the time-series cell has been run.
processed_path = Path('../data/processed/amazon_orders_processed.csv')
# to_csv does not create missing folders, so make sure the target directory
# exists first (a no-op when it is already there).
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)
print(f"\nProcessed data saved to: {processed_path}")