# E-commerce Sales Prediction - Exploratory Data Analysis

This notebook contains comprehensive exploratory data analysis for the e-commerce sales prediction project. We'll analyze various aspects of the data to understand patterns and relationships that can help in predicting weekly sales.

## Setup and Data Loading

In [None]:
# Import required libraries
import findspark
findspark.init()

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col, when, count, mean, stddev
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Import project utilities
import sys
sys.path.append('../src')
from utils import plot_missing_values, plot_correlation_matrix, detect_outliers
from config import NUMERICAL_FEATURES, CATEGORICAL_FEATURES, BINARY_FEATURES

# Set up plotting style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# newer releases no longer ship the old 'seaborn' name, so pick whichever one
# this installation provides instead of failing on a newer matplotlib.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette('husl')

In [None]:
# Build (or reuse) the local Spark session this notebook runs on
spark = (
    SparkSession.builder
    .appName("SalesPrediction_EDA")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Read the raw sales CSV: the first row supplies the column names and Spark
# infers each column's type from the data
raw_csv_path = '../data/raw/sales_data.csv'
df = spark.read.csv(raw_csv_path, header=True, inferSchema=True)

# Report the dataset's size, then its schema
print("Dataset Overview:")
row_total = df.count()
column_total = len(df.columns)
print(f"Number of records: {row_total:,}")
print(f"Number of features: {column_total}")
print("\nSchema:")
df.printSchema()

## 1. Missing Data Analysis

In [None]:
# Analyze missing values
plot_missing_values(df)

# Get detailed missing value statistics.
# when() without otherwise() yields NULL for rows that do not match, and count()
# skips NULLs, so each aggregate below is the number of NULL entries in that
# column. The functions are reached through the `F` namespace on purpose: a later
# cell rebinding the bare name `col` must not break a re-run of this cell.
null_counts = df.select(
    [F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]
).toPandas()
total_rows = df.count()

# One row per column: name, NULL count, NULL share of all rows
missing_stats = null_counts.T.reset_index()
missing_stats.columns = ['Column', 'Missing Count']
missing_stats['Missing Percentage'] = (missing_stats['Missing Count'] / total_rows) * 100
missing_stats = missing_stats.sort_values('Missing Percentage', ascending=False)

# Only columns that actually have missing entries are listed
print("\nDetailed Missing Value Analysis:")
print(missing_stats[missing_stats['Missing Percentage'] > 0])

## 2. Descriptive Statistics

In [None]:
# Calculate descriptive statistics for numerical features
numeric_stats = df.select(NUMERICAL_FEATURES).describe().toPandas()
print("Numerical Features Statistics:")
display(numeric_stats)

# Display distribution plots for numerical features
numerical_data = df.select(NUMERICAL_FEATURES).toPandas()

# Size the grid from the feature list so no feature is silently dropped
# (nine features still give the 3x3, 15x15-inch layout).
n_plot_cols = 3
n_plot_rows = max(1, -(-len(NUMERICAL_FEATURES) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(
    n_plot_rows, n_plot_cols, figsize=(15, 5 * n_plot_rows), squeeze=False
)
axes = axes.ravel()

# The loop variable must not be called `col`: that would rebind the
# pyspark.sql.functions.col import at notebook scope and break every cell
# that calls col(...) afterwards.
for ax, feature in zip(axes, NUMERICAL_FEATURES):
    sns.histplot(data=numerical_data, x=feature, ax=ax)
    ax.set_title(f'Distribution of {feature}')

# Hide panels that have no feature to show
for ax in axes[len(NUMERICAL_FEATURES):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 3. Sales Analysis

In [None]:
# Analyze relationship between sales, stock, and price
sales_analysis = df.select('sales_count_week', 'stock', 'price', 'primaryPrice').toPandas()

# Price Discount Analysis: percentage by which `price` is below `primaryPrice`.
# Rows whose primaryPrice is 0 are masked to NaN first; dividing by zero would
# otherwise put +/-inf values into the column.
primary_price = sales_analysis['primaryPrice']
nonzero_primary_price = primary_price.mask(primary_price == 0)
sales_analysis['discount_percentage'] = (
    (nonzero_primary_price - sales_analysis['price']) / nonzero_primary_price * 100
)

fig, axes = plt.subplots(2, 2, figsize=(15, 15))

# One scatter panel per predictor, laid out row by row
scatter_panels = [
    ('stock', 'Sales vs Stock'),
    ('price', 'Sales vs Price'),
    ('primaryPrice', 'Sales vs Primary Price'),
    ('discount_percentage', 'Sales vs Discount Percentage'),
]
for ax, (x_column, panel_title) in zip(axes.ravel(), scatter_panels):
    sns.scatterplot(data=sales_analysis, x=x_column, y='sales_count_week', ax=ax, alpha=0.5)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## 4. Category Analysis

In [None]:
# Per-category aggregates: mean weekly sales, row count, mean price, mean rating
category_metrics = [
    F.avg('sales_count_week').alias('avg_sales'),
    F.count('*').alias('product_count'),
    F.avg('price').alias('avg_price'),
    F.avg('rating_average').alias('avg_rating'),
]
category_analysis = df.groupBy('categoryTitle').agg(*category_metrics).toPandas()

# Bar chart of the ten categories with the highest average weekly sales
top_categories = category_analysis.nlargest(10, 'avg_sales')
plt.figure(figsize=(15, 6))
sns.barplot(data=top_categories, x='categoryTitle', y='avg_sales')
plt.xticks(rotation=45, ha='right')
plt.title('Average Weekly Sales by Top 10 Categories')
plt.tight_layout()
plt.show()

# Table of the same ranking with all aggregate columns
print("\nCategory Statistics:")
ranked_categories = category_analysis.sort_values('avg_sales', ascending=False)
display(ranked_categories.head(10))

## 5. Shipping and Delivery Analysis

In [None]:
# Compare average weekly sales across the values of each shipping/delivery flag
shipping_features = [
    'isFreeShipping',
    'has_delivery',
    'vendor_freeShippingToIran',
    'vendor_freeShippingToSameCity',
]

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()

# One panel per flag: group by the flag, then plot the mean sales of each group
for ax, feature in zip(axes, shipping_features):
    grouped = df.groupBy(feature).agg(
        F.avg('sales_count_week').alias('avg_sales'),
        F.count('*').alias('count'),
    )
    shipping_stats = grouped.toPandas()

    sns.barplot(data=shipping_stats, x=feature, y='avg_sales', ax=ax)
    ax.set_title(f'Average Sales by {feature}')

plt.tight_layout()
plt.show()

## 6. Vendor Analysis

In [None]:
# Per-vendor aggregates, restricted to vendors with at least 10 rows
vendor_metrics = (
    df.groupBy('vendor_name')
    .agg(
        F.avg('sales_count_week').alias('avg_sales'),
        F.avg('rating_average').alias('avg_rating'),
        F.count('*').alias('product_count'),
    )
    .filter('product_count >= 10')
)
vendor_analysis = vendor_metrics.toPandas()

# Scatter of average rating against average weekly sales; marker area follows product count
plt.figure(figsize=(12, 8))
plt.scatter(
    vendor_analysis['avg_rating'],
    vendor_analysis['avg_sales'],
    s=vendor_analysis['product_count'],
    alpha=0.6,
)
plt.xlabel('Average Rating')
plt.ylabel('Average Weekly Sales')
plt.title('Vendor Performance: Sales vs Ratings (size = product count)')

# Label the five vendors with the highest average sales
top_vendors = vendor_analysis.nlargest(5, 'avg_sales')
labelled_points = zip(
    top_vendors['vendor_name'],
    top_vendors['avg_rating'],
    top_vendors['avg_sales'],
)
for vendor_label, rating_value, sales_value in labelled_points:
    plt.annotate(vendor_label, (rating_value, sales_value))

plt.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:
# Heatmap of pairwise correlations (drawn by the project helper)
plot_correlation_matrix(df, NUMERICAL_FEATURES)

# Correlation of every numerical feature with the weekly sales column
correlations = [
    {
        'feature': feature,
        'correlation': df.stat.corr('sales_count_week', feature),
    }
    for feature in NUMERICAL_FEATURES
]

# Rank by absolute strength so strong negative correlations are not buried
correlations_df = pd.DataFrame(correlations).sort_values(
    'correlation', key=abs, ascending=False
)

print("\nCorrelations with sales_count_week:")
display(correlations_df)

## 8. Outlier Detection

In [None]:
# Create box plots for numerical features
# (`numerical_data` is the pandas frame built in the descriptive-statistics cell)
plt.figure(figsize=(15, 6))
numerical_data.boxplot(column=NUMERICAL_FEATURES, figsize=(15, 6))
plt.xticks(rotation=45)
plt.title('Box Plots of Numerical Features')
plt.tight_layout()
plt.show()

# Detect outliers using Z-score method
# NOTE(review): the method and the |z| > 3 threshold in the label below are whatever
# utils.detect_outliers implements by default -- confirm against that helper.
total_rows = df.count()  # loop-invariant, so count the rows once rather than per feature
outlier_stats = {}
for feature in NUMERICAL_FEATURES:
    df_with_outliers = detect_outliers(df, feature)
    # F.col rather than the bare `col` import: an earlier cell uses `col` as a loop
    # variable, which leaves that name bound to a string by the time this cell runs.
    outlier_count = df_with_outliers.filter(F.col(f'{feature}_is_outlier')).count()
    # An empty dataset has no outliers; avoid dividing by zero
    outlier_percentage = (outlier_count / total_rows) * 100 if total_rows else 0.0
    outlier_stats[feature] = {
        'outlier_count': outlier_count,
        'outlier_percentage': outlier_percentage
    }

print("\nOutlier Statistics (|z-score| > 3):")
outlier_df = pd.DataFrame.from_dict(outlier_stats, orient='index')
display(outlier_df)

## 9. Key Insights Summary

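The analysis above feeds the insight areas listed below. The bullets are placeholders naming what to report in each area; replace them with the actual findings once the notebook has been run on the data.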

1. **Missing Data**:
   - [To be filled in from the missing-value table in Section 1 once the notebook has been run]

2. **Sales Patterns**:
   - Relationship between sales and stock levels
   - Impact of pricing on sales
   - Effect of discounts

3. **Category Performance**:
   - Top performing categories
   - Category-wise pricing strategies

4. **Shipping Impact**:
   - Effect of free shipping on sales
   - Delivery options influence

5. **Vendor Analysis**:
   - Top performing vendors
   - Relationship between ratings and sales

6. **Correlations**:
   - Strong predictors of sales
   - Feature relationships

7. **Outliers**:
   - Distribution of extreme values
   - Impact on modeling strategy