# Advanced Customer and Sales Analysis

This notebook provides deep insights into:
1. Customer Segmentation and Behavior
2. Sales Patterns
3. Geographic Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from scipy import stats

# Set plotting style.
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# (deprecating the bare 'seaborn' name, which later releases removed), so use
# whichever name this matplotlib installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
plt.rcParams['figure.figsize'] = [12, 6]
sns.set_palette('husl')

# Read the cleaned data (path is relative to the notebook's working directory)
df = pd.read_csv('../data/cleaned_sales_data.csv')

# Convert date columns to datetime so the .dt accessors and date arithmetic
# used in the later cells work
df['Order Date'] = pd.to_datetime(df['Order Date'])
df['Ship Date'] = pd.to_datetime(df['Ship Date'])

# Print available columns
print("Available columns in dataset:")
print(df.columns.tolist())

## 1. Customer Analysis

In [None]:
# Customer purchase frequency: number of distinct orders plus total and mean
# sales per customer.
# Orders are counted with 'nunique' rather than a raw row count because the
# notebook treats an order as possibly spanning several rows (it sums Sales
# per 'Order ID' in the sales section). The (name, func) pair keeps the
# resulting column key as ('Order ID', 'count'), which the purchase-frequency
# histogram further down indexes by.
customer_frequency = df.groupby('Customer ID').agg({
    'Order ID': [('count', 'nunique')],
    'Sales': ['sum', 'mean']
}).round(2)

# Build the RFM table, measuring recency against the latest order date in
# the data
current_date = df['Order Date'].max()
rfm = df.groupby('Customer ID').agg({
    'Order Date': lambda x: (current_date - x.max()).days,  # Recency (days)
    'Order ID': 'nunique',  # Frequency (distinct orders)
    'Sales': 'sum'  # Monetary
}).round(2)

# Segment customers
rfm.columns = ['Recency', 'Frequency', 'Monetary']

# Quintile scores (1-5). Binning the raw values with duplicates='drop' while
# also passing five labels raises a ValueError as soon as a duplicate bin
# edge is actually dropped (labels no longer match the bin count). Ranking
# with method='first' breaks ties by position, so the bin edges are unique.
# Recency is labelled in reverse: the most recent customers get a 5.
rfm['R_Score'] = pd.qcut(rfm['Recency'].rank(method='first'), q=5, labels=[5, 4, 3, 2, 1])
rfm['F_Score'] = pd.qcut(rfm['Frequency'].rank(method='first'), q=5, labels=[1, 2, 3, 4, 5])
rfm['M_Score'] = pd.qcut(rfm['Monetary'].rank(method='first'), q=5, labels=[1, 2, 3, 4, 5])

# Calculate RFM Score: the three scores concatenated as text, e.g. '545'
rfm['RFM_Score'] = rfm['R_Score'].astype(str) + rfm['F_Score'].astype(str) + rfm['M_Score'].astype(str)
# Add customer segments
def segment_customers(row):
    """Assign a value tier to one customer from its R, F and M scores.

    A customer qualifies for a tier only when all three scores reach that
    tier's minimum; tiers are checked from highest to lowest.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing
            'R_Score', 'F_Score' and 'M_Score' entries that can be compared
            against integers with ``>=``.

    Returns:
        One of 'Top Customers', 'High Value', 'Medium Value' or 'Low Value'.
    """
    score_keys = ('R_Score', 'F_Score', 'M_Score')
    tiers = (
        (4, 'Top Customers'),
        (3, 'High Value'),
        (2, 'Medium Value'),
    )
    for minimum, label in tiers:
        if all(row[key] >= minimum for key in score_keys):
            return label
    return 'Low Value'

# Label every customer with a value tier derived from its RFM scores
rfm['Customer_Segment'] = rfm.apply(segment_customers, axis=1)

segment_counts = rfm['Customer_Segment'].value_counts()
print("Customer Segmentation Summary:")
print("-" * 50)
print(segment_counts)

# Two-panel figure: RFM scatter on the left, purchase-frequency histogram on
# the right
fig = plt.figure(figsize=(15, 6))

ax_segments = fig.add_subplot(1, 2, 1)
sns.scatterplot(
    data=rfm,
    x='Recency',
    y='Monetary',
    hue='Customer_Segment',
    size='Frequency',
    sizes=(50, 500),
    alpha=0.6,
    ax=ax_segments,
)
ax_segments.set_title('Customer Segments - RFM Analysis')
# Anchor the legend just outside the right edge of the scatter axes
ax_segments.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

ax_frequency = fig.add_subplot(1, 2, 2)
purchase_counts = customer_frequency[('Order ID', 'count')]
sns.histplot(data=purchase_counts, bins=30, ax=ax_frequency)
ax_frequency.set_title('Distribution of Customer Purchase Frequency')
ax_frequency.set_xlabel('Number of Orders')
fig.tight_layout()
plt.show()

# Customer segment characteristics: mean R, F and M value within each tier
rfm_measures = ['Recency', 'Frequency', 'Monetary']
segment_stats = rfm.groupby('Customer_Segment')[rfm_measures].mean().round(2)

print("\nSegment Characteristics:")
print("-" * 50)
print(segment_stats)

## 2. Sales Analysis

In [None]:
# Sales performance metrics.
# Attach each order's total sales to every row belonging to that order.
df['Order Value'] = df.groupby('Order ID')['Sales'].transform('sum')

# Per-order and per-customer revenue totals feed the averages below
revenue_per_order = df.groupby('Order ID')['Sales'].sum()
revenue_per_customer = df.groupby('Customer ID')['Sales'].sum()

# Calculate key metrics
sales_metrics = {
    'Total Revenue': df['Sales'].sum(),
    'Average Order Value': revenue_per_order.mean(),
    'Total Orders': df['Order ID'].nunique(),
    'Total Customers': df['Customer ID'].nunique(),
    'Average Sales per Customer': revenue_per_customer.mean(),
}

print("Key Sales Metrics:")
print("-" * 50)
for label, amount in sales_metrics.items():
    print(f"{label}: {amount:,.2f}")

# Visualize sales patterns: monthly trend (left) and the spread of individual
# sales per calendar month (right)
fig = plt.figure(figsize=(15, 6))

ax_trend = fig.add_subplot(1, 2, 1)
order_month = df['Order Date'].dt.to_period('M')
monthly_sales = df.groupby(order_month)['Sales'].sum()
monthly_sales.plot(kind='line', marker='o', ax=ax_trend)
ax_trend.set_title('Monthly Sales Trend')
ax_trend.set_xlabel('Month')
ax_trend.set_ylabel('Total Sales')
# ax_trend is still the current axes here, so this rotates its tick labels
plt.xticks(rotation=45)

ax_spread = fig.add_subplot(1, 2, 2)
calendar_month = df['Order Date'].dt.month
sns.boxplot(data=df, x=calendar_month, y='Sales', ax=ax_spread)
ax_spread.set_title('Sales Distribution by Month')
ax_spread.set_xlabel('Month')
ax_spread.set_ylabel('Sales')

fig.tight_layout()
plt.show()

# Additional sales analysis: mean sale amount and number of sales records
# per weekday of the order date.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sales_by_weekday = (
    df.groupby(df['Order Date'].dt.dayofweek)['Sales']
    .agg(['mean', 'count'])
    # dayofweek runs 0 (Monday) .. 6 (Sunday). Reindexing guarantees exactly
    # seven rows in weekday order, so the name assignment below cannot fail
    # with a length mismatch when some weekday has no orders at all.
    .reindex(range(7))
)
# A weekday without any orders has no records (its mean stays NaN)
sales_by_weekday['count'] = sales_by_weekday['count'].fillna(0).astype(int)
sales_by_weekday.index = weekday_names

# Give the weekday column an explicit name instead of relying on the
# implicit 'index' column that reset_index() creates for an unnamed index
weekday_plot_data = sales_by_weekday.rename_axis('Day of Week').reset_index()

plt.figure(figsize=(12, 6))
sns.barplot(data=weekday_plot_data, x='Day of Week', y='mean')
plt.title('Average Sales by Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Average Sales')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 3. Geographic Analysis

In [None]:
# City-level analysis: one row per (City, Region) pair with sales totals and
# distinct customer / order counts. Named aggregation yields the flat column
# names directly.
city_metrics = (
    df.groupby(['City', 'Region'])
    .agg(
        Total_Sales=('Sales', 'sum'),
        Avg_Sales=('Sales', 'mean'),
        Num_Customers=('Customer ID', 'nunique'),
        Num_Orders=('Order ID', 'nunique'),
    )
    .round(2)
    .reset_index()
)

# Calculate additional per-customer ratios from the rounded aggregates
customers_per_city = city_metrics['Num_Customers']
city_metrics = city_metrics.assign(
    Sales_per_Customer=city_metrics['Total_Sales'] / customers_per_city,
    Orders_per_Customer=city_metrics['Num_Orders'] / customers_per_city,
)

# Keep the ten (City, Region) rows with the highest total sales
top_10_cities = city_metrics.nlargest(10, 'Total_Sales')

# Visualize top cities (left) and total sales per region (right)
fig = plt.figure(figsize=(15, 6))

ax_cities = fig.add_subplot(1, 2, 1)
sns.barplot(data=top_10_cities, x='City', y='Total_Sales', ax=ax_cities)
ax_cities.set_title('Top 10 Cities by Total Sales')
# ax_cities is the current axes, so this rotates and right-aligns its labels
plt.xticks(rotation=45, ha='right')
ax_cities.set_xlabel('City')
ax_cities.set_ylabel('Total Sales')

ax_regions = fig.add_subplot(1, 2, 2)
region_sales = df.groupby('Region')['Sales'].sum().sort_values(ascending=False)
sns.barplot(x=region_sales.index, y=region_sales.values, ax=ax_regions)
ax_regions.set_title('Sales by Region')
plt.xticks(rotation=45)
ax_regions.set_xlabel('Region')
ax_regions.set_ylabel('Total Sales')

fig.tight_layout()
plt.show()

# Print top city statistics
report_columns = ['City', 'Region', 'Total_Sales', 'Num_Customers', 'Sales_per_Customer']
print("\nTop 10 Cities Performance Metrics:")
print("-" * 50)
print(top_10_cities[report_columns].round(2))

# Regional customer distribution: distinct customers per region, largest first
regional_customers = (
    df.groupby('Region')['Customer ID']
    .nunique()
    .sort_values(ascending=False)
)
fig, ax_customers = plt.subplots(figsize=(10, 6))
sns.barplot(x=regional_customers.index, y=regional_customers.values, ax=ax_customers)
ax_customers.set_title('Number of Customers by Region')
plt.xticks(rotation=45)
ax_customers.set_xlabel('Region')
ax_customers.set_ylabel('Number of Customers')
fig.tight_layout()
plt.show()