# 05 - Customer Segmentation Analysis

## Purpose
Analyze customer purchasing behavior and group customers into meaningful segments using RFM (Recency, Frequency, Monetary) analysis.

## Sections
1. Customer Overview
2. RFM Analysis
3. Customer Segments
4. Customer Value Analysis
5. Retention Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Apply one shared look to every chart produced in this notebook
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Processed CSV exports, resolved relative to the current working directory
DATA_DIR = Path('../..', 'ml', 'data', 'processed')

# Read the three tables used throughout the analysis
customers = pd.read_csv(DATA_DIR.joinpath('customers.csv'))
orders = pd.read_csv(DATA_DIR.joinpath('orders.csv'))
order_items = pd.read_csv(DATA_DIR.joinpath('order_items.csv'))

# Convert OrderDate to datetimes so date arithmetic and .dt accessors work later
orders['OrderDate'] = pd.to_datetime(orders['OrderDate'])

# Join order columns onto every matching item row (merge defaults to an inner join)
full_orders = pd.merge(orders, order_items, on='OrderID')

# Quick sanity check on table sizes
print(f"Customers: {len(customers):,}")
print(f"Orders: {len(orders):,}")

## 1. Customer Overview

In [None]:
# Overview figure: customer-type mix (pie) next to the ten largest states (bars)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
type_ax, state_ax = axes

# Left panel: share of customers per Customer_Type
customer_types = customers['Customer_Type'].value_counts()
type_ax.pie(customer_types, labels=customer_types.index, autopct='%1.1f%%', startangle=90)
type_ax.set_title('Customer Type Distribution')

# Right panel: top 10 states by customer count
state_counts = customers['State'].value_counts().head(10)
# value_counts sorts descending; reverse so the largest state is drawn at the top
states_bottom_up = state_counts.iloc[::-1]
state_ax.barh(states_bottom_up.index, states_bottom_up.values)
state_ax.set_title('Top 10 States by Customer Count')
state_ax.set_xlabel('Number of Customers')

plt.tight_layout()
plt.show()

In [None]:
# Per-customer purchase summary built from the order/item join.
# NOTE(review): sums run over full_orders rows; if TotalAmount or Profit is an
# order-level column it would be counted once per item row - confirm the schema.
customer_summary = (
    full_orders.groupby('CustomerID')
    .agg(
        OrderCount=('OrderID', 'nunique'),
        TotalSpent=('TotalAmount', 'sum'),
        TotalProfit=('Profit', 'sum'),
        TotalItems=('Quantity', 'sum'),
        FirstOrder=('OrderDate', 'min'),
        LastOrder=('OrderDate', 'max'),
    )
    .reset_index()
)

order_counts = customer_summary['OrderCount']
total_spent = customer_summary['TotalSpent']

print("\n=== Customer Purchase Statistics ===")
print(f"Customers with purchases: {len(customer_summary):,}")
print(f"Average orders per customer: {order_counts.mean():.2f}")
print(f"Average spend per customer: ${total_spent.mean():,.2f}")
print(f"Median spend per customer: ${total_spent.median():,.2f}")

# Average order value = total spend divided by number of distinct orders
customer_summary['AvgOrderValue'] = customer_summary['TotalSpent'] / customer_summary['OrderCount']

# One histogram per metric, laid out row by row on a 2x2 grid
histogram_panels = [
    ('OrderCount', 'Orders per Customer Distribution', 'Number of Orders'),
    ('TotalSpent', 'Total Spend per Customer Distribution', 'Total Spent ($)'),
    ('TotalItems', 'Items Purchased per Customer', 'Number of Items'),
    ('AvgOrderValue', 'Average Order Value Distribution', 'Average Order Value ($)'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for panel_ax, (metric, panel_title, panel_xlabel) in zip(axes.flat, histogram_panels):
    panel_ax.hist(customer_summary[metric], bins=50, edgecolor='black', alpha=0.7)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)

plt.tight_layout()
plt.show()

## 2. RFM Analysis

In [None]:
# RFM metrics per customer.
# Recency is measured against the day after the latest order in the data,
# so the most recent customer has a recency of 1 day rather than 0.
reference_date = orders['OrderDate'].max() + pd.Timedelta(days=1)

rfm = (
    full_orders.groupby('CustomerID')
    .agg(
        # Recency: whole days between the reference date and the last order
        Recency=('OrderDate', lambda dates: (reference_date - dates.max()).days),
        # Frequency: number of distinct orders
        Frequency=('OrderID', 'nunique'),
        # Monetary: summed TotalAmount
        Monetary=('TotalAmount', 'sum'),
    )
    .reset_index()
)

print("\n=== RFM Statistics ===")
print(rfm.describe())

# One histogram per RFM metric: (column, bins, title, x-axis label)
rfm_panels = [
    ('Recency', 50, 'Recency Distribution (days since last order)', 'Days'),
    ('Frequency', 30, 'Frequency Distribution (order count)', 'Orders'),
    ('Monetary', 50, 'Monetary Distribution (total spend)', 'Total Spent ($)'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for panel_ax, (metric, bin_count, panel_title, panel_xlabel) in zip(axes, rfm_panels):
    panel_ax.hist(rfm[metric], bins=bin_count, edgecolor='black', alpha=0.7)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)

plt.tight_layout()
plt.show()

In [None]:
# RFM Scoring (1-5 scale)
# Each metric is split into quintiles. Frequency and Monetary are ranked with
# method='first' before binning so tied values cannot yield duplicate bin edges.
recency_labels = [5, 4, 3, 2, 1]  # Lower recency = higher score
try:
    rfm['R_Score'] = pd.qcut(rfm['Recency'], q=5, labels=recency_labels)
except ValueError:
    # qcut raises when heavily tied Recency values make quantile edges coincide;
    # fall back to the same rank-based tie-breaking already used for F and M.
    rfm['R_Score'] = pd.qcut(rfm['Recency'].rank(method='first'), q=5, labels=recency_labels)
rfm['F_Score'] = pd.qcut(rfm['Frequency'].rank(method='first'), q=5, labels=[1, 2, 3, 4, 5])
rfm['M_Score'] = pd.qcut(rfm['Monetary'].rank(method='first'), q=5, labels=[1, 2, 3, 4, 5])

# qcut returns categoricals; convert to int so the scores can be added and compared
rfm['R_Score'] = rfm['R_Score'].astype(int)
rfm['F_Score'] = rfm['F_Score'].astype(int)
rfm['M_Score'] = rfm['M_Score'].astype(int)

# Combined RFM Score: a three-character code (e.g. '545') and a numeric total (3-15)
rfm['RFM_Score'] = rfm['R_Score'].astype(str) + rfm['F_Score'].astype(str) + rfm['M_Score'].astype(str)
rfm['RFM_Total'] = rfm['R_Score'] + rfm['F_Score'] + rfm['M_Score']

print("\n=== RFM Score Distribution ===")
print(rfm[['R_Score', 'F_Score', 'M_Score', 'RFM_Total']].describe())

## 3. Customer Segments

In [None]:
# Define customer segments based on RFM scores
def segment_customer(row):
    """Map one customer's R/F/M scores to a named segment.

    Args:
        row: Record indexable by 'R_Score', 'F_Score' and 'M_Score'
            (for example a DataFrame row or a dict).

    Returns:
        The label of the first matching rule, checked top to bottom, or
        'Need Attention' when no rule matches.
    """
    recency = row['R_Score']
    frequency = row['F_Score']
    monetary = row['M_Score']

    # Rules are order-sensitive: an earlier match shadows every later rule.
    if recency >= 4 and frequency >= 4 and monetary >= 4:
        return 'Champions'
    if recency >= 4 and frequency >= 3:
        return 'Loyal Customers'
    if recency >= 3 and frequency >= 1 and monetary >= 4:
        return 'Potential Loyalists'
    if recency >= 4 and frequency <= 2:
        return 'Recent Customers'
    if recency >= 3 and frequency >= 3 and monetary >= 3:
        return 'Promising'
    if recency <= 2 and frequency >= 4 and monetary >= 4:
        return 'At Risk'
    if recency <= 2 and frequency >= 3:
        return 'Cant Lose Them'
    if recency <= 2 and frequency <= 2:
        return 'Hibernating'
    return 'Need Attention'

# Label every customer with the segment chosen by segment_customer
rfm['Segment'] = rfm.apply(segment_customer, axis=1)

# Per-segment size and average/total RFM figures, largest revenue first
segment_summary = (
    rfm.groupby('Segment')
    .agg(
        Count=('CustomerID', 'count'),
        Avg_Recency=('Recency', 'mean'),
        Avg_Frequency=('Frequency', 'mean'),
        Avg_Monetary=('Monetary', 'mean'),
        Total_Monetary=('Monetary', 'sum'),
    )
    .round(2)
    .sort_values('Total_Monetary', ascending=False)
)

print("\n=== Customer Segments ===")
display(segment_summary)

In [None]:
# Two views of the segments: share of customers (pie) and revenue (bars)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
share_ax, revenue_ax = axes

# Left panel: how many customers fall into each segment
segment_counts = rfm['Segment'].value_counts()
share_ax.pie(segment_counts, labels=segment_counts.index, autopct='%1.1f%%', startangle=90)
share_ax.set_title('Customer Segment Distribution')

# Right panel: summed Monetary per segment, sorted ascending and shown in thousands
revenue_by_segment = rfm.groupby('Segment')['Monetary'].sum()
segment_revenue = revenue_by_segment.sort_values()
revenue_ax.barh(segment_revenue.index, segment_revenue.values / 1000)
revenue_ax.set_title('Revenue by Customer Segment')
revenue_ax.set_xlabel('Revenue ($K)')

plt.tight_layout()
plt.show()

In [None]:
# Mean Monetary value for every (R_Score, F_Score) pair:
# rows are recency scores, columns are frequency scores
mean_monetary = rfm.groupby(['R_Score', 'F_Score'])['Monetary'].mean()
rfm_heatmap = mean_monetary.unstack()

plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(rfm_heatmap, annot=True, fmt='.0f', cmap='YlGnBu')
heatmap_ax.set_title('Average Monetary Value by Recency and Frequency Scores')
heatmap_ax.set_xlabel('Frequency Score')
heatmap_ax.set_ylabel('Recency Score')
plt.tight_layout()
plt.show()

## 4. Customer Value Analysis

In [None]:
# Customer Lifetime Value proxy (CLV)
customer_lifetime = customer_summary.copy()
# Whole days between each customer's first and last order
customer_lifetime['CustomerLifetime'] = (customer_lifetime['LastOrder'] - customer_lifetime['FirstOrder']).dt.days
# Total spend scaled by orders per year; a 0-day lifetime is treated as 1 day
# to avoid division by zero.
# NOTE(review): CLV_Score is not read again in this section.
customer_lifetime['CLV_Score'] = (
    customer_lifetime['TotalSpent'] *
    (customer_lifetime['OrderCount'] / (customer_lifetime['CustomerLifetime'].replace(0, 1) / 365))
)

# Top 20 most valuable customers
top_customers = customer_lifetime.nlargest(20, 'TotalSpent')

# nlargest returns fewer than 20 rows when fewer customers exist, so size the
# bar positions from the actual result instead of assuming 20.
bar_positions = range(len(top_customers))

plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_customers['TotalSpent'] / 1000)
plt.yticks(bar_positions, top_customers['CustomerID'])
plt.xlabel('Total Spent ($K)')
plt.title('Top 20 Customers by Total Spend')
plt.gca().invert_yaxis()  # largest spender at the top
plt.tight_layout()
plt.show()

# Pareto analysis: cumulative share of revenue against cumulative share of
# customers, with customers ordered from highest to lowest spend
customer_lifetime_sorted = customer_lifetime.sort_values('TotalSpent', ascending=False)
customer_lifetime_sorted['CumulativeRevenue'] = customer_lifetime_sorted['TotalSpent'].cumsum()
customer_lifetime_sorted['CumulativePct'] = customer_lifetime_sorted['CumulativeRevenue'] / customer_lifetime_sorted['TotalSpent'].sum() * 100
customer_lifetime_sorted['CustomerPct'] = (np.arange(1, len(customer_lifetime_sorted) + 1) / len(customer_lifetime_sorted)) * 100

plt.figure(figsize=(10, 6))
plt.plot(customer_lifetime_sorted['CustomerPct'], customer_lifetime_sorted['CumulativePct'])
plt.axhline(y=80, color='r', linestyle='--', label='80% Revenue')
plt.axvline(x=20, color='g', linestyle='--', label='20% Customers')
plt.xlabel('% of Customers')
plt.ylabel('% of Cumulative Revenue')
plt.title('Customer Revenue Pareto Chart')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Find what % of customers drive 80% revenue: the first row (in descending
# spend order) whose cumulative share reaches 80
pct_for_80 = customer_lifetime_sorted[customer_lifetime_sorted['CumulativePct'] >= 80]['CustomerPct'].iloc[0]
print(f"\n{pct_for_80:.1f}% of customers generate 80% of revenue")

## 5. Retention Analysis

In [None]:
# Cohort analysis: group customers by the month of their first order and track
# how many of them order again in each following month
orders_cohort = orders.copy()
orders_cohort['OrderMonth'] = orders_cohort['OrderDate'].dt.to_period('M')

# Get first purchase month for each customer
first_purchase = orders_cohort.groupby('CustomerID')['OrderMonth'].min().reset_index()
first_purchase.columns = ['CustomerID', 'CohortMonth']

orders_cohort = orders_cohort.merge(first_purchase, on='CustomerID')

# Calculate cohort index (months since first purchase).
# Subtracting two monthly Periods yields an offset whose .n is the month count.
orders_cohort['CohortIndex'] = (orders_cohort['OrderMonth'] - orders_cohort['CohortMonth']).apply(lambda x: x.n)

# Cohort matrix: distinct active customers per (cohort month, months since first order)
cohort_data = orders_cohort.groupby(['CohortMonth', 'CohortIndex'])['CustomerID'].nunique().reset_index()
cohort_matrix = cohort_data.pivot(index='CohortMonth', columns='CohortIndex', values='CustomerID')

# Retention rates: each row divided by its month-0 size, as a percentage
cohort_sizes = cohort_matrix.iloc[:, 0]
retention_matrix = cohort_matrix.divide(cohort_sizes, axis=0) * 100

# Plot retention heatmap (first 12 months).
# Select and label columns by their actual CohortIndex value rather than by
# position, so a missing month index cannot pull later months into the window
# or shift the tick labels.
retention_first_year = retention_matrix.loc[:, retention_matrix.columns < 12]

plt.figure(figsize=(14, 10))
sns.heatmap(retention_first_year, annot=True, fmt='.0f', cmap='YlGnBu',
            xticklabels=[f'Month {i}' for i in retention_first_year.columns],
            yticklabels=[str(m) for m in retention_first_year.index])
plt.title('Customer Retention by Cohort (%)')
plt.xlabel('Months Since First Purchase')
plt.ylabel('Cohort Month')
plt.tight_layout()
plt.show()

In [None]:
# Final text report: headline metrics, per-segment breakdown, key segment sizes
banner = "=" * 60
print("\n" + banner)
print("CUSTOMER SEGMENTATION SUMMARY")
print(banner)

print("\n=== Overall Customer Metrics ===")
print(f"Total Customers: {len(customers):,}")
print(f"Active Customers (with orders): {len(customer_summary):,}")
print(f"Average Orders per Customer: {customer_summary['OrderCount'].mean():.2f}")
print(f"Average Spend per Customer: ${customer_summary['TotalSpent'].mean():,.2f}")

print("\n=== Customer Segments ===")
scored_customers = len(rfm)
segment_rows = zip(
    segment_summary.index,
    segment_summary['Count'],
    segment_summary['Total_Monetary'],
)
for segment, count, revenue in segment_rows:
    # Segment size as a share of all scored customers
    pct = count / scored_customers * 100
    print(f"  {segment}: {count:,} customers ({pct:.1f}%) - ${revenue:,.2f} revenue")

print("\n=== Key Insights ===")
# Segments that do not occur are reported as 0
segment_sizes = rfm['Segment'].value_counts()
print(f"Champions: {segment_sizes.get('Champions', 0):,} customers (highest value)")
print(f"At Risk: {segment_sizes.get('At Risk', 0):,} customers (need re-engagement)")
print(f"Hibernating: {segment_sizes.get('Hibernating', 0):,} customers (lost)")