# 05 - Customer Segmentation

This notebook performs advanced customer segmentation using clustering techniques to complement the RFM analysis in the EDA notebook.

## Objectives
- Calculate RFM metrics (Recency, Frequency, Monetary)
- Perform customer segmentation using K-Means clustering
- Determine optimal number of clusters (Elbow method, Silhouette score)
- Compare RFM-based segmentation vs K-Means clustering
- Interpret and visualize customer segments
- Analyze segment characteristics and business implications

## Phase 2 Requirements
- ✅ RFM analysis (introduced in the EDA notebook; recomputed here as the clustering input)
- ✅ Advanced clustering (K-Means)
- ✅ Segment interpretation
- ✅ Business recommendations


In [None]:
# Load required libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 - registers the '3d' projection on matplotlib < 3.2
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Notebook-wide display settings.
# NOTE(review): this silences *every* warning (including pandas / scikit-learn
# deprecation notices). Consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

print("=" * 80)
print("CUSTOMER SEGMENTATION - ADVANCED CLUSTERING ANALYSIS")
print("=" * 80)

# Load data
# The CSV is resolved relative to the kernel's working directory: the project
# root is taken to be two levels above it.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
data_path = os.path.join(project_root, 'data', 'raw', 'Online Retail.csv')

# Keep the unfiltered frame under its own name so the raw data stays inspectable.
raw_df = pd.read_csv(data_path, encoding='latin-1')
# Unparseable dates become NaT and are dropped by the filter below.
raw_df['InvoiceDate'] = pd.to_datetime(raw_df['InvoiceDate'], errors='coerce')

# Row filter, evaluated once:
#   - drop invoices whose number starts with 'C'
#   - keep only positive quantities and unit prices
#   - require a description, a parsed invoice date and a customer id
valid_rows = (
    ~raw_df['InvoiceNo'].astype(str).str.startswith('C')
    & (raw_df['Quantity'] > 0)
    & (raw_df['UnitPrice'] > 0)
    & raw_df['Description'].notna()
    & raw_df['InvoiceDate'].notna()
    & raw_df['CustomerID'].notna()
)

# Explicit copy: adding a column to a filtered slice would otherwise trigger
# pandas' SettingWithCopyWarning (hidden by the blanket filter above).
df = raw_df.loc[valid_rows].copy()
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# Fail early with a clear message instead of producing NaT/empty results downstream.
assert not df.empty, "No valid transactions left after cleaning - check the input file"

print(f"\nDataset loaded: {df.shape[0]:,} transactions")
print(f"Unique customers: {df['CustomerID'].nunique():,}")
print(f"Date range: {df['InvoiceDate'].min()} to {df['InvoiceDate'].max()}")




## Step 1: Calculate RFM Metrics

Calculate Recency, Frequency, and Monetary values for each customer.


In [None]:
# Calculate RFM metrics
section_rule = "=" * 80
print(section_rule)
print("RFM METRICS CALCULATION")
print(section_rule)

# Reference date (last transaction date + 1 day)
reference_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)


def _days_since_last_purchase(invoice_dates):
    """Return the whole days between a customer's latest invoice and `reference_date`."""
    return (reference_date - invoice_dates.max()).days


# One row per customer:
#   Recency   - days since the latest invoice
#   Frequency - number of distinct invoices
#   Monetary  - total spend
rfm = (
    df.groupby('CustomerID')
    .agg(
        Recency=('InvoiceDate', _days_since_last_purchase),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('TotalPrice', 'sum'),
    )
    .reset_index()
)

# Log transform for better clustering (handle skewness)
for metric in ('Recency', 'Frequency', 'Monetary'):
    rfm[f'{metric}_log'] = np.log1p(rfm[metric])

print("\nRFM Metrics Summary:")
print(rfm[['Recency', 'Frequency', 'Monetary']].describe())

customer_total = len(rfm)
recency_avg = rfm['Recency'].mean()
frequency_avg = rfm['Frequency'].mean()
monetary_avg = rfm['Monetary'].mean()
monetary_mid = rfm['Monetary'].median()

print("\nRFM Statistics:")
print(f"  Total Customers: {customer_total:,}")
print(f"  Average Recency: {recency_avg:.1f} days")
print(f"  Average Frequency: {frequency_avg:.2f} transactions")
print(f"  Average Monetary Value: £{monetary_avg:,.2f}")
print(f"  Median Monetary Value: £{monetary_mid:,.2f}")

# Display sample
print("\nSample RFM Data:")
print(rfm.head(10))


## Step 2: Determine Optimal Number of Clusters

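Use the Elbow method and the Silhouette score to choose the number of clusters. The elbow plot is inspected visually; the k used for the final clustering is the one with the highest silhouette score.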


In [None]:
# Prepare data for clustering (use log-transformed features)
features = ['Recency_log', 'Frequency_log', 'Monetary_log']
X = rfm[features].to_numpy()

# Standardize features so each one contributes on the same scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

section_rule = "=" * 80
print(section_rule)
print("OPTIMAL NUMBER OF CLUSTERS")
print(section_rule)

# Fit one model per candidate k and record both quality metrics
k_range = range(2, 11)
inertias, silhouette_scores = [], []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

# The elbow is read off the plot; only the silhouette optimum is computed
optimal_k_elbow = None
best_position = int(np.argmax(silhouette_scores))
optimal_k_silhouette = k_range[best_position]
best_silhouette = silhouette_scores[best_position]

# Visualize both criteria side by side
fig, (elbow_ax, silhouette_ax) = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Determining Optimal Number of Clusters', fontsize=16, y=1.02)

# Elbow method
elbow_ax.plot(k_range, inertias, marker='o', linewidth=2, markersize=8, color='steelblue')
elbow_ax.set_ylabel('Inertia (Within-cluster SSE)', fontsize=12)
elbow_ax.set_title('Elbow Method', fontweight='bold')

# Silhouette score, with the best k marked
silhouette_ax.plot(k_range, silhouette_scores, marker='o', linewidth=2, markersize=8, color='coral')
silhouette_ax.axvline(x=optimal_k_silhouette, color='red', linestyle='--', linewidth=2,
                      label=f'Optimal k={optimal_k_silhouette}')
silhouette_ax.set_ylabel('Silhouette Score', fontsize=12)
silhouette_ax.set_title('Silhouette Score Method', fontweight='bold')
silhouette_ax.legend()

# Styling shared by both panels
for panel in (elbow_ax, silhouette_ax):
    panel.set_xlabel('Number of Clusters (k)', fontsize=12)
    panel.grid(True, alpha=0.3)
    panel.set_xticks(k_range)

plt.tight_layout()
plt.show()

print("\nOptimal Number of Clusters:")
print("  Elbow method: Visual inspection needed (see plot)")
print(f"  Silhouette method: k = {optimal_k_silhouette} (score: {best_silhouette:.4f})")

# Use silhouette-based optimal k
optimal_k = optimal_k_silhouette
print(f"\nSelected k = {optimal_k} for clustering")


## Step 3: Perform K-Means Clustering

Apply K-Means clustering with the optimal number of clusters.


In [None]:
# Perform K-Means clustering
section_rule = "=" * 80
print(section_rule)
print("K-MEANS CLUSTERING")
print(section_rule)

# Fit the final model and attach each customer's cluster id to the RFM table
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
rfm['Cluster'] = kmeans.fit_predict(X_scaled)

# Per-cluster averages and sizes
cluster_stats = (
    rfm.groupby('Cluster')
    .agg(
        Avg_Recency=('Recency', 'mean'),
        Avg_Frequency=('Frequency', 'mean'),
        Avg_Monetary=('Monetary', 'mean'),
        Count=('CustomerID', 'count'),
    )
    .round(2)
)
cluster_stats['Percentage'] = (cluster_stats['Count'] / len(rfm) * 100).round(2)

print("\nCluster Statistics:")
print(cluster_stats)

# Silhouette score for final clustering
final_silhouette = silhouette_score(X_scaled, rfm['Cluster'])
print(f"\nFinal Silhouette Score: {final_silhouette:.4f}")
print("  (Score > 0.5: Good separation, > 0.7: Strong structure)")

# Visualize clusters: three pairwise RFM scatter plots plus a size bar chart
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Customer Segmentation - K-Means Clustering', fontsize=16, y=0.995)

# (axes, x column, y column, x label, y label, title) for each scatter panel
scatter_panels = [
    (axes[0, 0], 'Recency', 'Frequency',
     'Recency (days)', 'Frequency (transactions)', 'Recency vs Frequency'),
    (axes[0, 1], 'Frequency', 'Monetary',
     'Frequency (transactions)', 'Monetary Value (£)', 'Frequency vs Monetary'),
    (axes[1, 0], 'Recency', 'Monetary',
     'Recency (days)', 'Monetary Value (£)', 'Recency vs Monetary'),
]

for panel, x_col, y_col, x_label, y_label, panel_title in scatter_panels:
    points = panel.scatter(rfm[x_col], rfm[y_col], c=rfm['Cluster'],
                           cmap='viridis', alpha=0.6, s=50, edgecolors='black', linewidth=0.5)
    panel.set_xlabel(x_label, fontsize=12)
    panel.set_ylabel(y_label, fontsize=12)
    panel.set_title(panel_title, fontweight='bold')
    panel.grid(True, alpha=0.3)
    plt.colorbar(points, ax=panel, label='Cluster')

# Cluster distribution
cluster_counts = rfm['Cluster'].value_counts().sort_index()
count_ax = axes[1, 1]
bar_slots = range(len(cluster_counts))

count_ax.bar(bar_slots, cluster_counts.values, alpha=0.7, color='steelblue', edgecolor='black')
count_ax.set_xlabel('Cluster', fontsize=12)
count_ax.set_ylabel('Number of Customers', fontsize=12)
count_ax.set_title('Cluster Distribution', fontweight='bold')
count_ax.set_xticks(bar_slots)
count_ax.set_xticklabels(cluster_counts.index)
count_ax.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for slot, customers in enumerate(cluster_counts):
    count_ax.text(slot, customers, f'{customers:,}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()


In [None]:
# Interpret clusters
heading_rule = "=" * 80
print(f"{heading_rule}\nCLUSTER INTERPRETATION\n{heading_rule}")

# Population medians of the raw RFM metrics, used as comparison cut-offs below
recency_median, frequency_median, monetary_median = (
    rfm[metric].median() for metric in ('Recency', 'Frequency', 'Monetary')
)

# Label customers with a rule-based RFM segment (applied row by row)
def label_cluster(row, recency_threshold=None, frequency_threshold=None, monetary_threshold=None):
    """Assign a rule-based RFM segment label to a single customer row.

    Despite the name, the label is derived from the customer's own Recency,
    Frequency and Monetary values, not from the K-Means cluster id.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'Recency', 'Frequency' and 'Monetary'
        (e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).
    recency_threshold, frequency_threshold, monetary_threshold : number, optional
        Cut-offs to compare against. When omitted, the notebook-level
        ``recency_median`` / ``frequency_median`` / ``monetary_median`` are used,
        so ``rfm.apply(label_cluster, axis=1)`` keeps working unchanged.

    Returns
    -------
    str
        One of 'Champions', 'Loyal Customers', 'At Risk', 'New Customers',
        'High Value' or 'Regular'.

    Notes
    -----
    All comparisons are strict, so a value exactly equal to a threshold matches
    none of the recency/frequency rules and falls through to the final
    'High Value' / 'Regular' split.
    """
    r_cut = recency_median if recency_threshold is None else recency_threshold
    f_cut = frequency_median if frequency_threshold is None else frequency_threshold
    m_cut = monetary_median if monetary_threshold is None else monetary_threshold

    recency = row['Recency']
    frequency = row['Frequency']
    monetary = row['Monetary']

    # Simple labeling logic (can be refined); rule order matters.
    if recency < r_cut and frequency > f_cut:
        # Recent and frequent: spend decides between the two top tiers
        return 'Champions' if monetary > m_cut else 'Loyal Customers'
    if recency > r_cut and frequency < f_cut:
        return 'At Risk'
    if recency < r_cut and frequency < f_cut:
        return 'New Customers'
    # Everything else (ties, or not recent but frequent) is split by spend only
    return 'High Value' if monetary > m_cut else 'Regular'

# Apply labeling (simplified - can be enhanced)
rfm['Segment_Label'] = rfm.apply(label_cluster, axis=1)

# Detailed cluster analysis: size, average RFM values and revenue share per cluster
print("\nDetailed Cluster Analysis:")
customer_total = len(rfm)
overall_revenue = rfm['Monetary'].sum()

for cluster_id, cluster_data in rfm.groupby('Cluster'):
    member_count = len(cluster_data)
    segment_revenue = cluster_data['Monetary'].sum()
    print(f"\n--- Cluster {cluster_id} ---")
    print(f"  Size: {member_count:,} customers ({member_count/customer_total*100:.1f}%)")
    print(f"  Avg Recency: {cluster_data['Recency'].mean():.1f} days")
    print(f"  Avg Frequency: {cluster_data['Frequency'].mean():.2f} transactions")
    print(f"  Avg Monetary: £{cluster_data['Monetary'].mean():,.2f}")
    print(f"  Total Revenue: £{segment_revenue:,.2f}")
    print(f"  Revenue Share: {segment_revenue/overall_revenue*100:.1f}%")

# Visualize cluster characteristics: one bar chart per RFM metric
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Cluster Characteristics Comparison', fontsize=16, y=1.02)

clusters = sorted(rfm['Cluster'].unique())
x_pos = np.arange(len(clusters))

# (metric column, bar colour, y-axis label, panel title)
bar_panels = [
    ('Recency', 'steelblue', 'Average Recency (days)', 'Average Recency by Cluster'),
    ('Frequency', 'coral', 'Average Frequency', 'Average Frequency by Cluster'),
    ('Monetary', 'teal', 'Average Monetary Value (£)', 'Average Monetary Value by Cluster'),
]

for panel, (metric, bar_colour, y_label, panel_title) in zip(axes, bar_panels):
    # Mean of the metric over the members of each cluster, in cluster-id order
    heights = [rfm.loc[rfm['Cluster'] == c, metric].mean() for c in clusters]
    panel.bar(x_pos, heights, alpha=0.7, color=bar_colour, edgecolor='black')
    panel.set_xlabel('Cluster', fontsize=12)
    panel.set_ylabel(y_label, fontsize=12)
    panel.set_title(panel_title, fontweight='bold')
    panel.set_xticks(x_pos)
    panel.set_xticklabels(clusters)
    panel.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()


In [None]:
# Business implications
# Prints revenue concentration, per-cluster recommendations and retention notes,
# then exports the per-customer cluster assignments to CSV.
print("=" * 80)
print("BUSINESS IMPLICATIONS & RECOMMENDATIONS")
print("=" * 80)

# Identify key clusters (ordered by total revenue, largest first)
total_revenue = rfm['Monetary'].sum()
cluster_revenue = rfm.groupby('Cluster')['Monetary'].sum().sort_values(ascending=False)

print("\n1. REVENUE CONCENTRATION:")
print("   Cluster revenue distribution:")
for cluster_id, revenue in cluster_revenue.items():
    pct = revenue / total_revenue * 100
    print(f"   Cluster {cluster_id}: £{revenue:,.2f} ({pct:.1f}%)")

top_cluster = cluster_revenue.index[0]
print(f"\n   → Top cluster (Cluster {top_cluster}) contributes {cluster_revenue.iloc[0]/total_revenue*100:.1f}% of revenue")
print(f"   → Focus stock optimization on this segment")

print("\n2. STOCK MANAGEMENT BY CLUSTER:")
for cluster_id in sorted(rfm['Cluster'].unique()):
    cluster_data = rfm[rfm['Cluster'] == cluster_id]
    avg_frequency = cluster_data['Frequency'].mean()
    avg_monetary = cluster_data['Monetary'].mean()

    print(f"\n   Cluster {cluster_id}:")
    print(f"   - Average purchase frequency: {avg_frequency:.2f}")
    # Monetary is each customer's total spend, so this is spend per customer,
    # not the value of an individual transaction.
    print(f"   - Average spend per customer: £{avg_monetary:,.2f}")

    # NOTE(review): cluster *means* are compared against population *medians*
    # (frequency_median / monetary_median). If the metrics are right-skewed this
    # biases clusters towards HIGH PRIORITY - confirm this is intended.
    if avg_frequency > frequency_median and avg_monetary > monetary_median:
        print(f"   - Recommendation: HIGH PRIORITY - Maintain optimal stock levels")
        print(f"   - Action: Ensure product availability, premium service")
    elif avg_frequency < frequency_median:
        print(f"   - Recommendation: RE-ENGAGEMENT - Win-back campaigns")
        print(f"   - Action: Targeted promotions, personalized offers")
    else:
        print(f"   - Recommendation: STANDARD - Regular stock levels")
        print(f"   - Action: Monitor trends, maintain service quality")

print("\n3. CUSTOMER RETENTION STRATEGY:")
low_recency_clusters = rfm.groupby('Cluster')['Recency'].mean().sort_values()
print("   Clusters by recency (most recent first):")
for cluster_id, avg_recency in low_recency_clusters.items():
    print(f"   Cluster {cluster_id}: {avg_recency:.1f} days since last purchase")
    # Flag clusters whose average recency is more than 1.5x the population median
    if avg_recency > recency_median * 1.5:
        print(f"      → AT RISK: Consider win-back campaigns")

print("\n4. SEGMENT-SPECIFIC STOCK PLANNING:")
print("   - High-frequency clusters: Ensure consistent stock availability")
print("   - High-monetary clusters: Stock premium/high-value products")
print("   - Low-frequency clusters: Focus on re-engagement, not stock expansion")
print("   - Recent purchasers: Maintain current stock levels")
print("   - Dormant customers: Targeted campaigns before stock reduction")

print("\n5. CROSS-SELLING OPPORTUNITIES:")
# A cluster counts as high-value when its *typical* (median) customer spends more
# than the overall median. Selecting every cluster that merely contains one
# above-median customer would return practically all clusters.
cluster_median_monetary = rfm.groupby('Cluster')['Monetary'].median()
high_value_clusters = cluster_median_monetary[cluster_median_monetary > monetary_median].index.tolist()
print(f"   - High-value clusters identified: {sorted(high_value_clusters)}")
print("   - Recommendation: Use association rules to cross-sell to these segments")
print("   - Action: Bundle products based on cluster preferences")

print("\n" + "=" * 80)
print("CUSTOMER SEGMENTATION ANALYSIS COMPLETE")
print("=" * 80)

# Export results
output_path = os.path.join(project_root, 'data', 'processed', 'customer_segments.csv')
# Create data/processed on a fresh checkout; to_csv does not create directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
rfm[['CustomerID', 'Recency', 'Frequency', 'Monetary', 'Cluster']].to_csv(output_path, index=False)
print(f"\nResults exported to: {output_path}")


## Step 6: Additional Cluster Visualizations

Enhanced visualizations for better understanding of customer segments.


In [None]:
# Additional comprehensive visualizations
# Figure 1: per-cluster RFM histograms, box plots, revenue share and cluster sizes.
# Figure 2: 3D scatter of Recency / Frequency / Monetary.
print("=" * 80)
print("ADDITIONAL CLUSTER VISUALIZATIONS")
print("=" * 80)

clusters = sorted(rfm['Cluster'].unique())

# One fixed colour per cluster, shared by the histograms, the pie chart and the
# 3D scatter. Cluster ids are mapped linearly onto viridis, which is the same
# mapping the K-Means scatter plots earlier in the notebook get from
# c=rfm['Cluster'], cmap='viridis'.
id_span = max(clusters[-1] - clusters[0], 1)  # avoid 0/0 if there is a single cluster
cluster_colors = {c: plt.cm.viridis((c - clusters[0]) / id_span) for c in clusters}

# 1. RFM distribution by cluster
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Customer Segmentation - Comprehensive Analysis', fontsize=16, y=0.995)

# (axes, metric column, x-axis label, panel title) for the three histogram panels
hist_panels = [
    (axes[0, 0], 'Recency', 'Recency (days)', 'Recency Distribution by Cluster'),
    (axes[0, 1], 'Frequency', 'Frequency (transactions)', 'Frequency Distribution by Cluster'),
    (axes[0, 2], 'Monetary', 'Monetary Value (£)', 'Monetary Distribution by Cluster'),
]
for hist_ax, metric, x_label, panel_title in hist_panels:
    for cluster_id in clusters:
        cluster_values = rfm.loc[rfm['Cluster'] == cluster_id, metric]
        hist_ax.hist(cluster_values, alpha=0.6, label=f'Cluster {cluster_id}', bins=20,
                     color=cluster_colors[cluster_id])
    hist_ax.set_xlabel(x_label, fontsize=12)
    hist_ax.set_ylabel('Frequency', fontsize=12)
    hist_ax.set_title(panel_title, fontweight='bold')
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3, axis='y')

# Box plots for RFM by cluster
rfm_melted = rfm.melt(id_vars=['Cluster'], value_vars=['Recency', 'Frequency', 'Monetary'],
                      var_name='Metric', value_name='Value')
sns.boxplot(data=rfm_melted, x='Cluster', y='Value', hue='Metric', ax=axes[1, 0])
axes[1, 0].set_title('RFM Metrics Distribution by Cluster (Boxplot)', fontweight='bold')
axes[1, 0].set_yscale('log')  # Log scale for better visualization
axes[1, 0].grid(True, alpha=0.3, axis='y')

# Revenue contribution by cluster (wedges ordered by revenue, coloured by cluster id)
cluster_revenue = rfm.groupby('Cluster')['Monetary'].sum().sort_values(ascending=False)
axes[1, 1].pie(cluster_revenue.values, labels=[f'Cluster {c}' for c in cluster_revenue.index],
               autopct='%1.1f%%', startangle=90,
               colors=[cluster_colors[c] for c in cluster_revenue.index])
axes[1, 1].set_title('Revenue Contribution by Cluster', fontweight='bold')

# Customer count by cluster
cluster_counts = rfm['Cluster'].value_counts().sort_index()
bars = axes[1, 2].bar(range(len(cluster_counts)), cluster_counts.values,
                      alpha=0.7, color='steelblue', edgecolor='black')
axes[1, 2].set_xlabel('Cluster', fontsize=12)
axes[1, 2].set_ylabel('Number of Customers', fontsize=12)
axes[1, 2].set_title('Customer Count by Cluster', fontweight='bold')
axes[1, 2].set_xticks(range(len(cluster_counts)))
axes[1, 2].set_xticklabels([f'Cluster {c}' for c in cluster_counts.index])
axes[1, 2].grid(True, alpha=0.3, axis='y')
# Value label centred above each bar
for bar, count in zip(bars, cluster_counts.values):
    axes[1, 2].text(bar.get_x() + bar.get_width()/2., count,
                   f'{count:,}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# 2. 3D scatter plot
# The '3d' projection comes from mpl_toolkits.mplot3d, imported with the other
# libraries in the notebook's first code cell.
fig = plt.figure(figsize=(14, 10))
ax = fig.add_subplot(111, projection='3d')

for cluster_id in clusters:
    cluster_data = rfm[rfm['Cluster'] == cluster_id]
    ax.scatter(cluster_data['Recency'], cluster_data['Frequency'], cluster_data['Monetary'],
              label=f'Cluster {cluster_id}', alpha=0.6, s=20, color=cluster_colors[cluster_id])

ax.set_xlabel('Recency (days)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_zlabel('Monetary Value (£)', fontsize=12)
ax.set_title('3D Visualization: RFM by Cluster', fontweight='bold', fontsize=14, pad=20)
ax.legend()
plt.tight_layout()
plt.show()

print("\n" + "=" * 80)
