In [1]:
# ============================================================================
# 1. IMPORT LIBRARIES AND LOAD DATA
# ============================================================================

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA

# Market Basket Analysis
try:
    from mlxtend.frequent_patterns import apriori, association_rules
    from mlxtend.preprocessing import TransactionEncoder
    MBA_AVAILABLE = True
except ImportError:
    MBA_AVAILABLE = False
    print("mlxtend not installed. Run: pip install mlxtend")

from datetime import datetime, timedelta
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# kernel session, so deprecation or convergence warnings raised by later
# cells will not be shown. Consider narrowing the filter to specific
# categories or modules.
warnings.filterwarnings("ignore")

# Create the project-standard output folders when they are missing
for output_dir in ("../figures", "../reports", "../data/processed"):
    os.makedirs(output_dir, exist_ok=True)

# Read the cleaned dataset; the path is relative to the /notebooks folder
df = pd.read_csv("../data/processed/cleaned_retail_sales.csv")

# Parse both date columns so later cells can use datetime arithmetic on them
for date_column in ("Order_Date", "Ship_Date"):
    df[date_column] = pd.to_datetime(df[date_column])

banner_rule = "=" * 80
print(banner_rule)
print("CUSTOMER SEGMENTATION AND ADVANCED ANALYTICS")
print(banner_rule)
print(f"\nDataset Shape: {df.shape}")



CUSTOMER SEGMENTATION AND ADVANCED ANALYTICS

Dataset Shape: (10000, 45)


In [3]:
# ============================================================================
# 2. RFM FEATURE ENGINEERING (CUSTOMER-LEVEL)
# ============================================================================

from datetime import timedelta
import os

def _quantile_score(values, labels, n_bins=5):
    """Cut ``values`` into ``n_bins`` quantile bins and attach ``labels``.

    Parameters
    ----------
    values : pandas.Series
        Numeric values to score.
    labels : list
        One label per bin, ordered from the lowest bin to the highest.
    n_bins : int, default 5
        Number of quantile bins requested.

    Returns
    -------
    pandas.Series
        Categorical score for every entry of ``values``.

    Notes
    -----
    ``pd.qcut`` raises ``ValueError`` when tied values make two quantile
    edges equal. Passing ``duplicates="drop"`` does not help when a fixed
    label list is supplied, because the label count then no longer matches
    the reduced bin count. In that case the values are ranked with
    ``method="first"`` (the same tie-breaking already used for F_Score) so
    that the edges are unique. When the edges are already unique the plain
    quantile cut is returned unchanged.
    """
    try:
        return pd.qcut(values, n_bins, labels=labels)
    except ValueError:
        return pd.qcut(values.rank(method="first"), n_bins, labels=labels)


print("\n" + "=" * 80)
print("CUSTOMER SEGMENTATION AND ADVANCED ANALYTICS")
print("=" * 80)
print(f"\nDataset Shape: {df.shape}")

# Ensure processed data directory exists
os.makedirs("../data/processed", exist_ok=True)

# Reference date: the day after the latest order, so Recency is at least 1
analysis_date = df["Order_Date"].max() + timedelta(days=1)

# One row per customer. Named aggregation fixes the output column names
# directly instead of renaming them by position afterwards.
# NOTE(review): Frequency counts rows with a non-null Order_ID; if one order
# can span several rows, "nunique" would count distinct orders - confirm.
rfm = (
    df.groupby("Customer_ID")
    .agg(
        Recency=("Order_Date", lambda dates: (analysis_date - dates.max()).days),
        Frequency=("Order_ID", "count"),
        Monetary=("Sales", "sum"),
    )
    .reset_index()
)

# RFM scoring on a 1-5 scale. Labels are reversed for Recency so that the
# most recent buyers (smallest Recency) receive the highest score.
rfm["R_Score"] = _quantile_score(rfm["Recency"], labels=[5, 4, 3, 2, 1])

# Frequency is always ranked first: order counts are heavily tied
rfm["F_Score"] = _quantile_score(
    rfm["Frequency"].rank(method="first"), labels=[1, 2, 3, 4, 5]
)

rfm["M_Score"] = _quantile_score(rfm["Monetary"], labels=[1, 2, 3, 4, 5])

# Composite RFM score: plain average of the three 1-5 scores
rfm["RFM_Score_Numeric"] = (
    rfm["R_Score"].astype(int)
    + rfm["F_Score"].astype(int)
    + rfm["M_Score"].astype(int)
) / 3

print("RFM table created successfully")
print("RFM Shape:", rfm.shape)



CUSTOMER SEGMENTATION AND ADVANCED ANALYTICS

Dataset Shape: (10000, 45)
RFM table created successfully
RFM Shape: (1986, 8)


In [4]:
# ============================================================================
# 3. CUSTOMER SEGMENT ASSIGNMENT (RFM-BASED)
# ============================================================================

# Section banner: blank line, rule, title, rule
section_rule = "=" * 80
print("\n" + section_rule, "RFM-BASED CUSTOMER SEGMENTATION", section_rule, sep="\n")

def segment_customer(row):
    """Map one customer's R/F/M scores to a named segment.

    ``row`` must support ``row["R_Score"]``, ``row["F_Score"]`` and
    ``row["M_Score"]``; each value is converted with ``int``. Rules are
    checked in order and the first match wins:

    1. "Champions"     - all three scores are at least 4
    2. "Loyal"         - recency and frequency scores are at least 3
    3. "New Customers" - recency score at least 4, frequency score at most 2
    4. "At Risk"       - recency score at most 2, frequency score at least 3
    5. "Others"        - everything else
    """
    recency = int(row["R_Score"])
    frequency = int(row["F_Score"])
    monetary = int(row["M_Score"])

    if min(recency, frequency, monetary) >= 4:
        return "Champions"
    if recency >= 3 and frequency >= 3:
        return "Loyal"
    if frequency <= 2 and recency >= 4:
        return "New Customers"
    if frequency >= 3 and recency <= 2:
        return "At Risk"
    return "Others"

# Label each customer row with segment_customer, then report segment sizes
rfm["Customer_Segment"] = rfm.apply(
    lambda customer: segment_customer(customer), axis="columns"
)

segment_counts = rfm["Customer_Segment"].value_counts()
print("\nCustomer Segment Distribution:")
print(segment_counts)




RFM-BASED CUSTOMER SEGMENTATION

Customer Segment Distribution:
Customer_Segment
Others           576
Loyal            494
Champions        353
At Risk          344
New Customers    219
Name: count, dtype: int64


In [5]:
# Write the current RFM table to disk without the index column
rfm_export_path = "../data/processed/rfm_analysis.csv"
rfm.to_csv(rfm_export_path, index=False)
print("Saved rfm_analysis.csv")


Saved rfm_analysis.csv


In [6]:
# ============================================================================
# 4. K-MEANS CLUSTERING – OPTIMAL CLUSTER SELECTION
# ============================================================================

section_rule = "=" * 80
print("\n" + section_rule)
print("K-MEANS CLUSTERING: SILHOUETTE ANALYSIS")
print(section_rule)

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Cluster on the three raw RFM measures
X = rfm[["Recency", "Frequency", "Monetary"]]

# Standardise the features so no single measure dominates the distances
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Fit one model per candidate k and record its silhouette score
k_values = [candidate for candidate in range(2, 8)]
scores = []

for k in k_values:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    scores += [score]
    print(f"k = {k}, silhouette score = {score:.3f}")

# Keep the k with the highest silhouette score (first one wins on ties)
best_position = max(range(len(scores)), key=scores.__getitem__)
optimal_k = k_values[best_position]
print(f"\nOptimal number of clusters selected: {optimal_k}")




K-MEANS CLUSTERING: SILHOUETTE ANALYSIS
k = 2, silhouette score = 0.381
k = 3, silhouette score = 0.372
k = 4, silhouette score = 0.325
k = 5, silhouette score = 0.335
k = 6, silhouette score = 0.312
k = 7, silhouette score = 0.294

Optimal number of clusters selected: 2


In [7]:
# ============================================================================
# 5. FINAL K-MEANS CLUSTER ASSIGNMENT
# ============================================================================

section_rule = "=" * 80
print("\n" + section_rule)
print("FINAL CUSTOMER CLUSTER ASSIGNMENT")
print(section_rule)

# Refit K-Means at the selected k and keep each customer's cluster label
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10).fit(X_scaled)
rfm["Cluster"] = kmeans.labels_

cluster_sizes = rfm["Cluster"].value_counts()
print("\nCluster Distribution:")
print(cluster_sizes)




FINAL CUSTOMER CLUSTER ASSIGNMENT

Cluster Distribution:
Cluster
1    1062
0     924
Name: count, dtype: int64


In [8]:
# ============================================================================
# 6. DIMENSIONALITY REDUCTION USING PCA
# ============================================================================

section_rule = "=" * 80
print("\n" + section_rule)
print("PCA FOR CLUSTER VISUALIZATION")
print(section_rule)

from sklearn.decomposition import PCA

# Project the scaled RFM features onto two principal components for plotting
pca = PCA(n_components=2)
components = pca.fit_transform(X_scaled)

# Store each component as its own column next to the customer record
for position, column_name in enumerate(("PCA1", "PCA2")):
    rfm[column_name] = components[:, position]

print("PCA variance explained by components:")
print(pca.explained_variance_ratio_)




PCA FOR CLUSTER VISUALIZATION
PCA variance explained by components:
[0.70721995 0.24515155]


In [9]:
# Write the current rfm table to disk without the index column
segments_export_path = "../data/processed/customer_segments.csv"
rfm.to_csv(segments_export_path, index=False)
print("Saved customer_segments.csv")


Saved customer_segments.csv


In [10]:

# ============================================================================
# 7. MARKET BASKET ANALYSIS (ASSOCIATION RULE MINING)
# ============================================================================

import os
os.makedirs("../reports", exist_ok=True)

section_rule = "=" * 80
print("\n" + section_rule)
print("MARKET BASKET ANALYSIS")
print(section_rule)

if not MBA_AVAILABLE:
    print("Market Basket Analysis skipped (mlxtend not installed)")
else:
    from mlxtend.frequent_patterns import apriori, association_rules
    from mlxtend.preprocessing import TransactionEncoder

    # One basket per order: the list of Product_ID values recorded for it
    transactions = df.groupby("Order_ID")["Product_ID"].apply(list).tolist()

    # One-hot encode the baskets (one row per order, one column per product)
    te = TransactionEncoder()
    encoded = te.fit(transactions).transform(transactions)
    df_encoded = pd.DataFrame(encoded, columns=te.columns_)

    # Itemsets that appear in at least 1% of the baskets
    frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)

    # `rules` is only defined when at least one frequent itemset was found
    if not frequent_itemsets.empty:
        rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

        rules_path = "../reports/market_basket_rules.csv"
        rules.to_csv(rules_path, index=False)
        print(f"Market basket rules saved ({len(rules)} rules)")
        print(f"Saved to: {rules_path}")
    else:
        print("No frequent itemsets found at 1% support. MBA skipped.")




MARKET BASKET ANALYSIS
No frequent itemsets found at 1% support. MBA skipped.


In [11]:
# ============================================================================
# 8. COHORT ANALYSIS – CUSTOMER RETENTION
# ============================================================================

print("\n" + "=" * 80)
print("COHORT ANALYSIS: CUSTOMER RETENTION")
print("=" * 80)

# Work on a copy so the shared `df` does not gain the helper columns
df_cohort = df.copy()
df_cohort["Order_Month"] = df_cohort["Order_Date"].dt.to_period("M")

# A customer's cohort is the month of their first order
df_cohort["Cohort"] = (
    df_cohort
    .groupby("Customer_ID")["Order_Date"]
    .transform("min")
    .dt.to_period("M")
)

# Whole months between the cohort month and the order month, computed with
# vectorised period accessors instead of a per-row Python lambda
df_cohort["Cohort_Index"] = (
    (df_cohort["Order_Month"].dt.year - df_cohort["Cohort"].dt.year) * 12
    + (df_cohort["Order_Month"].dt.month - df_cohort["Cohort"].dt.month)
)

# Retention matrix: distinct active customers per (cohort, months since start).
# Missing combinations are filled with 0 so the counts stay integer.
retention = (
    df_cohort
    .groupby(["Cohort", "Cohort_Index"])["Customer_ID"]
    .nunique()
    .unstack(fill_value=0)
)

# A cohort can only be observed up to the last month present in the data.
# Cells beyond that point were zero-filled above but are really unknown, so
# they are masked to NaN in the rate table rather than reported as 0%.
# Every non-zero count is observable by construction, so nothing is lost.
last_observed_month = df_cohort["Order_Month"].max()
cohort_months = pd.PeriodIndex(retention.index, freq="M")
months_observable = (
    (last_observed_month.year - cohort_months.year) * 12
    + (last_observed_month.month - cohort_months.month)
)
is_observable = (
    retention.columns.to_numpy()[None, :]
    <= months_observable.to_numpy()[:, None]
)

# Retention rate: share of the cohort's month-0 customers active in each month
retention_rate = (
    retention
    .div(retention.iloc[:, 0], axis=0)
    .where(is_observable)
)

print("Cohort Retention Rate (sample):")
print(retention_rate.head())




COHORT ANALYSIS: CUSTOMER RETENTION
Cohort Retention Rate (sample):
Cohort_Index   0         1         2         3         4         5         6   \
Cohort                                                                          
2022-01       1.0  0.268212  0.316225  0.291391  0.311258  0.326159  0.304636   
2022-02       1.0  0.308861  0.288608  0.283544  0.354430  0.296203  0.303797   
2022-03       1.0  0.316151  0.323024  0.309278  0.326460  0.285223  0.323024   
2022-04       1.0  0.323529  0.240196  0.299020  0.313725  0.274510  0.299020   
2022-05       1.0  0.340741  0.288889  0.325926  0.259259  0.340741  0.266667   

Cohort_Index        7         8         9         10        11        12  \
Cohort                                                                     
2022-01       0.312914  0.263245  0.319536  0.296358  0.279801  0.269868   
2022-02       0.346835  0.316456  0.336709  0.308861  0.326582  0.220253   
2022-03       0.250859  0.240550  0.312715  0.353952  0.213

## Customer Lifetime Value (CLV) Estimation

This section estimates **Customer Lifetime Value (CLV)** using historical transaction data.
CLV helps identify high-value customers and supports strategic decisions such as
customer retention, personalized marketing, and resource allocation.

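**Approach used:**
- Aggregate customer-level revenue and order counts
- Estimate customer lifespan as the time between first and last purchase, in years (floored at 0.1 years so that single-purchase customers do not cause a division by zero)
- Compute CLV using a simplified historical-value formula: average order value × orders per year × a 3-year horizon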

**Why this matters:**
- Enables prioritization of high-value customers
- Complements RFM segmentation and clustering results
- Provides a quantitative foundation for predictive modeling and business strategy


In [12]:
# ============================================================================
# 9. CUSTOMER LIFETIME VALUE (CLV) ESTIMATION
# ============================================================================

section_rule = "=" * 80
print("\n" + section_rule)
print("CUSTOMER LIFETIME VALUE (CLV) ANALYSIS")
print(section_rule)

import os
os.makedirs("../data/processed", exist_ok=True)

# One row per customer: total revenue, order-row count, first and last order.
# NOTE(review): Orders counts rows with a non-null Order_ID; if one order can
# span several rows this is a line-item count - confirm.
customer_metrics = (
    df.groupby("Customer_ID")
    .agg(
        Revenue=("Sales", "sum"),
        Orders=("Order_ID", "count"),
        First=("Order_Date", "min"),
        Last=("Order_Date", "max"),
    )
    .reset_index()
)

# Lifespan in years between first and last order, floored at 0.1 to avoid
# dividing by zero for customers whose orders all fall on one day.
# NOTE(review): with this floor a one-day customer's CLV is 30x their revenue
# (revenue / 0.1 * 3) - confirm this is the intended treatment.
active_days = (customer_metrics["Last"] - customer_metrics["First"]).dt.days
customer_metrics["Lifespan_Years"] = (active_days / 365).clip(lower=0.1)

# CLV = average order value x orders per year x 3
average_order_value = customer_metrics["Revenue"] / customer_metrics["Orders"]
orders_per_year = customer_metrics["Orders"] / customer_metrics["Lifespan_Years"]
customer_metrics["CLV"] = average_order_value * orders_per_year * 3

# Persist the customer-level CLV table
clv_path = "../data/processed/customer_clv.csv"
customer_metrics.to_csv(clv_path, index=False)

print(f"CLV file saved successfully: {clv_path}")
print("CLV dataset shape:", customer_metrics.shape)




CUSTOMER LIFETIME VALUE (CLV) ANALYSIS
CLV file saved successfully: ../data/processed/customer_clv.csv
CLV dataset shape: (1986, 7)
