In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Apply seaborn's "whitegrid" theme globally so every figure below shares it.
sns.set_theme(style="whitegrid")

In [None]:
# Load the raw orders table. The CSV is produced by a generator script, so a
# missing file gets an actionable message naming that script.
path = Path('data/orders.csv')
if path.exists():
    orders = pd.read_csv(path)
else:
    raise FileNotFoundError(f"Missing {path}. Run: python scripts/generate_demo_datasets.py (from repo root)")
orders.head()

In [None]:
# Convert the order dates to real datetimes so date arithmetic works downstream.
order_dates = pd.to_datetime(orders['order_date'])
orders['order_date'] = order_dates

# Reference date for the analysis: one day after the most recent order.
analysis_date = order_dates.max() + pd.Timedelta(days=1)
analysis_date

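## Build RFM table
For each customer we compute Recency (days since their last order), Frequency (number of distinct orders), and Monetary value (total revenue).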

In [None]:
# Collapse orders to one row per customer using built-in reducers only.
# Taking the last order date here (instead of computing recency inside a
# per-group lambda) avoids calling Python code once per customer.
per_customer = orders.groupby('customer_id').agg(
    last_order=('order_date', 'max'),
    frequency=('order_id', 'nunique'),
    monetary=('revenue', 'sum'),
)

# Recency = whole days between each customer's last order and analysis_date,
# computed for all customers in one vectorised subtraction. A customer with
# no valid order date gets a missing recency rather than aborting the
# aggregation with a ValueError.
rfm = (
    per_customer
    .assign(recency_days=(analysis_date - per_customer['last_order']).dt.days)
    .loc[:, ['recency_days', 'frequency', 'monetary']]
    .reset_index()
)
rfm.head()

In [None]:
# Summary statistics of the three RFM features, shown one feature per row.
rfm_summary = rfm[['recency_days', 'frequency', 'monetary']].describe()
rfm_summary.transpose()

## Simple segmentation with K-Means
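We standardize the three RFM features (zero mean, unit variance) so that no single feature dominates the distance computation, then cluster customers into 4 segments with K-Means.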

In [None]:
# Number of segments to form.
k = 4

# Feature matrix: a copy of the three RFM columns, standardized so they are
# on a comparable scale before the distance-based clustering.
X = rfm[['recency_days', 'frequency', 'monetary']].copy()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# A fixed seed with 10 initialisations keeps the assignment reproducible.
# The resulting labels are arbitrary cluster ids, not a ranking.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
rfm['segment'] = kmeans.fit(X_scaled).labels_
rfm.head()

In [None]:
# Mean, median and count of each RFM feature within every segment.
segment_stats = rfm.groupby('segment')[['recency_days', 'frequency', 'monetary']].agg(
    ['mean', 'median', 'count']
)

# Columns form a (feature, statistic) MultiIndex, hence the tuple sort key;
# segments are listed from highest to lowest average monetary value.
seg_profile = segment_stats.sort_values(by=('monetary', 'mean'), ascending=False)
seg_profile

In [None]:
# Box plot of each customer's total revenue, grouped by assigned segment.
fig, ax = plt.subplots(figsize=(7, 4))
sns.boxplot(data=rfm, x='segment', y='monetary', ax=ax)
ax.set_title('Monetary value by segment')
ax.set_ylabel('Total revenue')
plt.show()