In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

np.random.seed(42)


In [2]:
n = 6000

# Synthetic behavioral signals
sessions_3m = np.random.gamma(shape=2.0, scale=10.0, size=n).astype(int)
product_views_3m = (sessions_3m * np.random.uniform(4, 12, size=n)).astype(int)
add_to_cart_3m = (product_views_3m * np.random.beta(2, 10, size=n)).astype(int)
add_to_view_ratio = np.clip(add_to_cart_3m / np.maximum(product_views_3m, 1), 0, 1)

# Synthetic transactional signals
orders_12m = np.random.poisson(lam=2.2, size=n)
orders_12m = np.clip(orders_12m, 0, 25)

avg_order_value = np.random.lognormal(mean=3.7, sigma=0.5, size=n)  # skewed like real AOV
revenue_12m = orders_12m * avg_order_value

# Margin as a fraction of revenue (varies across users)
margin_rate = np.random.normal(loc=0.18, scale=0.06, size=n)
margin_rate = np.clip(margin_rate, 0.02, 0.40)
margin_12m = revenue_12m * margin_rate

# Recency: high-value users often more recent, but not always (adds realism)
recency_days = np.random.exponential(scale=45, size=n).astype(int)
recency_days = np.clip(recency_days, 0, 365)

df = pd.DataFrame({
    "sessions_3m": sessions_3m,
    "product_views_3m": product_views_3m,
    "add_to_cart_3m": add_to_cart_3m,
    "add_to_view_ratio": add_to_view_ratio,
    "orders_12m": orders_12m,
    "revenue_12m": revenue_12m,
    "margin_12m": margin_12m,
    "recency_days": recency_days
})

df.head()


Unnamed: 0,sessions_3m,product_views_3m,add_to_cart_3m,add_to_view_ratio,orders_12m,revenue_12m,margin_12m,recency_days
0,23,227,65,0.286344,2,135.687206,21.060475,28
1,14,70,8,0.114286,2,29.891517,7.28064,89
2,13,105,2,0.019048,1,32.877436,5.456396,47
3,13,81,16,0.197531,3,146.314676,21.0551,52
4,46,243,12,0.049383,2,49.106528,7.257216,8


In [3]:
features = [
    "sessions_3m", "product_views_3m", "add_to_cart_3m", "add_to_view_ratio",
    "orders_12m", "revenue_12m", "margin_12m", "recency_days"
]

X = df[features].copy()

# Log-transform skewed monetary features (typical in commerce data)
for col in ["revenue_12m", "margin_12m", "product_views_3m", "add_to_cart_3m", "sessions_3m"]:
    X[col] = np.log1p(X[col])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [4]:
scores = []
k_values = list(range(3, 9))

for k in k_values:
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = model.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    scores.append(score)

pd.DataFrame({"k": k_values, "silhouette": scores}).sort_values("silhouette", ascending=False)


[WinError 2] The system cannot find the file specified
  File "C:\Users\p.mostafavi\AppData\Local\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\p.mostafavi\AppData\Local\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\p.mostafavi\AppData\Local\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\p.mostafavi\AppData\Local\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Unnamed: 0,k,silhouette
0,3,0.251763
3,6,0.199701
1,4,0.198046
4,7,0.193652
2,5,0.192829
5,8,0.189277


In [5]:
best_k = k_values[int(np.argmax(scores))]
best_k


3

In [6]:
kmeans = KMeans(n_clusters=best_k, n_init=20, random_state=42)
df["cluster"] = kmeans.fit_predict(X_scaled)

df["cluster"].value_counts().sort_index()


cluster
0    3199
1     698
2    2103
Name: count, dtype: int64

In [7]:
profile = df.groupby("cluster")[features].agg(["mean", "median"])
profile


Unnamed: 0_level_0,sessions_3m,sessions_3m,product_views_3m,product_views_3m,add_to_cart_3m,add_to_cart_3m,add_to_view_ratio,add_to_view_ratio,orders_12m,orders_12m,revenue_12m,revenue_12m,margin_12m,margin_12m,recency_days,recency_days
Unnamed: 0_level_1,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median,mean,median
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,26.701157,24.0,219.363239,187.0,37.899344,29.0,0.185677,0.169697,2.52954,2.0,116.004183,90.276605,21.096176,15.307135,45.197874,31.0
1,20.352436,18.0,162.719198,129.5,26.163324,16.0,0.156846,0.141678,0.060172,0.0,0.912574,0.0,0.109901,0.0,45.535817,30.0
2,8.613409,8.0,61.977175,57.0,6.29767,6.0,0.117205,0.1,2.445554,2.0,111.97394,86.176143,20.21094,14.913437,43.810271,31.0


In [8]:
summary = df.groupby("cluster").agg(
    users=("cluster", "count"),
    orders_12m=("orders_12m", "mean"),
    revenue_12m=("revenue_12m", "mean"),
    margin_12m=("margin_12m", "mean"),
    recency_days=("recency_days", "mean"),
    sessions_3m=("sessions_3m", "mean"),
    add_to_view_ratio=("add_to_view_ratio", "mean")
).sort_values("margin_12m", ascending=False)

summary


Unnamed: 0_level_0,users,orders_12m,revenue_12m,margin_12m,recency_days,sessions_3m,add_to_view_ratio
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,3199,2.52954,116.004183,21.096176,45.197874,26.701157,0.185677
2,2103,2.445554,111.97394,20.21094,43.810271,8.613409,0.117205
1,698,0.060172,0.912574,0.109901,45.535817,20.352436,0.156846


In [9]:
def label_segment(row):
    if row["orders_12m"] >= 5 and row["recency_days"] <= 30 and row["margin_12m"] >= summary["margin_12m"].median():
        return "High Value Active"
    if row["orders_12m"] >= 3 and row["recency_days"] > 60:
        return "High Value At Risk"
    if row["orders_12m"] <= 1 and row["sessions_3m"] >= np.percentile(df["sessions_3m"], 70):
        return "Engaged Non-Buyer"
    if row["orders_12m"] == 0 and row["sessions_3m"] <= np.percentile(df["sessions_3m"], 30):
        return "Low Engagement"
    return "Mid Value / Growth Potential"

segment_map = summary.apply(label_segment, axis=1).to_dict()
segment_map


{0: 'Mid Value / Growth Potential',
 2: 'Mid Value / Growth Potential',
 1: 'Mid Value / Growth Potential'}

In [10]:
df["segment_label"] = df["cluster"].map(segment_map)
df["segment_label"].value_counts()


segment_label
Mid Value / Growth Potential    6000
Name: count, dtype: int64

In [11]:
exec_view = df.groupby("segment_label").agg(
    users=("segment_label", "count"),
    avg_orders=("orders_12m", "mean"),
    avg_revenue=("revenue_12m", "mean"),
    avg_margin=("margin_12m", "mean"),
    avg_recency=("recency_days", "mean"),
    avg_engagement=("sessions_3m", "mean")
).sort_values("avg_margin", ascending=False)

exec_view


Unnamed: 0_level_0,users,avg_orders,avg_revenue,avg_margin,avg_recency,avg_engagement
segment_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Mid Value / Growth Potential,6000,2.212833,101.202593,18.344497,44.750833,19.622833
