In [28]:
import pandas as pd
import os
# Limit OpenMP to a single thread; this is set before the scikit-learn imports below.
# NOTE(review): presumably this silences the KMeans/MKL threading warning seen on
# some platforms — confirm it is still required for this environment.
os.environ["OMP_NUM_THREADS"] = "1"
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score


# Load the customer spending dataset and show its first rows.
CSV_PATH = "spending_l9_dataset.csv"
df = pd.read_csv(CSV_PATH)
print("INITIAL SNAPSHOT ", df.head(), sep="\n")


INITIAL SNAPSHOT 
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  
0  Female   East  
1    Male  North  
2    Male  South  
3  Female   West  
4    Male   West  


In [29]:

# Columns used for clustering; X is an independent copy so df stays untouched
# by the imputation performed on X.
FEATURES = ["Income_$", "SpendingScore"]
X = df.loc[:, FEATURES].copy()



In [30]:
# Replace missing values in each feature column with that column's median.
for col in FEATURES:
    values = X[col]
    if not values.isna().any():
        continue  # nothing to impute in this column
    X[col] = values.fillna(values.median())


In [31]:
# Preview the first five rows of the feature matrix after imputation.
print(X.head(5))

   Income_$  SpendingScore
0        33             78
1        25             87
2        24             88
3        25             73
4        23             88


In [32]:
# Scale features: fit the scaler on X, then transform X with the fitted scaler.
# The fitted scaler is kept so cluster centres can be mapped back later.
scaler = StandardScaler()
scaler.fit(X)
x_scaled = scaler.transform(X)
print("scaled shape:", x_scaled.shape)

scaled shape: (200, 2)


In [33]:
# Elbow method: fit K-Means for k = 1..10 and print the inertia (SSE) of each fit.
print("== Elbow method (SSE per k) ===")
for k in range(1, 11):
    # fit() returns the fitted estimator, so construction and fitting can be chained
    km = KMeans(n_clusters=k, n_init="auto", random_state=42).fit(x_scaled)
    print(f"k={k} → SSE={km.inertia_:.2f}")


== Elbow method (SSE per k) ===
k=1 → SSE=400.00
k=2 → SSE=199.70
k=3 → SSE=79.37
k=4 → SSE=21.37
k=5 → SSE=19.09
k=6 → SSE=15.65
k=7 → SSE=14.48
k=8 → SSE=13.81
k=9 → SSE=12.94
k=10 → SSE=11.52


In [35]:
# Fit K-Means with the chosen number of clusters K and keep the per-row labels.
K = 4
kmeans = KMeans(n_clusters=K, n_init="auto", random_state=42)
kmeans.fit(x_scaled)
labels = kmeans.labels_


In [36]:
# Attach each row's cluster id to df and print the first 90 rows.
df["Cluster"] = labels.astype(int)
print("sample clustering", df.head(90), sep="\n")


sample clustering
    CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0            1   28        33             78              14                9   
1            2   21        25             87               8               23   
2            3   23        24             88              13               10   
3            4   24        25             73              16               11   
4            5   20        23             88              17               16   
..         ...  ...       ...            ...             ...              ...   
85          86   33        46             44               7                7   
86          87   35        61             54              11                7   
87          88   29        46             53              11                9   
88          89   28        57             57              10                8   
89          90   37        56             54               8               10   

    Gende

In [37]:
# Evaluate the clustering on the scaled features with two internal metrics.
sil = silhouette_score(x_scaled, labels)
dbi = davies_bouldin_score(x_scaled, labels)
print(
    "METRICS",
    f"Silhouette Score : {sil:.3f} (closer to +1 is better)",
    f"Davies–Bouldin   : {dbi:.3f} (lower is better)",
    sep="\n",
)


METRICS
Silhouette Score : 0.729 (closer to +1 is better)
Davies–Bouldin   : 0.387 (lower is better)


In [38]:
# Map the cluster centres from scaled space back to the original feature units
# using the same scaler that produced x_scaled.
centers_scaled = kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

# One row per cluster, one column per feature; the index is labelled "Cluster".
centers_df = pd.DataFrame(centers_original, columns=FEATURES).rename_axis("Cluster")

print("cluster center (Original Units)")
print(centers_df.round(2))

cluster center (Original Units)
         Income_$  SpendingScore
Cluster                         
0           56.32          53.58
1           28.92          19.60
2           24.14          83.10
3           99.16          79.24


In [47]:

# Sanity check: show the features and assigned cluster for three hand-picked rows.
# The view is built from a label Series aligned to df's index rather than by
# writing a second, lower-case "cluster" column into df: df already has a
# "Cluster" column, and a duplicate would end up in the saved CSV.
sample_index = [9, 79, 2]
cluster_by_row = pd.Series(labels, index=df.index, name="cluster")
sanity = df.loc[sample_index, FEATURES].assign(
    cluster=cluster_by_row.loc[sample_index]
)
print("sanity check 3 sample")
print(sanity)

sanity check 3 sample
    Income_$  SpendingScore  cluster
9         29             81        2
79        56             58        0
2         24             88        2


In [39]:
# Save the clustered frame to its own file. Writing back to the input path
# ("spending_l9_dataset.csv") would overwrite the raw data, and a later re-run
# of the notebook would then load a CSV that already contains cluster columns.
OUT_PATH = "spending_l9_dataset_clustered.csv"
df.to_csv(OUT_PATH, index=False)
print(f"spending_l9_dataset.csv → {OUT_PATH}")

spending_l9_dataset.csv → spending_l9_dataset.csv
