In [35]:
import os
# Cap OpenMP at a single thread. This assignment runs before pandas and
# scikit-learn are imported below, so the value is already in the environment
# when those libraries load.
# NOTE(review): presumably added to silence scikit-learn's KMeans/MKL
# memory-leak warning on Windows — confirm this is still needed.
os.environ["OMP_NUM_THREADS"] = "1"

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score


In [3]:
# Read the customer spending dataset (path is relative to the working
# directory) and preview its first five rows.
CSV_PATH = "spending_l9_dataset.csv"
df = pd.read_csv(filepath_or_buffer=CSV_PATH)
print("The first 5 rows", df.head(), sep="\n")

The first 5 rows
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  
0  Female   East  
1    Male  North  
2    Male  South  
3  Female   West  
4    Male   West  


In [5]:
# Names of the two columns used as clustering inputs.
Features = ["Income_$", "SpendingScore"]
# Work on an independent copy so edits to X never write back into df.
X = df.loc[:, Features].copy()


In [7]:
# Impute missing values in each feature column with that column's median.
# Columns that contain no missing values are skipped and left untouched.
for col in Features:
    column = X[col]
    if not column.isna().any():
        continue
    X[col] = column.fillna(column.median())

In [31]:
# Standardize the features. The fitted scaler is kept in `scaler` because a
# later cell calls scaler.inverse_transform to map cluster centers back to
# the original units.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
print("\nScaled : ", X_scaled.shape)


Scaled :  (200, 2)


In [11]:
# Elbow method: fit k-means for k = 1..10 on the scaled features and print
# each model's inertia_ (reported as SSE). The random_state is fixed so the
# printed values are repeatable.
print("        ELBOW METHOD (SSE per k)         ")
for k in range(1, 11):
    km = KMeans(n_clusters=k, n_init="auto", random_state=42).fit(X_scaled)
    print(f"k={k} → SSE={km.inertia_:.2f}")

        ELBOW METHOD (SSE per k)         
k=1 → SSE=400.00
k=2 → SSE=199.70
k=3 → SSE=79.37
k=4 → SSE=21.37
k=5 → SSE=19.09
k=6 → SSE=15.65
k=7 → SSE=14.48
k=8 → SSE=13.81
k=9 → SSE=12.94
k=10 → SSE=11.52


In [29]:
# Fit the final model with the chosen number of clusters. In the elbow
# output, SSE drops sharply up to k=4 and flattens afterwards.
k = 4
kmeans = KMeans(n_clusters=k, n_init="auto", random_state=42).fit(X_scaled)
labels = kmeans.labels_

# Attach each customer's cluster id to the original frame and preview it.
df["Cluster"] = labels.astype(int)
print("\n=== SAMPLE WITH CLUSTERS ===", df.head(), sep="\n")


=== SAMPLE WITH CLUSTERS ===
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  Cluster  
0  Female   East        2  
1    Male  North        2  
2    Male  South        2  
3  Female   West        2  
4    Male   West        2  


In [39]:
# Internal validation of the clustering, computed on the scaled features
# against the labels assigned by the fitted k-means model.
sil, dbi = silhouette_score(X_scaled, labels), davies_bouldin_score(X_scaled, labels)
print(
    "     METRICS     ",
    f"Silhouette Score : {sil:.2f} (closer to +1 is better)",
    f"Davies–Bouldin   : {dbi:.2f} (lower is better)",
    sep="\n",
)

     METRICS     
Silhouette Score : 0.73 (closer to +1 is better)
Davies–Bouldin   : 0.39 (lower is better)


In [53]:
# Cluster centers live in standardized space; undo the scaling so they can
# be read in the original feature units.
centers_scaled = kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

# One row per cluster, one column per feature, with the index named "Cluster".
centers_df = pd.DataFrame(centers_original, columns=Features).rename_axis("Cluster")

print("  CLUSTER CENTERS (Original Units)", centers_df.round(2), sep="\n")


  CLUSTER CENTERS (Original Units)
         Income_$  SpendingScore
Cluster                         
0           56.32          53.58
1           28.92          19.60
2           24.14          83.10
3           99.16          79.24


In [75]:
# Sanity check: show the features and assigned cluster for a few
# hand-picked customers.
# .loc selects by index label; df keeps the default integer index from
# read_csv, so these labels coincide with row numbers. Change freely.
sample_idx = [30, 72, 106, 56, 178, 28]
sanity_columns = [*Features, "Cluster"]
sanity = df.loc[sample_idx, sanity_columns]
print("   SANITY CHECK (6 Customers) ", sanity, sep="\n")

   SANITY CHECK (6 Customers) 
     Income_$  SpendingScore  Cluster
30         27             80        2
72         50             55        0
106        31             26        1
56         57             38        0
178        92             81        3
28         23             92        2


In [79]:
# Persist the frame, now including the "Cluster" column, to CSV.
# index=False leaves the row index out of the written file.
OUT_PATH = "spending_labeled_clusters.csv"
df.to_csv(path_or_buf=OUT_PATH, index=False)
