In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score


In [2]:
# 1) Load the dataset from CSV
CSV_PATH = "spending_l9_dataset.csv"
df = pd.read_csv(CSV_PATH)

# Banner followed by a preview of the first rows, emitted in one call.
print("\n=== INITIAL ===", df.head(), sep="\n")


=== INITIAL ===
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  
0  Female   East  
1    Male  North  
2    Male  South  
3  Female   West  
4    Male   West  


In [3]:
# 2) Select features for clustering
# Only these two columns are fed to the clustering steps.
FEATURES = ["Income_$", "SpendingScore"]

# Work on an independent copy so edits to X never touch df.
X = df.loc[:, FEATURES].copy()

In [4]:
# Fill missing values
# First find the feature columns that actually contain NaNs, then replace
# the gaps in each of them with that column's own median.
cols_with_gaps = [col for col in FEATURES if X[col].isna().any()]
for col in cols_with_gaps:
    X[col] = X[col].fillna(X[col].median())

In [5]:
# Preview the first rows of the feature matrix.
print("\n=== FEATURES HEAD ===", X.head(), sep="\n")


=== FEATURES HEAD ===
   Income_$  SpendingScore
0        33             78
1        25             87
2        24             88
3        25             73
4        23             88


In [6]:
# 3) Scale features
# Fit the scaler on X, then apply it to the same data. The fitted `scaler`
# is reused further down to map cluster centers back to original units.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
print("\nScaled shape: ", X_scaled.shape)



Scaled shape:  (200, 2)


In [9]:
# Set the OpenMP thread-count variable to 1 for this process.
# NOTE(review): this cell runs after scikit-learn was already imported in the
# first cell; confirm the variable is still honoured at this point, or move
# the setting above those imports.
import os

os.environ["OMP_NUM_THREADS"] = "1"


In [13]:
# 4) Elbow method
# Fit K-Means once per candidate k and report the model's inertia (printed
# as SSE). The "elbow" is the k after which the SSE stops dropping sharply.
K_RANGE = range(1, 11)  # candidate cluster counts: 1 through 10 inclusive

# Build the banner from K_RANGE so the printed bounds always match the loop
# (the previous hard-coded text said k1-k11 while only k=1..10 were fitted).
print(f"\n=== ELBOW METHOD (SSE  k{K_RANGE[0]}-k{K_RANGE[-1]}) ===")

sse_by_k = {}  # k -> SSE, kept so the curve can be inspected or plotted later
for k in K_RANGE:
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(X_scaled)
    sse_by_k[k] = km.inertia_
    print(f"k={k} -> SSE={km.inertia_:.2f}")


=== ELBOW METHOD (SSE  k1-k11) ===
k=1 -> SSE=400.00
k=2 -> SSE=199.70
k=3 -> SSE=77.01
k=4 -> SSE=21.37
k=5 -> SSE=17.93
k=6 -> SSE=15.65




k=7 -> SSE=13.88
k=8 -> SSE=12.45
k=9 -> SSE=11.06
k=10 -> SSE=9.93




In [14]:
# 5) Fit K-Means with best k
K = 4  # number of clusters for the final model
kmeans = KMeans(n_clusters=K, n_init=10, random_state=42).fit(X_scaled)
labels = kmeans.labels_  # cluster assignment for every row of X_scaled

# Attach each row's cluster id to the original frame and preview the result.
df["Cluster"] = labels.astype(int)
print("\n=== SAMPLE WITH CLUSTERS ===", df.head(), sep="\n")



=== SAMPLE WITH CLUSTERS ===
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  Cluster  
0  Female   East        2  
1    Male  North        2  
2    Male  South        2  
3  Female   West        2  
4    Male   West        2  




In [16]:
# Score the fitted clustering on the scaled features with two metrics.
sil = silhouette_score(X_scaled, labels)
dbi = davies_bouldin_score(X_scaled, labels)

# Assemble the report first, then emit it with a single print call.
report_lines = (
    "\n=== METRICS ===",
    f"Silhouette Score : {sil:.3f} (closer to +1 is better)",
    f"Davies–Bouldin   : {dbi:.3f} (lower is better)",
)
print(*report_lines, sep="\n")


=== METRICS ===
Silhouette Score : 0.729 (closer to +1 is better)
Davies–Bouldin   : 0.387 (lower is better)


In [17]:
# Cluster centers (original units)
# The model was fitted on scaled data, so its centers are in scaled space;
# undo the scaling to express them in the features' original units.
centers_scaled = kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

# One row per cluster, one column per feature; the index is named "Cluster".
centers_df = pd.DataFrame(centers_original, columns=FEATURES).rename_axis("Cluster")

print("\n=== CLUSTER CENTERS ===", centers_df.round(2), sep="\n")


=== CLUSTER CENTERS ===
         Income_$  SpendingScore
Cluster                         
0           56.32          53.58
1           28.92          19.60
2           24.14          83.10
3           99.16          79.24


In [18]:
# Spot-check a few rows: look up three index labels and show their feature
# values next to the cluster each one was assigned to.
sample_idx = [6, 100, 190]  # pick any three
sanity = df.loc[sample_idx, [*FEATURES, "Cluster"]]
print("\n=== SANITY CHECK (3 Customers) ===", sanity, sep="\n")


=== SANITY CHECK (3 Customers) ===
     Income_$  SpendingScore  Cluster
6          16             96        2
100        32             20        1
190       104             77        3


In [19]:
# Write the frame, including its "Cluster" column, to CSV without the index.
OUT_PATH = "spending_labeled_clusters.csv"
df.to_csv(path_or_buf=OUT_PATH, index=False)
print(f"\nSaved clustered dataset -> {OUT_PATH}")


Saved clustered dataset -> spending_labeled_clusters.csv
