In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [13]:
CSV_PATH = "../../../dataset/spending_l9_dataset.csv"
df = pd.read_csv(CSV_PATH)
print(df.head().isnull().sum())

CustomerID         0
Age                0
Income_$           0
SpendingScore      0
VisitsPerMonth     0
OnlinePurchases    0
Gender             0
Region             0
dtype: int64


In [15]:
FEATURES = ["Income_$", "SpendingScore"]
X = df[FEATURES]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)

[[-0.62783049  0.72998073]
 [-0.89031514  1.07359091]
 [-0.92312573  1.11176982]
 [-0.89031514  0.53908619]
 [-0.95593631  1.11176982]
 [-0.98874689  0.80633855]
 [-1.18561038  1.41720108]
 [-0.85750456  1.035412  ]
 [-0.95593631  1.4935589 ]
 [-0.75907282  0.84451746]
 [-0.98874689  0.88269636]
 [-1.25123155  1.07359091]
 [-0.95593631  1.30266436]
 [-0.66064107  1.14994872]
 [-0.98874689  0.61544401]
 [-1.08717864  1.035412  ]
 [-1.11998922  0.99723309]
 [-1.11998922  0.84451746]
 [-0.98874689  0.76815964]
 [-1.11998922  1.4935589 ]
 [-0.69345165  0.84451746]
 [-0.98874689  0.72998073]
 [-0.92312573  1.18812763]
 [-0.69345165  1.035412  ]
 [-0.69345165  0.95905418]
 [-0.98874689  1.22630654]
 [-0.89031514  1.11176982]
 [-0.82469398  1.07359091]
 [-0.95593631  1.26448545]
 [-1.21842096  1.07359091]
 [-0.82469398  0.80633855]
 [-0.7918834   0.88269636]
 [-0.85750456  0.80633855]
 [-1.08717864  1.035412  ]
 [-0.95593631  0.65362291]
 [-1.02155747  0.42454946]
 [-0.98874689  0.99723309]
 

In [16]:
for k in range(1, 11):
    km = KMeans(n_clusters=k, n_init="auto", random_state=42)
    km.fit(X_scaled)
    print(f"k={k} → SSE={km.inertia_:.2f}")

k=1 → SSE=400.00
k=2 → SSE=199.70
k=3 → SSE=79.37
k=4 → SSE=21.37
k=5 → SSE=19.09
k=6 → SSE=15.65
k=7 → SSE=14.48
k=8 → SSE=13.81
k=9 → SSE=12.94
k=10 → SSE=11.52


In [18]:
K = 4
kmeans = KMeans(n_clusters=K, n_init="auto", random_state=42)
labels = kmeans.fit_predict(X_scaled)

df["Cluster"] = labels.astype(int)
print(df['Cluster'])

0      2
1      2
2      2
3      2
4      2
      ..
195    3
196    3
197    3
198    3
199    3
Name: Cluster, Length: 200, dtype: int64


In [None]:
sil = silhouette_score(X_scaled, labels)
dbi = davies_bouldin_score(X_scaled, labels)
print(f"Silhouette Score : {sil:.3f} (closer to +1 is better)")
print(f"Davies–Bouldin   : {dbi:.3f} (lower is better)")


=== METRICS ===
Silhouette Score : 0.729 (closer to +1 is better)
Davies–Bouldin   : 0.387 (lower is better)


In [20]:
centers_scaled = kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

centers_df = pd.DataFrame(centers_original, columns=FEATURES)
centers_df.index.name = "Cluster"
print(centers_df.round(2))

         Income_$  SpendingScore
Cluster                         
0           56.32          53.58
1           28.92          19.60
2           24.14          83.10
3           99.16          79.24


In [26]:
sample_idx = [0, 54, 112]
sanity = df.loc[sample_idx, FEATURES + ["Cluster"]]
print(sanity)

     Income_$  SpendingScore  Cluster
0          33             78        2
54         65             54        0
112        26             18        1


In [27]:
OUT_PATH = "spending_labeled_clusters.csv"
df.to_csv(OUT_PATH, index=False)