In [5]:
import  pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [6]:
# Load the customer spending data from a CSV located relative to the
# working directory, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='spending_l9_dataset.csv')
df.head(n=5)

Unnamed: 0,CustomerID,Age,Income_$,SpendingScore,VisitsPerMonth,OnlinePurchases,Gender,Region
0,1,28,33,78,14,9,Female,East
1,2,21,25,87,8,23,Male,North
2,3,23,24,88,13,10,Male,South
3,4,24,25,73,16,11,Female,West
4,5,20,23,88,17,16,Male,West


In [7]:
# Names of the two columns used as clustering inputs. The same list is
# reused later to label the cluster centres and to select columns for the
# spot check.
FEATURES = ['Income_$', 'SpendingScore']

# Take an independent copy of just those columns so the feature matrix is
# decoupled from later changes to `df`.
X = df.loc[:, FEATURES].copy()

In [9]:

# Fit the scaler on the feature matrix, then apply it. The fitted `scaler`
# is kept because it is needed later to map cluster centres back to the
# original units via inverse_transform.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [10]:
# Elbow scan: fit one KMeans model per candidate cluster count (1 through 9)
# on the scaled features and report its inertia (printed as SSE), so the
# point where the curve flattens can be read off the output.
for k in range(1, 10):
    # fit() returns the estimator itself, so construction and fitting chain.
    km = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
    print(f'k = {k} -> SSE = {km.inertia_:.2f}')

k = 1 -> SSE = 400.00
k = 2 -> SSE = 199.70
k = 3 -> SSE = 79.37
k = 4 -> SSE = 21.37
k = 5 -> SSE = 19.09
k = 6 -> SSE = 15.65
k = 7 -> SSE = 14.48
k = 8 -> SSE = 13.81
k = 9 -> SSE = 12.94


In [11]:
# Number of clusters for the final model. In the scan output above, SSE
# falls steeply up to k = 4 and changes little afterwards.
k = 4

# Fit the final model on the scaled features and keep the per-row cluster
# assignments.
# NOTE(review): n_init is left at the library default, which is not the same
# in every scikit-learn release - consider pinning it; TODO confirm.
kmeans = KMeans(n_clusters=k, random_state=42).fit(X_scaled)
labels = kmeans.labels_

In [27]:
# Store each row's cluster assignment on the main frame as an integer
# column. Plain assignment overwrites the column if it already exists, so
# this cell can be re-run safely.
df["cluster"] = labels.astype(int)

# Banner first, then the first 50 rows as the cell's displayed value.
print("\n------------------SAMPLE WITH CLUSTERS------------------")
df.head(n=50)


------------------SAMPLE WITH CLUSTERS------------------


Unnamed: 0,CustomerID,Age,Income_$,SpendingScore,VisitsPerMonth,OnlinePurchases,Gender,Region,cluster
0,1,28,33,78,14,9,Female,East,2
1,2,21,25,87,8,23,Male,North,2
2,3,23,24,88,13,10,Male,South,2
3,4,24,25,73,16,11,Female,West,2
4,5,20,23,88,17,16,Male,West,2
5,6,23,22,80,19,21,Female,West,2
6,7,22,16,96,9,19,Female,West,2
7,8,18,26,86,16,22,Female,West,2
8,9,26,23,98,17,15,Male,South,2
9,10,24,29,81,16,17,Male,West,2


In [18]:
# Internal cluster-quality metrics, both computed on the scaled features
# against the assignments held in `labels`.
shs = silhouette_score(X_scaled, labels)
dbi = davies_bouldin_score(X_scaled, labels)

# Emit the banner and both metric lines with a single print call; joining
# with newlines produces the same text as three separate prints.
print("\n".join([
    "\n------------------METRICS-------------------",
    f"Silhouette Score : {shs:.3f} (closer to +1 is better)",
    f"Davies–Bouldin   : {dbi:.3f} (lower is better)",
]))


------------------METRICS-------------------
Silhouette Score : 0.729 (closer to +1 is better)
Davies–Bouldin   : 0.387 (lower is better)


In [21]:
# Cluster centres as learned by the model, i.e. in standardised space.
centers_scaled = kmeans.cluster_centers_

# Undo the standardisation so the centres read in the original column units.
centers_original = scaler.inverse_transform(centers_scaled)

# One row per cluster, one column per feature; the row index is named
# 'Cluster'.
df_centers = pd.DataFrame(centers_original, columns=FEATURES).rename_axis('Cluster')

In [22]:
# Banner, then the centres table rounded to two decimals as the cell's
# displayed value (rounding returns a new frame; df_centers is unchanged).
print("\n------------------CLUSTER CENTERS (Original Units)------------------")
df_centers.round(decimals=2)


------------------CLUSTER CENTERS (Original Units)------------------


Unnamed: 0_level_0,Income_$,SpendingScore
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,56.32,53.58
1,28.92,19.6
2,24.14,83.1
3,99.16,79.24


In [32]:
# Spot-check a few customers: show their feature values next to the cluster
# stored on `df`, and verify those stored labels against the fitted model.
# The entries are index *labels* (looked up with .loc), so every one of them
# must exist in df.index.
sample_idx = [4, 50, 100]
sanity = df.loc[sample_idx, FEATURES + ["cluster"]]

# Re-derive the assignment for the sampled rows from the fitted scaler and
# model. If the `cluster` column is stale (for example the model was refit
# but the labelling cell was not re-run), this fails loudly instead of
# printing numbers that merely look plausible.
expected_clusters = kmeans.predict(scaler.transform(df.loc[sample_idx, FEATURES]))
assert (sanity["cluster"].to_numpy() == expected_clusters).all(), (
    "Stored cluster labels disagree with the fitted KMeans model; "
    "re-run the fitting and labelling cells."
)

print("\n------------------3 SANITY CHECKS------------------")
print(sanity)


------------------3 SANITY CHECKS------------------
     Income_$  SpendingScore  cluster
4          23             88        2
50         60             73        0
100        32             20        1


In [33]:
# Write the current contents of `df` to a CSV in the working directory.
# index=False leaves the row index out of the file.
df.to_csv(path_or_buf='spending_labeled_clusters.csv', index=False)