In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [3]:
# Load the customer spending dataset.
#
# The CSV has a leading column with no header name, which pandas would load as
# a feature called "Unnamed: 0" (it looks like a row index written by an
# earlier export -- TODO confirm). Left in place, that column would travel
# through the whole notebook and be written back out with the labeled data.
# Filter it out at read time; the default 0..n-1 row labels are unchanged, so
# label-based row lookups further down keep working.
df = pd.read_csv(
    "spending_l9_dataset.csv",
    usecols=lambda column: not column.startswith("Unnamed:"),
)
df.head()

Unnamed: 0,CustomerID,Age,Income_$,SpendingScore,VisitsPerMonth,OnlinePurchases,Gender,Region
0,1,28,33,78,14,9,Female,East
1,2,21,25,87,8,23,Male,North
2,3,23,24,88,13,10,Male,South
3,4,24,25,73,16,11,Female,West
4,5,20,23,88,17,16,Male,West


In [4]:
# Clustering is done on two columns only: Income_$ and SpendingScore.
feature_cols = ["Income_$", "SpendingScore"]
X = df.loc[:, feature_cols].copy()

# Fill any missing values with the median of the column they belong to.
column_medians = X.median(numeric_only=True)
X = X.fillna(value=column_medians)

# Standardize the features so that neither column dominates the Euclidean
# distances K-Means relies on. The fitted scaler is kept in `scaler` so the
# cluster centers can later be mapped back to the original units.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [5]:
# Elbow sweep: fit K-Means for k = 1..10 on the scaled features and record
# each model's inertia (within-cluster sum of squared distances).
print("=== SSE for k=1..10 ===")
sse = []
for k in range(1, 11):
    # Fixed random_state and an explicit n_init keep the sweep reproducible.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertia = kmeans.inertia_
    sse.append(inertia)
    print(f"k={k} -> SSE={inertia:.4f}")

=== SSE for k=1..10 ===




k=1 -> SSE=400.0000
k=2 -> SSE=199.6976
k=3 -> SSE=77.0130
k=4 -> SSE=21.3715




k=5 -> SSE=17.9329
k=6 -> SSE=15.6498
k=7 -> SSE=13.8810




k=8 -> SSE=12.4545
k=9 -> SSE=11.0618
k=10 -> SSE=9.9340




In [6]:
# Choose the number of clusters by silhouette score over k = 2..5
# (silhouette is undefined for a single cluster, so the sweep starts at 2).
best_k, best_score = None, -1

print("\n=== Silhouette Scores (k=2..5) ===")
for k in range(2, 6):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit(X_scaled).labels_
    sil = silhouette_score(X_scaled, labels)
    print(f"k={k} -> silhouette={sil:.4f}")
    # Strict comparison: on a tie the smaller k seen first is retained.
    if sil > best_score:
        best_k, best_score = k, sil

print(f"\nChosen K (best silhouette in 2..5) = {best_k}")



=== Silhouette Scores (k=2..5) ===
k=2 -> silhouette=0.5122




k=3 -> silhouette=0.6109
k=4 -> silhouette=0.7286
k=5 -> silhouette=0.6504

Chosen K (best silhouette in 2..5) = 4




In [7]:
# Refit K-Means with the selected number of clusters, using the same seed and
# n_init as the sweeps, and keep the fitted model for later inspection.
final_kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10).fit(X_scaled)
labels = final_kmeans.labels_

# Attach each row's cluster assignment to the main frame.
df["Cluster"] = labels




In [8]:
# Internal validation of the final clustering on the scaled features:
# silhouette (higher is better) and Davies-Bouldin index (lower is better).
silhouette, dbi = (
    silhouette_score(X_scaled, labels),
    davies_bouldin_score(X_scaled, labels),
)

print(f"Silhouette Score : {silhouette:.3f}")
print(f"Davies–Bouldin   : {dbi:.3f}")


Silhouette Score : 0.729
Davies–Bouldin   : 0.387


In [9]:
# Map the fitted centroids from standardized space back to the original
# feature units so they can be read directly.
centers_scaled = final_kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

# Take the column labels from X, the frame the scaler was fit on, instead of
# repeating the feature list by hand: the labels then always match the order
# of the values returned by inverse_transform, even if the feature selection
# is edited later.
centers_df = pd.DataFrame(centers_original, columns=X.columns).round(2)

# Row i holds the centroid of the cluster labeled i.
centers_df.index.name = "Cluster"

print("\n=== CLUSTER CENTERS (Original Units) ===")
print(centers_df)



=== CLUSTER CENTERS (Original Units) ===
         Income_$  SpendingScore
Cluster                         
0           56.32          53.58
1           28.92          19.60
2           24.14          83.10
3           99.16          79.24


In [10]:
# Spot-check a few hand-picked rows (selected by index label) to confirm the
# assigned cluster is plausible given the row's income and spending score.
sample_rows = [35, 146, 196]
shown_cols = ["CustomerID", "Income_$", "SpendingScore", "Cluster"]

print("\n=== SANITY CHECK SAMPLE ROWS ===")
print(df.loc[sample_rows, shown_cols])



=== SANITY CHECK SAMPLE ROWS ===
     CustomerID  Income_$  SpendingScore  Cluster
35           36        21             70        2
146         147        31             22        1
196         197       115             87        3


In [11]:
# Persist the frame, including the new Cluster column, without writing the
# row index as an extra column.
output_path = "spending_labeled_clusters.csv"
df.to_csv(output_path, index=False)
print(f"\nSaved labeled dataset to: {output_path}")



Saved labeled dataset to: spending_labeled_clusters.csv
