In [None]:
import os
os.environ["OMP_NUM_THREADS"] = "1"  # Fix memory leak warning on Windows with MKL

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score , davies_bouldin_score


In [None]:
# ================================
# 1) Load dataset
# ================================

In [None]:
# Load the customer spending dataset from a CSV located in the working directory.
dataset_path = "spending_l9_dataset.csv"
df = pd.read_csv(dataset_path)

In [None]:
# Quick look at the data: the first three rows, then the (rows, columns) shape.
print(df.head(3), df.shape, sep="\n")

   CustomerID  Gender  Age  Annual_Income_($)  Spending_Score_(1-100)
0           1    Male   19                 15                      39
1           2    Male   21                 15                      81
2           3  Female   20                 16                       6
(200, 5)


In [None]:
# ================================
# 2) Select useful features
# ================================

In [None]:
# Names of the columns fed to the clustering model.
FEATURES = [
    "Annual_Income_($)",
    "Spending_Score_(1-100)",
]

In [None]:
# Take an independent copy of the feature columns so later preprocessing
# does not touch `df`.
X = df.loc[:, FEATURES].copy()

In [None]:
# Median-impute missing values; columns without any gaps are left untouched.
cols_with_gaps = [name for name in FEATURES if X[name].isna().any()]
for col in cols_with_gaps:
    column = X[col]
    X[col] = column.fillna(column.median())

In [None]:
# ================================
# 3) Scale the features
# ================================

In [None]:
# Standardizer that centres each feature and scales it to unit variance
# (both flags are the library defaults, spelled out here for the reader).
scaler = StandardScaler(with_mean=True, with_std=True)

In [None]:
# Learn per-feature mean/std from X, then apply them: each column of the
# result has mean 0 and standard deviation 1.
x_scaled = scaler.fit(X).transform(X)

In [None]:
# ================================
# 4) Elbow Method (SSE values)
# ================================

In [None]:
# Elbow scan: fit K-Means for k = 1..10 and print each model's inertia,
# i.e. the within-cluster sum of squared errors (SSE).
for k in range(1, 11):
    km = KMeans(n_clusters=k, random_state=42, n_init="auto").fit(x_scaled)
    print(f" k = {k} -> SSE = {km.inertia_:.2f}")

 k = 1 -> SSE = 400.00
 k = 2 -> SSE = 273.67
 k = 3 -> SSE = 157.70
 k = 4 -> SSE = 109.23
 k = 5 -> SSE = 65.57
 k = 6 -> SSE = 60.13
 k = 7 -> SSE = 49.67
 k = 8 -> SSE = 37.32
 k = 9 -> SSE = 32.50
 k = 10 -> SSE = 30.06


In [None]:
# ================================
# 5) Final Model with chosen K
# ================================

In [None]:
# Final model with k=5, the point where the SSE curve from the elbow scan
# flattens out (109.23 -> 65.57 -> 60.13 for k = 4, 5, 6).
# n_init is passed explicitly so this fit uses the same initialisation policy
# as the elbow loop; relying on the default gives n_init=10 (plus a
# FutureWarning) on scikit-learn 1.2-1.3, which may not reproduce the scan.
kmeans = KMeans(n_clusters=5, random_state=42, n_init="auto")

In [None]:
# Fit the final model and read off the cluster index assigned to each row.
labels = kmeans.fit(x_scaled).labels_

In [None]:
# Store each row's cluster id on the original frame as an integer column.
cluster_ids = labels.astype(int)
df["Cluster"] = cluster_ids

In [None]:
# ================================
# 6) Evaluate clustering
# ================================

In [None]:
# Mean silhouette coefficient over all samples (range -1..1, higher = better separation).
sil = silhouette_score(X=x_scaled, labels=labels)


In [None]:
# Davies-Bouldin index (minimum 0, lower = more compact and better separated clusters).
dbi = davies_bouldin_score(X=x_scaled, labels=labels)

In [None]:
# Report the silhouette score, rounded to three decimals.
print(f"silhouette score : {sil:.3f} (closer to +1 is better)")

silhouette score : 0.555 (closer to +1 is better)


In [None]:
# Report the Davies-Bouldin index, rounded to three decimals.
print(f"davies  score : {dbi:.3f} (lower is better)")

davies  score : 0.572 (lower is better)


In [None]:
# ================================
# 7) Cluster centers in original units
# ================================

In [None]:
# Centroids of the fitted model; the model was fitted on x_scaled, so these
# coordinates are in standardized feature space.
centers_scaled = kmeans.cluster_centers_

In [None]:
# Undo the standardization so the centroids read in the original feature units.
centers_original = scaler.inverse_transform(centers_scaled)

In [None]:
# Tabulate the centroids: one row per cluster, one column per feature.
centers_df = pd.DataFrame(data=centers_original, columns=FEATURES)

In [None]:
# Show the centroids in original units, rounded to two decimals.
# The label and the table are separated by a newline: printing them on the
# same line pushes the DataFrame's header row out of alignment with its data rows.
print("back to original: ", centers_df.round(2), sep="\n")

back to original:     Annual_Income_($)  Spending_Score_(1-100)
0              55.30                   49.52
1              86.54                   82.13
2              25.73                   79.36
3              88.20                   17.11
4              26.30                   20.91


In [None]:
# ================================
# 8) Sanity check (few rows)
# ================================

In [None]:
# Spot-check the first three rows: feature values next to their assigned cluster.
simple_idx = [0, 1, 2]
sanity = df.loc[simple_idx, [*FEATURES, "Cluster"]]
print(sanity)

   Annual_Income_($)  Spending_Score_(1-100)  Cluster
0                 15                      39        4
1                 15                      81        2
2                 16                       6        4


In [None]:
# ================================
# 9) Save labeled dataset
# ================================

In [None]:
# Write the frame (including the "Cluster" column) to CSV in the working
# directory, without the row index.
out_path = "spending_labeled_clusters.csv"
df.to_csv(path_or_buf=out_path, index=False)
print("data saved")

data saved
