In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [2]:
CSV_PATH = "dataset/spending_l9_dataset.csv"
df = pd.read_csv(CSV_PATH)
print(df.head())

   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  
0  Female   East  
1    Male  North  
2    Male  South  
3  Female   West  
4    Male   West  


In [6]:
FEATURES = ["Income_$", "SpendingScore"]
print(df[FEATURES].head())

   Income_$  SpendingScore
0        33             78
1        25             87
2        24             88
3        25             73
4        23             88


In [7]:
print(df[FEATURES].isnull().sum())

Income_$         0
SpendingScore    0
dtype: int64


In [8]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[FEATURES])
print("\nScaled shape:", X_scaled.shape)


Scaled shape: (200, 2)


In [9]:
print("\n=== ELBOW METHOD (SSE per k) ===")
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(df[FEATURES])
    sse = kmeans.inertia_
    print(f"k={k}, SSE={sse:.2f}")


=== ELBOW METHOD (SSE per k) ===
k=1, SSE=322990.47
k=2, SSE=146350.87
k=3, SSE=61634.38
k=4, SSE=16807.10
k=5, SSE=15218.93
k=6, SSE=12628.97
k=7, SSE=11712.73
k=8, SSE=10494.88
k=9, SSE=8947.34
k=10, SSE=8180.89


In [10]:
K = 4
kmeans = KMeans(n_clusters=K, n_init="auto", random_state=42)
labels = kmeans.fit_predict(X_scaled)

df["Cluster"] = labels.astype(int)
print(df.head())

   Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases Region  Cluster
0        33             78              14                9   East        2
1        25             87               8               23  North        2
2        24             88              13               10  South        2
3        25             73              16               11   West        2
4        23             88              17               16   West        2


In [11]:
sil = silhouette_score(X_scaled, labels)
dbi = davies_bouldin_score(X_scaled, labels)
print(f"\nSilhouette Score: {sil:.4f}")
print(f"Davies-Bouldin Index: {dbi:.4f}")


Silhouette Score: 0.7286
Davies-Bouldin Index: 0.3871


**Cluster centers**

In [12]:
centers_scaled = kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

print("\nCluster centers (original scale):")
centers_df = pd.DataFrame(centers_original, columns=FEATURES)
centers_df.index.name = "Cluster"

print(centers_df.round(2))


Cluster centers (original scale):
         Income_$  SpendingScore
Cluster                         
0           56.32          53.58
1           28.92          19.60
2           24.14          83.10
3           99.16          79.24


**Sanity checks**

In [13]:
sample_idx = [9, 30, 100, 110] 
sanity = df.loc[sample_idx, FEATURES + ["Cluster"]]

print("\nSample data points with cluster assignments:")
print(sanity)


Sample data points with cluster assignments:
     Income_$  SpendingScore  Cluster
9          29             81        2
30         27             80        2
100        32             20        1
110        34             21        1


In [15]:
#save to new CSV
import os
OUTPUT_CSV = "dataset/with_clusters_spending_l9_dataset.csv"
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
df.to_csv(OUTPUT_CSV, index=False)