In [64]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [65]:
CSV_PATH = "dataset/customers_l9_dataset.csv"
df = pd.read_csv(CSV_PATH)
print(df.head())

   CustomerID  Gender  Age  Annual Income ($)  Spending Score (1-100)
0           1    Male   19                 15                      39
1           2    Male   21                 15                      81
2           3  Female   20                 16                       6
3           4  Female   23                 16                      77
4           5  Female   31                 17                      40


In [66]:
df = df.drop(columns=['CustomerID','Gender','Age'])
print(df.head())
FEATURES = df.columns.tolist()
print(FEATURES)

   Annual Income ($)  Spending Score (1-100)
0                 15                      39
1                 15                      81
2                 16                       6
3                 16                      77
4                 17                      40
['Annual Income ($)', 'Spending Score (1-100)']


In [67]:
print(df[FEATURES].isnull().sum())

Annual Income ($)         0
Spending Score (1-100)    0
dtype: int64


In [68]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[FEATURES])
print("\nScaled shape:", X_scaled.shape)


Scaled shape: (200, 2)


In [77]:
print("\n=== ELBOW METHOD (SSE per k) ===")
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(df[FEATURES])
    sse = kmeans.inertia_
    print(f"k={k}, SSE={sse:.2f}")


=== ELBOW METHOD (SSE per k) ===
k=1, SSE=269981.28
k=2, SSE=183653.33
k=3, SSE=106348.37
k=4, SSE=73880.64
k=5, SSE=44448.46
k=6, SSE=40825.17
k=7, SSE=33642.58
k=8, SSE=26686.84
k=9, SSE=24766.47
k=10, SSE=23103.12


waxan qadan karnaa K=8 ama 9 sababtoo ah k= 8 iyo k=9 waxa udhexeyo farqiga labad Ee ugu yar Ee SSE oo ah ~1900 sidoo kale k= 9 iyo k=10 waxa udhexeyo farqiga ugu yar Ee SSE oo ah ~1600 
marka aniga waxaa Doortay **k=9** sababtoo ah Silhouette Score Oo udhaw qiimaha 1

In [81]:
K = 9
kmeans = KMeans(n_clusters=K, n_init="auto", random_state=42)
labels = kmeans.fit_predict(X_scaled)

df["Cluster"] = labels.astype(int)
print(df.head())

   Annual Income ($)  Spending Score (1-100)  Cluster
0                 15                      39        8
1                 15                      81        3
2                 16                       6        4
3                 16                      77        3
4                 17                      40        8


In [83]:
sil = silhouette_score(X_scaled, labels)
dbi = davies_bouldin_score(X_scaled, labels)
print(f"\nSilhouette Score: {sil:.4f}")
print(f"Davies-Bouldin Index: {dbi:.4f}")


Silhouette Score: 0.4567
Davies-Bouldin Index: 0.7588


**Cluster centers**

In [72]:
centers_scaled = kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

print("\nCluster centers (original scale):")
centers_df = pd.DataFrame(centers_original, columns=FEATURES)
centers_df.index.name = "Cluster"

print(centers_df.round(2))


Cluster centers (original scale):
         Annual Income ($)  Spending Score (1-100)
Cluster                                           
0                    55.30                   49.52
1                    86.54                   82.13
2                    25.73                   79.36
3                    88.20                   17.11
4                    26.30                   20.91


**Sanity checks**

In [73]:
sample_idx = [9, 30, 100, 110] 
sanity = df.loc[sample_idx, FEATURES + ["Cluster"]]

print("\nSample data points with cluster assignments:")
print(sanity)


Sample data points with cluster assignments:
     Annual Income ($)  Spending Score (1-100)  Cluster
9                   19                      72        2
30                  30                       4        4
100                 62                      41        0
110                 63                      52        0


In [74]:
#save to new CSV
import os
OUTPUT_CSV = "dataset/with_clusters_customers_l9_dataset.csv"
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
df.to_csv(OUTPUT_CSV, index=False)