In [41]:
import warnings 
warnings.filterwarnings("ignore")

In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score


In [43]:
# --------------------------------
# 1) Load dataset
# --------------------------------
# The path is relative, so it is resolved against the current working directory.
CSV_PATH = "customers_l9_dataset.csv"
df = pd.read_csv(filepath_or_buffer=CSV_PATH)
# Banner and first five rows, each on its own line.
print("\n=== INITIAL SNAPSHOT ===", df.head(), sep="\n")



=== INITIAL SNAPSHOT ===
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  
0  Female   East  
1    Male  North  
2    Male  South  
3  Female   West  
4    Male   West  


In [44]:
# Rename for consistency
# Maps the long-form headers to the short names the later cells use.
# rename() leaves labels it cannot find untouched; the head() output printed
# after loading already shows the short names, so this is a no-op for that CSV.
RENAME_MAP = {
    "Annual Income ($)": "Income_$",
    "Spending Score (1-100)": "SpendingScore",
}
df = df.rename(columns=RENAME_MAP)

In [45]:
# 2. Prepare Features
# Only these two columns are used for clustering; all rows are kept.
features = [
    "Income_$",
    "SpendingScore",
]
X = df.loc[:, features]


In [46]:
# Handle missing numeric values (median)
# Each column's gaps are filled with that column's own median; no rows are dropped.
column_medians = X.median()
X = X.fillna(column_medians)

In [47]:
# Scale features
# Fit the scaler on X first, then apply it. The fitted scaler is kept because
# a later cell uses it to map cluster centres back to original units.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [48]:
# 3. Elbow Check (SSE)
# Fit one KMeans model per k in 1..9 and collect each model's inertia_,
# reported here as SSE, so the flattening point of the curve can be read off.
print("=== ELBOW CHECK (SSE) ===")
sse = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    sse.append(kmeans.inertia_)
    print(f"k={k} -> SSE={sse[-1]:.2f}")

=== ELBOW CHECK (SSE) ===
k=1 -> SSE=400.00
k=2 -> SSE=199.70
k=3 -> SSE=77.01
k=4 -> SSE=21.37
k=5 -> SSE=17.93
k=6 -> SSE=15.65
k=7 -> SSE=13.88
k=8 -> SSE=12.45
k=9 -> SSE=11.06


In [None]:
# 4. Chosen K
# k=4 is where the elbow output flattens (SSE 77.01 -> 21.37 -> 17.93).
k = 4
# Interpolate k so the banner reports the value actually used.
print(f"\n=== TRAINING FINAL MODEL WITH K={k} ===")
# Drive the model from k instead of repeating the literal, so changing k
# above is enough to retrain with a different cluster count.
# NOTE: the elbow loop fits with n_init=10 while this fit uses n_init='auto',
# so this model is not guaranteed to match the k=4 elbow run exactly.
kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
# labels holds one cluster id per row of X_scaled, in row order.
labels = kmeans.fit_predict(X_scaled)



=== TRAINING FINAL MODEL WITH K= ===


In [50]:
# Add cluster column
# Stores each row's KMeans label, cast to int, in a "Cluster" column on df.
# The labels are assigned positionally: X was built from df without dropping
# or reordering rows (missing values were filled, not removed), so label i
# belongs to row i of df.
df["Cluster"] = labels.astype(int)

In [51]:
# 5. Evaluate Clustering
# Both metrics are computed on the same scaled matrix the model was fitted on.
sil_score, dbi_score = (
    silhouette_score(X_scaled, labels),
    davies_bouldin_score(X_scaled, labels),
)
print(
    f"\nSilhouette Score : {sil_score:.3f}",
    f"Davies-Bouldin: {dbi_score:.3f}",
    sep="\n",
)



Silhouette Score : 0.729
Davies-Bouldin: 0.387


In [52]:
# 6. Cluster Centers (Original Units)
# The model's centres are in scaled space; undo the scaling with the fitted
# scaler so they can be read in the original feature units.
centers_scaled = kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)
centers_df = pd.DataFrame(data=centers_original, columns=features)
# Row position i is the centre of cluster i, so label the index accordingly.
centers_df.index.name = "Cluster"
print(
    "\n=== CLUSTER CENTERS (Original Units) ===",
    centers_df.round(2),
    sep="\n",
)


=== CLUSTER CENTERS (Original Units) ===
         Income_$  SpendingScore
Cluster                         
0           56.32          53.58
1           28.92          19.60
2           24.14          83.10
3           99.16          79.24


In [53]:
# 7. Sanity Check: Print 3 Sample Rows
# Seed the sampler so the same three rows are shown on every run; this
# mirrors the random_state=42 already used for the KMeans fits and keeps the
# recorded output reproducible under Restart & Run All.
print("\n=== SAMPLE ROWS WITH CLUSTERS ===")
print(df.sample(3, random_state=42)[["Income_$", "SpendingScore", "Cluster"]])


=== SAMPLE ROWS WITH CLUSTERS ===
     Income_$  SpendingScore  Cluster
111        24             26        1
71         59             43        0
140        27             21        1


In [54]:
# 8. Save Output
# Write the labelled frame to disk without the row index, then echo the
# frame as a final visual check.
OUTPUT_CSV = "spending_labeled_clusters.csv"
df.to_csv(path_or_buf=OUTPUT_CSV, index=False)
print(df)

     CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  \
0             1   28        33             78              14   
1             2   21        25             87               8   
2             3   23        24             88              13   
3             4   24        25             73              16   
4             5   20        23             88              17   
..          ...  ...       ...            ...             ...   
195         196   35       112             76              18   
196         197   40       115             87              17   
197         198   38        86             73              15   
198         199   36       105             72              10   
199         200   41       115             95              15   

     OnlinePurchases  Gender Region  Cluster  
0                  9  Female   East        2  
1                 23    Male  North        2  
2                 10    Male  South        2  
3                 11  Female   