In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
import warnings
# Silence every warning for the rest of the session: no category, module or
# message filter is given, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides any deprecation or convergence warnings that
# later cells might raise — consider narrowing the filter once the noisy
# warning is identified.
warnings.filterwarnings('ignore')

In [3]:
# Location of the input dataset (update if needed).
CSV_PATH = "spending_l9_dataset.csv"

# Read the CSV into a DataFrame, then report its (rows, columns) shape and
# preview the first few records.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)
print("=== DATA SHAPE ===", df.shape, sep=" ")
print(df.head(5))

=== DATA SHAPE === (200, 8)
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  
0  Female   East  
1    Male  North  
2    Male  South  
3  Female   West  
4    Male   West  


In [7]:
# Columns used as clustering inputs.
FEATURES = ["Income_$", "SpendingScore"]

# Fail fast when the loaded frame lacks any of them; the absent names are
# reported in the same order as they appear in FEATURES.
missing_cols = list(filter(lambda name: name not in df.columns, FEATURES))
if len(missing_cols) > 0:
    raise ValueError(f"Missing expected feature columns in CSV: {missing_cols}")

In [15]:
# Take a copy of the feature columns and replace missing entries in each
# column with that column's median.
X = df[FEATURES].copy()
for feature in FEATURES:
    values = X[feature]
    if not values.isna().any():
        continue  # nothing to impute in this column
    X[feature] = values.fillna(values.median())

print("\n=== FEATURES HEAD ===")
print(X.head())

# Fit the scaler on the imputed features, then apply it to the same data.
# The fitted scaler is kept so later cells can invert the transform.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
print("\nScaled features shape:", X_scaled.shape)


=== FEATURES HEAD ===
   Income_$  SpendingScore
0        33             78
1        25             87
2        24             88
3        25             73
4        23             88

Scaled features shape: (200, 2)


In [16]:
# Number of clusters for the final model (adjust this number based on the
# elbow SSE values).
K = 3
print(f"Using K = {K} for final clustering\n")

# Fit K-Means on the scaled features and obtain one cluster label per row.
# random_state is pinned and 10 initialisations are tried (n_init=10).
kmeans = KMeans(n_clusters=K, n_init=10, random_state=42)
labels = kmeans.fit_predict(X_scaled)

# Rebind df to a new frame that carries the labels as an integer 'Cluster'
# column; the previously bound frame object is not modified.
df = df.assign(Cluster=labels.astype(int))

# Number of rows assigned to each cluster, ordered by cluster id.
print("Cluster counts:\n", df['Cluster'].value_counts().sort_index())

Using K = 3 for final clustering

Cluster counts:
 Cluster
0    89
1    58
2    53
Name: count, dtype: int64


In [17]:
# Both quality scores are only computed when more than one cluster was
# requested; otherwise they are reported as NaN.
if K > 1:
    sil = silhouette_score(X_scaled, labels)
    dbi = davies_bouldin_score(X_scaled, labels)
else:
    sil = np.nan
    dbi = np.nan

print("\n=== METRICS ===")
print(f"Silhouette Score : {sil:.3f} (closer to +1 is better)")
print(f"Davies–Bouldin : {dbi:.3f} (lower is better)")


=== METRICS ===
Silhouette Score : 0.611 (closer to +1 is better)
Davies–Bouldin : 0.584 (lower is better)


In [18]:
# The fitted centroids live in scaled space; invert the scaler to express
# them in the original feature units, then tabulate one row per cluster.
centers_scaled = kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)
centers_df = (
    pd.DataFrame(centers_original, columns=FEATURES)
    .rename_axis('Cluster')  # label the row index with the cluster id
)

print("\n=== CLUSTER CENTERS (Original Units) ===")
print(centers_df.round(2))


=== CLUSTER CENTERS (Original Units) ===
         Income_$  SpendingScore
Cluster                         
0           37.81          71.19
1           32.43          22.45
2           97.75          78.08


In [19]:
# Rows to spot-check. They are looked up with .loc, i.e. by index label, so
# any labels present in df's index can be used here.
sample_idx = [0, 1, 2]

# Show only the clustering features next to the assigned cluster.
sanity = df.loc[sample_idx, [*FEATURES, 'Cluster']]

print("\n=== SANITY CHECK (3 Customers) ===")
print(sanity)


=== SANITY CHECK (3 Customers) ===
   Income_$  SpendingScore  Cluster
0        33             78        0
1        25             87        0
2        24             88        0


In [22]:
# Destination for the labelled dataset.
OUT_PATH = "spending_labeled_clusters.csv"

# Write the frame (including the 'Cluster' column) without the row index.
df.to_csv(path_or_buf=OUT_PATH, index=False)

print(f"\nSaved clustered dataset → {OUT_PATH}")


Saved clustered dataset → spending_labeled_clusters.csv
