In [13]:
# Cap OpenMP at one thread. The variable is set *before* pandas / NumPy /
# scikit-learn are imported so that it is already present in the environment
# when their native thread pools are initialised.
import os

os.environ["OMP_NUM_THREADS"] = "1"  # Limit to 1 thread

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.preprocessing import StandardScaler

# --------------------------------
# 1) Load dataset
# --------------------------------

CSV_PATH = "spending_l9_dataset.csv"
df = pd.read_csv(CSV_PATH)

print("=== INITIAL SNAPSHOT ===")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.isnull().sum())

=== INITIAL SNAPSHOT ===
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  
0  Female   East  
1    Male  North  
2    Male  South  
3  Female   West  
4    Male   West  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CustomerID       200 non-null    int64 
 1   Age              200 non-null    int64 
 2   Income_$         200 non-null    int64 
 3   SpendingScore    200 non-null    int64 
 4   V

In [3]:
# --------------------------------
# 2) Select features for clustering
# --------------------------------
FEATURES = ["SpendingScore", "Income_$"]
X = df[FEATURES].copy()  # copy so imputation never touches the source frame

# Fill missing numeric values (if any) with the column median.
# Every feature column is inspected independently: the loop must not stop at
# the first clean column, otherwise nulls in later columns would be left in
# place and passed on to the scaler.
filled_any = False
for col in FEATURES:
    if X[col].isna().any():
        X[col] = X[col].fillna(X[col].median())
        filled_any = True

# Only report a clean dataset when no column needed imputation.
if not filled_any:
    print("no null values")

print("\n === FEATURES HEAD === ")
print(X.head())
    

no null values

 === FEATURES HEAD === 
   SpendingScore  Income_$
0             78        33
1             87        25
2             88        24
3             73        25
4             88        23


In [4]:
# --------------------------------
# 3) Scale features
# --------------------------------
# Learn the scaling parameters from X, then apply them to the same data.
# `scaller` is kept around because later cells call it to undo the scaling.
scaller = StandardScaler()
scaller.fit(X)
X_scalled = scaller.transform(X)

print("\n Scalled shape:", X_scalled.shape)
print(X_scalled[:5])  # first five scaled rows


 Scalled shape: (200, 2)
[[ 0.72998073 -0.62783049]
 [ 1.07359091 -0.89031514]
 [ 1.11176982 -0.92312573]
 [ 0.53908619 -0.89031514]
 [ 1.11176982 -0.95593631]]


In [14]:
# --------------------------------
# 4) Elbow method (print SSE)
# --------------------------------
# Fit one K-Means model per candidate k (1..10) on the scaled features and
# report its inertia, i.e. the value printed as SSE.

print("\n=== ELBOW METHOD (SSE per k) ===")
for k in range(1, 11):
    km = KMeans(n_clusters=k, n_init="auto", random_state=42).fit(X_scalled)
    print(f"k = {k} SSE = {km.inertia_:.2f}")



=== ELBOW METHOD (SSE per k) ===
k = 1 SSE = 400.00
k = 2 SSE = 199.70
k = 3 SSE = 79.37
k = 4 SSE = 21.37
k = 5 SSE = 19.09
k = 6 SSE = 15.65
k = 7 SSE = 14.48
k = 8 SSE = 13.81
k = 9 SSE = 12.94
k = 10 SSE = 11.52




In [15]:
# --------------------------------
# 5) Fit K-Means with chosen k
# --------------------------------
# NOTE(review): in the recorded elbow output the SSE still falls sharply from
# k=3 (79.37) to k=4 (21.37) and only flattens afterwards - confirm that 3 is
# the intended choice.
K = 3  # adjust based on SSE values above
kmeans = KMeans(n_clusters=K, n_init="auto", random_state=42)
labels = kmeans.fit(X_scalled).labels_

# Attach each row's cluster id to the main frame as an integer column.
df["Cluster"] = labels.astype(int)

print("\n=== SAMPLE WITH CLUSTERS ===")
print(df.head())



=== SAMPLE WITH CLUSTERS ===
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  Cluster  
0  Female   East        2  
1    Male  North        2  
2    Male  South        2  
3  Female   West        2  
4    Male   West        2  




In [8]:
# --------------------------------
# 6) Evaluate clustering
# --------------------------------
# Both scores are computed on the scaled features with the fitted labels.
sil, dbi = (
    silhouette_score(X_scalled, labels),
    davies_bouldin_score(X_scalled, labels),
)

print("\n=== METRICS ===")
print(f"Silhouette Score : {sil:.3f} (closer to +1 is better)")
print(f"Davies–Bouldin   : {dbi:.3f} (lower is better)")


=== METRICS ===
Silhouette Score : 0.607 (closer to +1 is better)
Davies–Bouldin   : 0.554 (lower is better)


In [9]:
# --------------------------------
# 7) Cluster centers (back to original units)
# --------------------------------
# The fitted centers live in scaled space; run them back through the scaler
# so they can be read in the same units as the input feature columns.
centers_scaled = kmeans.cluster_centers_
centers_original = scaller.inverse_transform(centers_scaled)

# One row per cluster, one column per feature, index labelled "Cluster".
centers_df = pd.DataFrame(centers_original, columns=FEATURES).rename_axis("Cluster")

print("\n=== CLUSTER CENTERS (Original Units) ===")
print(centers_df.round(2))


=== CLUSTER CENTERS (Original Units) ===
         SpendingScore  Income_$
Cluster                         
0                76.72     95.51
1                31.85     39.32
2                80.14     28.47


In [11]:
# --------------------------------
# 8) Sanity checks (3 customers)
# --------------------------------
# Show the clustering features next to the assigned cluster for a few rows.
sample_idx = [0, 1, 2]  # change to any row numbers
sanity = df.loc[sample_idx, [*FEATURES, "Cluster"]]

print("\n=== SANITY CHECK (3 Customers) ===")
print(sanity)


=== SANITY CHECK (3 Customers) ===
   SpendingScore  Income_$  Cluster
0             78        33        2
1             87        25        2
2             88        24        2


In [12]:
# --------------------------------
# 9) Save labeled dataset
# --------------------------------
# Write the frame, including the "Cluster" column, without the row index.
OUT_PATH = "spending_labeled_clusters.csv"
df.to_csv(path_or_buf=OUT_PATH, index=False)

print(f"\nSaved clustered dataset → {OUT_PATH}")


Saved clustered dataset → spending_labeled_clusters.csv
