In [9]:

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
import os
import warnings

In [3]:
# Load the customer spending dataset (relative path, resolved against the
# kernel's current working directory).
DATA_PATH = "spending_l9_dataset.csv"
df = pd.read_csv(DATA_PATH)


In [4]:
# Quick look at the raw data: first rows, schema, and per-column null counts.
print("\n=== INITIAL SNAPSHOT ===")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
print(df.isnull().sum())


=== INITIAL SNAPSHOT ===
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  
0  Female   East  
1    Male  North  
2    Male  South  
3  Female   West  
4    Male   West  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CustomerID       200 non-null    int64 
 1   Age              200 non-null    int64 
 2   Income_$         200 non-null    int64 
 3   SpendingScore    200 non-null    int64 
 4   

In [5]:
# Select the columns used for clustering. The selection is copied so that
# later preprocessing of X never touches the original frame.
FEATURES = ["Income_$", "SpendingScore"]
X = df.loc[:, FEATURES].copy()
print("\n=== FEATURES HEAD ===")
print(X.head(5))


=== FEATURES HEAD ===
   Income_$  SpendingScore
0        33             78
1        25             87
2        24             88
3        25             73
4        23             88


In [6]:
# Impute missing feature values with each column's median (numeric only).
# fillna returns a new object instead of modifying X, so the result must be
# assigned back; without the assignment the NaNs would stay in X.
# Columns without missing values are left unchanged.
X = X.fillna(X[FEATURES].median())
         

In [7]:
# Standardize the features with StandardScaler. The fitted scaler is kept so
# that cluster centers can be mapped back to the original units later, and X
# is rebound to the transformed output.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)


In [10]:
# Runtime settings for the KMeans cells that follow.
# NOTE(review): OpenMP runtimes usually read OMP_NUM_THREADS when they are
# first loaded, so on a fresh kernel this cell may need to run before the
# scikit-learn import for the setting to take effect -- confirm.

# Number of cores loky should assume instead of probing the machine.
os.environ["LOKY_MAX_CPU_COUNT"] = "4"   # replace 4 with your CPU's physical cores

# Fix KMeans MKL memory leak warning
os.environ["OMP_NUM_THREADS"] = "1"

# Silence only the known KMeans/MKL notice. A blanket
# warnings.filterwarnings("ignore") would also hide every other warning
# raised anywhere in the kernel (deprecations, convergence problems, ...).
warnings.filterwarnings(
    "ignore",
    message="KMeans is known to have a memory leak",
    category=UserWarning,
)

In [11]:

# Elbow method: fit K-Means for every k from 1 to 10 and print each model's
# SSE (inertia_) so the bend in the curve can be read off the output.
candidate_ks = range(1, 11)
print("\n=== ELBOW METHOD (SSE per k) ===")
for k in candidate_ks:
    # fit() returns the estimator itself, so construction and fitting chain.
    km = KMeans(n_clusters=k, n_init="auto", random_state=42).fit(X)
    print(f"k={k} → SSE={km.inertia_:.2f}")


=== ELBOW METHOD (SSE per k) ===
k=1 → SSE=400.00
k=2 → SSE=199.70
k=3 → SSE=79.37
k=4 → SSE=21.37
k=5 → SSE=19.09
k=6 → SSE=15.65
k=7 → SSE=14.48
k=8 → SSE=13.81
k=9 → SSE=12.94
k=10 → SSE=11.52


In [12]:

# Fit the final K-Means model and attach each row's cluster id to df.
# NOTE(review): the SSE values printed by the elbow cell fall sharply up to
# k=4 (21.37) and only marginally at k=5 (19.09), so K=4 may be the better
# choice -- confirm before relying on the 5-cluster segmentation.
K = 5  # adjust based on SSE values above
kmeans = KMeans(n_clusters=K, n_init="auto", random_state=42).fit(X)
labels = kmeans.labels_  # one cluster id per row of X

df["Cluster"] = labels.astype(int)
print("\n=== SAMPLE WITH CLUSTERS ===")
print(df.head())


=== SAMPLE WITH CLUSTERS ===
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  Cluster  
0  Female   East        2  
1    Male  North        4  
2    Male  South        4  
3  Female   West        2  
4    Male   West        4  


In [13]:
# Score the clustering on the scaled feature matrix with two internal
# metrics computed from X and the assigned labels.
sil, dbi = silhouette_score(X, labels), davies_bouldin_score(X, labels)
print("\n=== METRICS ===")
print(f"Silhouette Score : {sil:.2f} (closer to +1 is better)")
print(f"Davies-Bouldin  : {dbi:.2f} (closer to 0 is better)")


=== METRICS ===
Silhouette Score : 0.64 (closer to +1 is better)
Davies-Bouldin  : 0.57 (closer to 0 is better)


In [14]:

# Cluster centers, converted from the scaled space back to original units.
centers_scaled = kmeans.cluster_centers_
# Undo the standardization so the centers read in the features' own units.
centers_original = scaler.inverse_transform(centers_scaled)

# One row per cluster, one column per feature; the index is the cluster id.
centers_df = pd.DataFrame(data=centers_original, columns=FEATURES)
centers_df.index.name = "Cluster"

print("\n=== CLUSTER CENTERS (Original Units) ===")
centers_rounded = centers_df.round(2)
print(centers_rounded)


=== CLUSTER CENTERS (Original Units) ===
         Income_$  SpendingScore
Cluster                         
0           56.32          53.58
1           28.92          19.60
2           25.33          78.04
3           99.16          79.24
4           22.74          89.04


In [15]:
# Sanity check: show the clustering features next to the assigned cluster
# for a handful of rows. .loc selects by index label.
sample_idx = [0, 1, 2]  # change to any row numbers
sanity_cols = [*FEATURES, "Cluster"]
sanity = df.loc[sample_idx, sanity_cols]
print("\n=== SANITY CHECK (3 Customers) ===")
print(sanity)


=== SANITY CHECK (3 Customers) ===
   Income_$  SpendingScore  Cluster
0        33             78        2
1        25             87        4
2        24             88        4


In [16]:
# Persist the frame, now including the Cluster column, as CSV without
# writing the index as an extra column.
OUT_PATH = "spending_labeled_clusters.csv"
df.to_csv(path_or_buf=OUT_PATH, index=False)
print(f"\nSaved clustered dataset → {OUT_PATH}")


Saved clustered dataset → spending_labeled_clusters.csv
