In [1]:
# ===============================
# Customer Segmentation (Clustering)
# - K-Means + Clustering Metrics
# ===============================
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [3]:
# --------------------------------
# 1) Load dataset
# --------------------------------
CSV_PATH = "spending_l9_dataset.csv"
df = pd.read_csv(CSV_PATH)
print("\n=== INITIAL SNAPSHOT ===")
print(df.head())
print("\n=== DATA INFO ===")
print(df.info())
print("\n=== DATA DESCRIPTION ===")
print(df.describe())


=== INITIAL SNAPSHOT ===
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  
0  Female   East  
1    Male  North  
2    Male  South  
3  Female   West  
4    Male   West  

=== DATA INFO ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CustomerID       200 non-null    int64 
 1   Age              200 non-null    int64 
 2   Income_$         200 non-null    int64 
 3   SpendingScore    200 non-n

In [5]:
# --------------------------------
# 2) Select features for clustering
# --------------------------------
Features = ['Age','Income_$', 'SpendingScore']
X = df[Features].copy()
print("\n=== FEATURES SNAPSHOT ===")
print(X.head())


=== FEATURES SNAPSHOT ===
   Age  Income_$  SpendingScore
0   28        33             78
1   21        25             87
2   23        24             88
3   24        25             73
4   20        23             88


In [9]:
# Fill missing numeric values (if any) with median
for column in Features:
    if X[column].isna().any():
        median_value = X[column].median()
        X[column].fillna(median_value)
print("\n=== FEATURES HEAD ===")
print(X.head())        


=== FEATURES HEAD ===
   Age  Income_$  SpendingScore
0   28        33             78
1   21        25             87
2   23        24             88
3   24        25             73
4   20        23             88


In [13]:
# --------------------------------
# 3) Scale features
# --------------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("\nScaled shape:", X_scaled.shape)


Scaled shape: (200, 3)


In [14]:
# --------------------------------
# 4) Elbow method (print SSE)
# --------------------------------
print("\n=== ELBOW METHOD (SSE per k) ===")
for k in range(1, 11):
    km = KMeans(n_clusters=k, n_init="auto", random_state=42)
    km.fit(X_scaled)
    print(f"k={k} → SSE={km.inertia_:.2f}")


=== ELBOW METHOD (SSE per k) ===
k=1 → SSE=600.00
k=2 → SSE=263.82
k=3 → SSE=112.13
k=4 → SSE=36.61
k=5 → SSE=34.30
k=6 → SSE=30.95
k=7 → SSE=27.52
k=8 → SSE=25.15
k=9 → SSE=23.59
k=10 → SSE=21.29


In [29]:
# --------------------------------
# 5) Fit K-Means with chosen k
# --------------------------------
K = 4  # adjust based on SSE values above
kmeans = KMeans(n_clusters=K, n_init="auto", random_state=42)
labels = kmeans.fit_predict(X_scaled)
df["Cluster"] = labels.astype(int)
print("\n=== CLUSTER COUNTS ===")
print(df.head())


=== CLUSTER COUNTS ===
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  Cluster  
0  Female   East        2  
1    Male  North        2  
2    Male  South        2  
3  Female   West        2  
4    Male   West        2  


In [30]:
# --------------------------------
# 6) Evaluate clustering
# --------------------------------
sil = silhouette_score(X_scaled, labels)
dbi = davies_bouldin_score(X_scaled, labels)
print("\n=== METRICS ===")
print(f"Silhouette Score : {sil:.3f} (closer to +1 is better)")
print(f"Davies–Bouldin   : {dbi:.3f} (lower is better)")


=== METRICS ===
Silhouette Score : 0.697 (closer to +1 is better)
Davies–Bouldin   : 0.446 (lower is better)


In [31]:
# --------------------------------
# 7) Cluster centers (back to original units)
# --------------------------------
centers_scaled = kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

centers_df = pd.DataFrame(centers_original, columns=Features)
centers_df.index.name = "Cluster"

print("\n=== CLUSTER CENTERS (Original Units) ===")
print(centers_df.round(2))


=== CLUSTER CENTERS (Original Units) ===
           Age  Income_$  SpendingScore
Cluster                                
0        33.66     56.32          53.58
1        56.58     28.92          19.60
2        22.38     24.14          83.10
3        37.40     99.16          79.24


In [32]:
# --------------------------------
# 8) Sanity checks (3 customers)
# --------------------------------
sample_idx = [0, 1, 2]  # change to any row numbers
sanity = df.loc[sample_idx, Features + ["Cluster"]]
print("\n=== SANITY CHECK (3 Customers) ===")
print(sanity)



=== SANITY CHECK (3 Customers) ===
   Age  Income_$  SpendingScore  Cluster
0   28        33             78        2
1   21        25             87        2
2   23        24             88        2


In [33]:
# --------------------------------
# 9) Save labeled dataset
# --------------------------------
OUT_PATH = "customers_labeled_clusters.csv"
df.to_csv(OUT_PATH, index=False)
print(f"\nSaved clustered dataset → {OUT_PATH}")


Saved clustered dataset → customers_labeled_clusters.csv
