In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [2]:
# Read the raw spending dataset from CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="spending_l9_dataset.csv")

In [3]:
# Columns used as clustering inputs.
FEATURES = [
    "Age",
    "Income_$",
    "SpendingScore",
]

# Work on an independent copy so later imputation does not modify `df`.
X = df.loc[:, FEATURES].copy()


In [4]:
# Missing values: for every feature column that contains at least one NaN,
# replace the NaNs with that column's median. Columns without NaNs are left
# untouched.
has_missing = X.isna().any()
for col in FEATURES:
    if not has_missing[col]:
        continue
    column_median = X[col].median()
    X[col] = X[col].fillna(column_median)

In [5]:
# Scaling: fit a StandardScaler on the imputed features, then transform them.
# The fitted `scaler` is kept so cluster centers can be mapped back later.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Elbow method: fit K-Means for k = 1..13 on the scaled features and record
# each model's inertia_ (sum of squared distances of samples to their closest
# cluster center). Previously the values were computed and then discarded, so
# the cell produced nothing to read the elbow from.
sse_by_k = {}
for k in range(1, 14):
    km = KMeans(n_clusters=k, n_init="auto", random_state=42)
    km.fit(X_scaled)
    sse_by_k[k] = km.inertia_
    print(f"K = {k}  -> SSE={km.inertia_:.3f}")




In [7]:

# Final model: number of clusters chosen for the segmentation.
k = 4
# Use `k` here instead of repeating the literal, so changing the cluster
# count in one place actually changes the fitted model.
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(X_scaled)

# Attach each row's cluster assignment to the original frame.
df["Cluster"] = labels.astype(int)
print("Type cluster")


Type cluster




In [8]:

# Internal validation metrics computed on the scaled features and the
# K-Means labels.
sil = silhouette_score(X_scaled, labels)      # in [-1, 1]; closer to +1 is better
db = davies_bouldin_score(X_scaled, labels)   # >= 0; lower is better

print(f"\n== silhouette_score: {sil:.3f} closer better +1")
# This line used to be labelled "silhouette_score" and claimed a lower bound
# of -1; the value is the Davies-Bouldin index, whose minimum is 0.
print(f"\n== davies_bouldin_score: {db:.3f} lower better (min 0)")


== silhouette_score: 0.697 closer better +1

== silhouette_score: 0.446 lower better -1 


In [9]:
# Cluster centers: take the fitted centroids (in scaled space) and undo the
# scaling so they can be read in the original feature units.
centers_scaled = kmeans.cluster_centers_
centers_origin = scaler.inverse_transform(centers_scaled)
centers_df = pd.DataFrame(data=centers_origin, columns=FEATURES)

header = "\n=== CLUSTER CENTERS (Original Units) ==="
print(header)
print(centers_df.round(decimals=2))




=== CLUSTER CENTERS (Original Units) ===
     Age  Income_$  SpendingScore
0  33.66     56.32          53.58
1  56.58     28.92          19.60
2  22.38     24.14          83.10
3  37.40     99.16          79.24


In [10]:
# Sanity check: show the feature values and assigned cluster for the rows
# labelled 0, 1 and 2.
smple_index = [0, 1, 2]
sanity_columns = [*FEATURES, "Cluster"]
sanity = df.loc[smple_index, sanity_columns]

print("\n== sanity check 3 index")
print(sanity)


== sanity check 3 index
   Age  Income_$  SpendingScore  Cluster
0   28        33             78        2
1   21        25             87        2
2   23        24             88        2


In [11]:
# Persist the labelled dataset. index=False keeps the default row index out
# of the file; otherwise it is written as an extra unnamed first column that
# shows up as "Unnamed: 0" when the CSV is read back.
df.to_csv("spending_labeled_dataset.csv", index=False)

print("saving data")

saving data
