In [34]:
import os
os.environ["OMP_NUM_THREADS"] = "1"
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score,davies_bouldin_score

In [35]:
# 1. Load the customer spending dataset
DATASET_PATH = "spending_l9_dataset.csv"
df = pd.read_csv(DATASET_PATH)
df.head(5)  # preview the first five rows

Unnamed: 0,CustomerID,Age,Income_$,SpendingScore,VisitsPerMonth,OnlinePurchases,Gender,Region
0,1,28,33,78,14,9,Female,East
1,2,21,25,87,8,23,Male,North
2,3,23,24,88,13,10,Male,South
3,4,24,25,73,16,11,Female,West
4,5,20,23,88,17,16,Male,West


In [126]:
# 2. Pick the columns that drive the clustering
FEATURES = ["Income_$", "SpendingScore"]
X = df.loc[:, FEATURES].copy()  # copy so the imputation below never touches df

# Impute gaps: only columns that actually contain NaN are rewritten,
# each one filled with its own median.
incomplete_columns = [name for name in FEATURES if X[name].isna().any()]
for name in incomplete_columns:
    X[name] = X[name].fillna(X[name].median())

print("--- Before Scaling the Features ---")
X.head()


--- Before Scaling the Features ---


Unnamed: 0,Income_$,SpendingScore
0,33,78
1,25,87
2,24,88
3,25,73
4,23,88


In [37]:
# 3. Feature scaling
# Standardize the selected features with StandardScaler before clustering;
# the fitted scaler is kept so results can be mapped back to original units.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Wrap the scaled array in a DataFrame for display. Column labels and the row
# index are taken from X itself instead of being typed out again, so this
# preview cannot drift out of sync when the selected features change.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
print("--- After Scaling the Features ---")
X_scaled_df.head()

--- After Scaling the Features ---


Unnamed: 0,Income_$,SpendingScore
0,-0.62783,0.729981
1,-0.890315,1.073591
2,-0.923126,1.11177
3,-0.890315,0.539086
4,-0.955936,1.11177


In [38]:
# 4. Choosing the number of clusters for K-Means (elbow method)
def get_kmeans_sse(scaled_data, k_values=range(1, 11), random_state=42):
    """Fit K-Means once per candidate k and report the SSE (inertia) of each fit.

    Parameters
    ----------
    scaled_data : array-like
        Data to cluster; passed unchanged to ``KMeans.fit``.
    k_values : iterable of int, default ``range(1, 11)``
        Cluster counts to try, in iteration order.
    random_state : int, default 42
        Seed handed to every ``KMeans`` instance.

    Returns
    -------
    dict
        Maps each k to the ``inertia_`` of the model fitted with that k, so
        the curve can be plotted or inspected instead of only being printed.
    """
    print("--- Sum of (SSE) for each value of k ---")
    sse_by_k = {}
    for k in k_values:
        km = KMeans(n_clusters=k, random_state=random_state)
        km.fit(scaled_data)
        sse_by_k[k] = km.inertia_
        print(f"k = {k} -> SSE = {km.inertia_:.2f}")
    return sse_by_k


# Keep the curve in a variable; assigning it also stops the notebook from
# echoing the returned dict underneath the printed table.
sse_by_k = get_kmeans_sse(X_scaled)

--- Sum of (SSE) for each value of k ---
k = 1 -> SSE = 400.00
k = 2 -> SSE = 199.70
k = 3 -> SSE = 79.37
k = 4 -> SSE = 21.37
k = 5 -> SSE = 19.09
k = 6 -> SSE = 15.65
k = 7 -> SSE = 14.48
k = 8 -> SSE = 13.81
k = 9 -> SSE = 12.94
k = 10 -> SSE = 11.52


In [50]:
# 5. Train the final model
# Fit K-Means with four clusters, then read the per-row assignments back from
# the fitted estimator.
kmeans = KMeans(n_clusters=4, random_state=42).fit(X_scaled)
labels = kmeans.labels_

# Attach the assignments to the customer table as an integer column.
df["Cluster"] = labels.astype(int)
df.head()
# Optional check of cluster sizes: print(df["Cluster"].value_counts())

Unnamed: 0,CustomerID,Age,Income_$,SpendingScore,VisitsPerMonth,OnlinePurchases,Gender,Region,Cluster
0,1,28,33,78,14,9,Female,East,2
1,2,21,25,87,8,23,Male,North,2
2,3,23,24,88,13,10,Male,South,2
3,4,24,25,73,16,11,Female,West,2
4,5,20,23,88,17,16,Male,West,2


In [52]:
# 6. Evaluate the clustering
# Both metrics are computed on the scaled features the model was fitted on.
sil_score = silhouette_score(X_scaled, labels)
db_score = davies_bouldin_score(X_scaled, labels)

report_lines = (
    f"Silhouette Score: {sil_score:.2f} (closer to +1 is better)",
    f"Davies–Bouldin:   {db_score:.2f} (lower is better)",
)
print(*report_lines, sep="\n")

Silhouette Score: 0.73 (closer to +1 is better)
Davies–Bouldin:   0.39 (lower is better)


In [None]:
# 7. Cluster centers, converted back to the original feature units
center_scaled = kmeans.cluster_centers_
# Undo the scaler's transform so the centers are expressed like the raw columns.
centers_original = scaler.inverse_transform(center_scaled)

print("=== CLUSTER CENTERS (Original Units) ===")
# One row per cluster; the index is labelled "Cluster" for display.
center_df = pd.DataFrame(centers_original, columns=FEATURES).rename_axis("Cluster")
center_df.head(10)

=== CLUSTER CENTERS (Original Units) ===


Unnamed: 0_level_0,Income_$,SpendingScore
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,56.32,53.58
1,28.92,19.6
2,24.14,83.1
3,99.16,79.24


In [120]:
# 8. Sanity check: look at three customers next to their assigned cluster
# A fixed random_state makes the same rows come back on every run, so the
# printed sample stays reproducible after a kernel restart.
sanity_check_sample = df.sample(n=3, random_state=42)[FEATURES + ["Cluster"]]

print("--- Sanity Check: 3 Random Customers ---")
print(sanity_check_sample)

--- Sanity Check: 3 Random Customers ---
     Income_$  SpendingScore  Cluster
124        23             23        1
49         21             72        2
129        23             28        1


In [124]:
# 9. Save the labeled dataset
# The output path is defined once and reused in the confirmation message, so
# the message always names the file that was actually written.
OUTPUT_PATH = "spending_labeled_clusters.csv"
df.to_csv(OUTPUT_PATH, index=False)  # index=False: do not write the row index as a column
print(f"Saved clustered dataset → {OUTPUT_PATH}")

Saved clustered dataset → spending_labeled_clusters.csv
