In [1]:
# Imports and Setup
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [2]:
# Data Loading
# Read the spending dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("spending_l9_dataset.csv")
# Preview the first rows to confirm the columns parsed as expected.
print("Initial snapshot of the dataset:", df.head(), sep="\n")

Initial snapshot of the dataset:
   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  
0  Female   East  
1    Male  North  
2    Male  South  
3  Female   West  
4    Male   West  


In [3]:
# Feature Selection and Missing Value Handling
# Only these two columns are used as clustering inputs.
FEATURES = ["Income_$", "SpendingScore"]
X = df[FEATURES].copy()  # copy so the imputation below never touches df
print("\nFeature information for clustering:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appended a stray "None" line.
X.info()

# Replace any missing value with the median of its column.
X = X.fillna(X.median(numeric_only=True))
print("\nFirst few rows of selected features:")
print(X.head())


Feature information for clustering:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Income_$       200 non-null    int64
 1   SpendingScore  200 non-null    int64
dtypes: int64(2)
memory usage: 3.3 KB
None

First few rows of selected features:
   Income_$  SpendingScore
0        33             78
1        25             87
2        24             88
3        25             73
4        23             88


In [4]:
# Feature Scaling
# Fit the scaler on X and transform X in one step. The fitted `scaler` object is
# kept because a later cell calls scaler.inverse_transform() on the centroids.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Shape check: the scaled matrix should have one row per customer and one column per feature.
print("\nShape of scaled features:", X_scaled.shape)


Shape of scaled features: (200, 2)


In [5]:
# Elbow Method (SSE for k=1..10)
def print_sse(X_scaled, max_k=10):
    """Print and collect the K-Means SSE (inertia) for k = 1 .. max_k.

    Args:
        X_scaled: Feature matrix handed unchanged to ``KMeans.fit``.
        max_k: Largest cluster count to try, inclusive (default 10).

    Returns:
        list: The ``inertia_`` of each fitted model, in increasing order of k.
    """
    print("\nSSE values for different k (Elbow Method):")
    inertias = []
    for n_clusters in range(1, max_k + 1):
        # random_state is pinned to 42 for every model in the sweep.
        model = KMeans(n_clusters=n_clusters, n_init="auto", random_state=42)
        model.fit(X_scaled)
        inertia = model.inertia_
        print(f"k={n_clusters} → SSE={inertia:.2f}")
        inertias.append(inertia)
    return inertias

# Run the sweep on the scaled features with the default max_k; print_sse prints
# each SSE and the returned per-k list is kept in sse_list.
sse_list = print_sse(X_scaled)


SSE values for different k (Elbow Method):
k=1 → SSE=400.00
k=2 → SSE=199.70
k=3 → SSE=79.37
k=4 → SSE=21.37
k=5 → SSE=19.09
k=6 → SSE=15.65
k=7 → SSE=14.48
k=8 → SSE=13.81
k=9 → SSE=12.94
k=10 → SSE=11.52


In [7]:
# Fit K-Means with Chosen k and Assign Clusters
K = 4  # presumably picked from the elbow sweep output -- confirm
kmeans = KMeans(n_clusters=K, n_init="auto", random_state=42)
kmeans.fit(X_scaled)
labels = kmeans.labels_  # the same label array fit_predict() hands back
# Attach each row's cluster id to the main frame as an integer column.
df["Cluster"] = labels.astype(int)
print("\nSample of customers with their assigned clusters:", df.head(30), sep="\n")


Sample of customers with their assigned clusters:
    CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0            1   28        33             78              14                9   
1            2   21        25             87               8               23   
2            3   23        24             88              13               10   
3            4   24        25             73              16               11   
4            5   20        23             88              17               16   
5            6   23        22             80              19               21   
6            7   22        16             96               9               19   
7            8   18        26             86              16               22   
8            9   26        23             98              17               15   
9           10   24        29             81              16               17   
10          11   21        22             82              

In [8]:
# Print Cluster Sizes
# Count rows per cluster label, ordered by label rather than by frequency.
cluster_sizes = df["Cluster"].value_counts().sort_index()
print("\nNumber of customers in each cluster:", cluster_sizes, sep="\n")


Number of customers in each cluster:
Cluster
0    50
1    50
2    50
3    50
Name: count, dtype: int64


In [9]:
# Evaluate Clustering
# Both metrics are computed on the scaled features with the fitted labels.
db_score = davies_bouldin_score(X_scaled, labels)
sil_score = silhouette_score(X_scaled, labels)
print(
    f"\nSilhouette Score: {sil_score:.4f} (higher is better)",
    f"Davies-Bouldin Score: {db_score:.4f} (lower is better)",
    sep="\n",
)


Silhouette Score: 0.7286 (higher is better)
Davies-Bouldin Score: 0.3871 (lower is better)


In [10]:
# Cluster Centers (Original Units)
# Centroids live in the scaled space the model was fitted on.
centers_scaled = kmeans.cluster_centers_
# Map them back through the fitted scaler so they read in the input feature units.
centers_original = scaler.inverse_transform(centers_scaled)
# One row per cluster, one column per clustering feature.
centers_df = pd.DataFrame(data=centers_original, columns=FEATURES)
centers_df.index.name = "Cluster"
print("\nCluster centers in original units:", centers_df.round(2), sep="\n")


Cluster centers in original units:
         Income_$  SpendingScore
Cluster                         
0           56.32          53.58
1           28.92          19.60
2           24.14          83.10
3           99.16          79.24


In [11]:
# Average Feature Values per Cluster (Summary)
# Means are taken from the df columns (not the imputed copy X), grouped by the
# assigned cluster label and rounded to two decimals.
cluster_means = df.groupby("Cluster")[FEATURES].mean().round(2)
print("\nAverage feature values per cluster:", cluster_means, sep="\n")


Average feature values per cluster:
         Income_$  SpendingScore
Cluster                         
0           56.32          53.58
1           28.92          19.60
2           24.14          83.10
3           99.16          79.24


In [12]:
# Sanity Check (Random 3 Samples)
print("\nRandomly selected customers and their clusters:")
# Seed the draw (same seed value the K-Means models use) so a fresh
# Restart & Run All shows the same three rows every time.
print(df.sample(3, random_state=42)[FEATURES + ["Cluster"]])


Randomly selected customers and their clusters:
     Income_$  SpendingScore  Cluster
198       105             72        3
8          23             98        2
55         56             55        0


In [13]:
# Save Labeled Dataset
# Single source of truth for the output location, so the confirmation message
# can never drift from the file that was actually written.
OUTPUT_CSV = "spending_labeled_clusters.csv"
df.to_csv(OUTPUT_CSV, index=False)  # index=False: do not write the row index
print(f"Saved clustered dataset to {OUTPUT_CSV}")

Saved clustered dataset to spending_labeled_clusters.csv
