In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [3]:
# 1) Load Dataset
# Read the customer spending CSV (path is relative to the working directory)
# and preview the first ten rows.
DATA_PATH = 'spending_l9_dataset.csv'
df = pd.read_csv(DATA_PATH)
print(df.head(10))

   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   
5           6   23        22             80              19               21   
6           7   22        16             96               9               19   
7           8   18        26             86              16               22   
8           9   26        23             98              17               15   
9          10   24        29             81              16               17   

   Gender Region  
0  Female   East  
1    Male  North  
2    Male  South  
3  Female   West  
4    Male   West  
5  Fe

In [4]:
# 2) Select features for clustering
# Only these two columns are clustered on; take a copy so later changes
# to `df` do not affect the feature matrix.
Features = ['Income_$', 'SpendingScore']
X = df.loc[:, Features].copy()

print(X.head(10))

   Income_$  SpendingScore
0        33             78
1        25             87
2        24             88
3        25             73
4        23             88
5        22             80
6        16             96
7        26             86
8        23             98
9        29             81


In [5]:
# 3) Scale features
# Learn the per-feature scaling parameters first, then apply them; the fitted
# `scaler` is kept so cluster centers can be mapped back to original units later.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print("\nScaled shape:", X_scaled.shape)


Scaled shape: (200, 2)


In [6]:
# 4) Elbow method (print SSE)
# Fit one K-Means model per candidate cluster count (1 through 10) on the
# scaled features and record its inertia, which is reported below as SSE.
candidate_ks = range(1, 11)
sse_by_k = {
    n_clusters: KMeans(n_clusters=n_clusters, n_init='auto', random_state=42)
    .fit(X_scaled)
    .inertia_
    for n_clusters in candidate_ks
}

for n_clusters, sse in sse_by_k.items():
    print(f'K={n_clusters} -> SSE={sse:.2f}')

K=1 -> SSE=400.00
K=2 -> SSE=199.70
K=3 -> SSE=79.37
K=4 -> SSE=21.37
K=5 -> SSE=19.09
K=6 -> SSE=15.65
K=7 -> SSE=14.48
K=8 -> SSE=13.81
K=9 -> SSE=12.94
K=10 -> SSE=11.52


In [7]:
# 5) Fit K-Means with chosen k

# k = 4 is taken from the elbow printout: SSE falls sharply up to k = 4
# and only marginally for larger k.
k = 4
kmeans = KMeans(n_clusters=k, n_init='auto', random_state=42).fit(X_scaled)
label = kmeans.labels_  # cluster index assigned to each row of X_scaled

# Attach the assignment to the full customer table as a new column.
df['Cluster'] = label.astype(int)
print("\nClustered DataFrame:\n", df.head(200))


Clustered DataFrame:
      CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  \
0             1   28        33             78              14   
1             2   21        25             87               8   
2             3   23        24             88              13   
3             4   24        25             73              16   
4             5   20        23             88              17   
..          ...  ...       ...            ...             ...   
195         196   35       112             76              18   
196         197   40       115             87              17   
197         198   38        86             73              15   
198         199   36       105             72              10   
199         200   41       115             95              15   

     OnlinePurchases  Gender Region  Cluster  
0                  9  Female   East        2  
1                 23    Male  North        2  
2                 10    Male  South        2  
3       

In [8]:
# 6) Evaluate clustering
# Both metrics are computed on the scaled features with the labels from the
# fitted model (silhouette: higher is better; Davies-Bouldin: lower is better).
sil_score, db_score = (
    metric(X_scaled, label)
    for metric in (silhouette_score, davies_bouldin_score)
)

print(f'\nSilhouette Score: {sil_score:.3f}')
print(f'Davies-Bouldin Score: {db_score:.3f}')


Silhouette Score: 0.729
Davies-Bouldin Score: 0.387


In [9]:
# 7) Cluster centers (back to original units)
# The centroids are expressed in scaled space; invert the scaler so each
# center can be read in the original feature units.
centers = scaler.inverse_transform(kmeans.cluster_centers_)
centers_df = pd.DataFrame(data=centers, columns=Features)
print("\nOriginal Cluster Centers:\n", centers_df.round(2))


Original Cluster Centers:
    Income_$  SpendingScore
0     56.32          53.58
1     28.92          19.60
2     24.14          83.10
3     99.16          79.24


In [10]:
# 8) Sanity checks (3 customers)

# Pull three rows by index label and show their clustering features next to
# the assigned cluster, for comparison against the cluster centers.
sample_indexes = [89, 2, 150]
sanity_columns = [*Features, 'Cluster']
sanity_checks = df.loc[sample_indexes, sanity_columns]
print("\nSanity Checks:\n", sanity_checks)


Sanity Checks:
      Income_$  SpendingScore  Cluster
89         56             54        0
2          24             88        2
150       100             76        3


In [11]:
# 9) Save labeled dataset

# Write the customer table, including the new 'Cluster' column, to CSV in the
# working directory. The file name must not start with a space: a leading
# space becomes part of the on-disk name and makes the file awkward to open
# from a shell or another script.
OUT_PATH = "customer_spending_segmentation_label.csv"
df.to_csv(OUT_PATH, index=False)  # index=False: omit the row index column
print(f"\nSaved clustered dataset → {OUT_PATH}")


Saved clustered dataset →  customer_spending_segmentation_label.csv
