In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
# Load the raw customer spending data; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="spending_dataset.csv")

In [2]:
# Quick look at the first five rows to confirm the columns loaded as expected.
print(df.head(n=5))

   CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  OnlinePurchases  \
0           1   28        33             78              14                9   
1           2   21        25             87               8               23   
2           3   23        24             88              13               10   
3           4   24        25             73              16               11   
4           5   20        23             88              17               16   

   Gender Region  
0  Female   East  
1    Male  North  
2    Male  South  
3  Female   West  
4    Male   West  


In [8]:
# Columns used as clustering inputs.
features = ["Income_$", "SpendingScore"]

# Work on a copy so later operations on X cannot write back into df.
X = df.loc[:, features].copy()
print("before scalling :", X.head())

before scalling :    Income_$  SpendingScore
0        33             78
1        25             87
2        24             88
3        25             73
4        23             88


In [9]:
# Standardise the selected features. The fitted scaler is kept so the
# transformation can be inverted later (e.g. for the cluster centres).
scaler = StandardScaler()
scaler.fit(X)
x_Scaled = scaler.transform(X)

# Show only the first ten scaled rows.
print("After Scalling : ", x_Scaled[:10])

After Scalling :  [[-0.62783049  0.72998073]
 [-0.89031514  1.07359091]
 [-0.92312573  1.11176982]
 [-0.89031514  0.53908619]
 [-0.95593631  1.11176982]
 [-0.98874689  0.80633855]
 [-1.18561038  1.41720108]
 [-0.85750456  1.035412  ]
 [-0.95593631  1.4935589 ]
 [-0.75907282  0.84451746]]


In [16]:
# Elbow method: fit K-Means for k = 1..9 on the scaled features and print the
# within-cluster sum of squared errors (inertia_) for each k, so the point
# where the curve flattens can be read off.
print("elbow method (sse per k")
for k in range(1, 10):
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # "auto" in version 1.4 (and warns about it on 1.2/1.3), so leaving it
    # unset makes the reported SSE values depend on the installed version.
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    km.fit(x_Scaled)
    print(f"k={k} -> SSE={km.inertia_:.2f}")
    

elbow method (sse per k
k=1 -> SSE=400.00
k=2 -> SSE=199.70
k=3 -> SSE=79.37
k=4 -> SSE=21.37
k=5 -> SSE=19.09
k=6 -> SSE=15.65
k=7 -> SSE=14.48
k=8 -> SSE=13.81
k=9 -> SSE=12.94




In [22]:
# Final model. k=4 comes from the elbow output: SSE drops sharply up to k=4
# (79.37 at k=3 -> 21.37 at k=4) and only marginally afterwards (19.09 at k=5).
# n_init is pinned explicitly because scikit-learn changed its default from 10
# to "auto" in version 1.4; without it the fit depends on the installed version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = kmeans.fit_predict(x_Scaled)

# Attach the cluster id of every customer to the original frame.
df["cluster"] = labels.astype(int)
print("sample with clusters")
print(df.head(150))

sample with clusters
     CustomerID  Age  Income_$  SpendingScore  VisitsPerMonth  \
0             1   28        33             78              14   
1             2   21        25             87               8   
2             3   23        24             88              13   
3             4   24        25             73              16   
4             5   20        23             88              17   
..          ...  ...       ...            ...             ...   
145         146   59        22             23               2   
146         147   57        31             22               3   
147         148   58        30             19               1   
148         149   59        39             23               3   
149         150   57        26             19               2   

     OnlinePurchases  Gender Region  cluster  
0                  9  Female   East        2  
1                 23    Male  North        2  
2                 10    Male  South        2  
3         



In [25]:
# Internal validation of the clustering on the scaled features:
# silhouette score (higher is better) and Davies-Bouldin index (lower is better).
SIL = silhouette_score(X=x_Scaled, labels=labels)
DBI = davies_bouldin_score(X=x_Scaled, labels=labels)

print(f"silhouette score: {SIL:.3f}")
print(f"Davies Bouldin Index: {DBI:.3f}")

silhouette score: 0.729
Davies Bouldin Index: 0.387


In [26]:
# Undo the scaling: convert the cluster centres from standardised units back
# to the original units of the input features.
centers_scaled = kmeans.cluster_centers_
centers_original = scaler.inverse_transform(centers_scaled)

# One row per cluster, one column per feature.
centers_df = pd.DataFrame(data=centers_original, columns=features)

print("Changed original units :", centers_df.round(2))

Changed original units :    Income_$  SpendingScore
0     56.32          53.58
1     28.92          19.60
2     24.14          83.10
3     99.16          79.24


In [28]:
# Spot-check a few customers: show their clustering inputs next to the
# cluster label they were assigned.
sample_idx = [0, 1, 5]
sanity = df.loc[sample_idx, [*features, "cluster"]]

print("sanity check 3 customers")
print(sanity)

sanity check 3 customers
   Income_$  SpendingScore  cluster
0        33             78        2
1        25             87        2
5        22             80        2


In [29]:
# Persist the frame (including the added "cluster" column) without the row index.
OUTPATH = "Spending_labeled_clustersdata.csv"
df.to_csv(path_or_buf=OUTPATH, index=False)