In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, adjusted_rand_score
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the mall dataset from the local CSV export.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
csv_path = 'C:/All Programming/Udamy Ml course/Machine Learning/dataset/mall.csv'
df = pd.read_csv(csv_path)

# Use the columns at positions 3 and 4 as the clustering features.
# NOTE(review): positional selection depends on the CSV column order; confirm which columns these are.
feature_positions = [3, 4]
x = df.iloc[:, feature_positions].to_numpy()

# Standardise each feature (zero mean, unit variance) before clustering.
scaler = StandardScaler()
x_scaled = scaler.fit(x).transform(x)

In [5]:
# Fit k-means with 4 clusters on the standardised features; the fixed
# random_state and n_init keep the labelling reproducible across runs.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans.fit(x_scaled)

# One cluster index per row of x_scaled.
clusters = kmeans.labels_

**Internal Metrics (No Ground Truth)**

```
Silhouette Score: Measures how similar each point is to its own cluster compared with the nearest other cluster
```
- Range: -1 to 1 (higher is better)

In [6]:
# Mean silhouette coefficient over all samples, computed from the scaled
# features and the k-means cluster assignments.
silhouette = silhouette_score(X=x_scaled, labels=clusters)
print(f"Silhouette Score: {silhouette:.3f}")

Silhouette Score: 0.494


```
Calinski-Harabasz Score: Ratio of between-cluster to within-cluster variance
```
- Higher is better

In [7]:
# Calinski-Harabasz index for the k-means assignments on the scaled features.
calinski = calinski_harabasz_score(X=x_scaled, labels=clusters)
print(f"Calinski-Harabasz Score: {calinski:.2f}")

Calinski-Harabasz Score: 174.60


```
Davies-Bouldin Score: Average similarity between each cluster and its most similar cluster (within-cluster scatter relative to between-cluster separation)
```
- Lower is better

In [8]:
# Davies-Bouldin index for the k-means assignments on the scaled features.
davies = davies_bouldin_score(X=x_scaled, labels=clusters)
print(f"Davies-Bouldin Score: {davies:.3f}")

Davies-Bouldin Score: 0.710


**External Metrics (With Ground Truth)**

```
Adjusted Rand Index: Measures similarity between clusters and true labels
```
- Range: -0.5 to 1 (1 = perfect match, values near 0 = random labelling)

In [None]:
# adjusted_rand_score compares two 1-D label vectors (labels_true, labels_pred).
# Passing the 2-D feature matrix x_scaled as labels_true raises a ValueError,
# so a proper reference labelling is required here.
#
# NOTE(review): no ground-truth label column is visible in this notebook, so a
# second k-means run with a different seed is used as a stand-in reference.
# The resulting score therefore measures agreement between two clusterings
# (stability), NOT accuracy against true classes. Replace `reference_labels`
# with the real 1-D ground-truth labels when they are available.
reference_labels = KMeans(
    n_clusters=kmeans.n_clusters, n_init=10, random_state=0
).fit_predict(x_scaled)

# ARI is symmetric and invariant to how the cluster ids are numbered.
ari = adjusted_rand_score(reference_labels, clusters)
print(f"Adjusted Rand Index: {ari:.3f}")