In [None]:
# Load the PCA features (cluster on PCs)

from pathlib import Path
import pandas as pd

# Data locations are resolved relative to the kernel's current working
# directory, i.e. this assumes the notebook is run from the project root.
PROJECT = Path.cwd()
PROCESSED = PROJECT.joinpath("data", "processed")

# Read the PCA feature table; the first CSV column becomes the row index.
pcs = pd.read_csv(PROCESSED.joinpath("pca_20pcs.csv"), index_col=0)
print("pcs:", pcs.shape)



In [None]:
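# Run k-means many times and compare each run to a reference run.
# Agreement is measured with the Adjusted Rand Index (ARI), where:
#   ARI ≈ 1.0 -> the two clusterings are (nearly) identical
#   ARI ≈ 0.0 -> agreement is no better than chance
#   ARI < 0   -> agreement is worse than chance (can happen)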


from statistics import median

from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

k = 3  # number of clusters whose stability is being checked

# Reference clustering: every later run is scored against these labels.
ref = KMeans(n_clusters=k, n_init=25, random_state=1).fit_predict(pcs)

# Re-fit k-means with a different random seed each time and record how well
# the resulting partition agrees with the reference. ARI compares partitions,
# so it does not matter that cluster ids may be numbered differently per run.
aris = []
for seed in range(2, 22):  # 20 runs (seeds 2..21; seed 1 is the reference)
    labels = KMeans(n_clusters=k, n_init=25, random_state=seed).fit_predict(pcs)
    aris.append(adjusted_rand_score(ref, labels))

print("ARI vs reference (20 runs):")
print(" min:", min(aris))
# statistics.median averages the two middle values when the number of runs is
# even; indexing sorted(aris)[len(aris)//2] would report only the upper one.
print(" median:", median(aris))
print(" max:", max(aris))

# Observed in a previous run (re-check these numbers after re-running the cell):
#   min ARI    ≈ 0.53 (some runs give quite different clusters)
#   median ARI ≈ 0.86 (runs are often similar to the reference)
#   max ARI    ≈ 0.93 (some runs are very similar to the reference)

# So the “subtypes” are partly sensitive to the random initialisation,
# which usually means k-means is struggling because the structure isn’t
# strongly separated (or k isn’t ideal).

# Next step: keep the method the same, but test a few values of k and see
# which one gives the most stable clustering.




In [None]:
# Show the full list of ARI scores (one per re-run, each compared against the
# reference clustering) so the spread behind the min/median/max is visible.
aris

