# In [None]:
import pandas as pd
from test import standardized_df
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import adjusted_rand_score, homogeneity_score, completeness_score, accuracy_score
from scipy.optimize import linear_sum_assignment

# Elbow method: fit one K-Means model per candidate K (1 through 10) on
# standardized_df and keep each fitted model's inertia_ for plotting.
K_range = range(1, 11)
inertias = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(standardized_df).inertia_
    for n_clusters in K_range
]

# Plot inertia against K so the bend in the curve can be inspected.
plt.figure(figsize=(6, 4))
plt.plot(K_range, inertias, marker="o")
plt.xlabel("Number of clusters (K)")
plt.ylabel("Inertia")
plt.title("Elbow Method for K")
plt.show()

# Cluster count used by both final models below.
# NOTE(review): presumably read off the elbow plot above -- confirm.
best_k = 3

# Final K-Means model; y_kmeans holds one cluster label per row.
kmeans = KMeans(n_clusters=best_k, random_state=42)
y_kmeans = kmeans.fit_predict(standardized_df)

# Two-component PCA projection, used only to draw the clusters in 2D
# (the clustering itself is fitted on standardized_df, not on X_pca).
pca = PCA(n_components=2)
X_pca = pca.fit_transform(standardized_df)

# Scatter of the two principal components, coloured by K-Means label.
plt.figure(figsize=(6, 4))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=y_kmeans,
    palette="Set1",
)
plt.title("K-Means Clusters (PCA 2D)")
plt.show()

# Ward-linkage hierarchy over standardized_df, drawn as a dendrogram
# truncated with truncate_mode="level" and p=5 to keep the plot readable.
linkage_matrix = linkage(standardized_df, method="ward")

plt.figure(figsize=(8, 4))
dendrogram(linkage_matrix, p=5, truncate_mode="level")
plt.xlabel("Samples")
plt.ylabel("Distance")
plt.title("Hierarchical Clustering Dendrogram")
plt.show()

# Agglomerative (Ward) clustering cut at best_k clusters; y_hc holds one
# cluster label per row.
agg = AgglomerativeClustering(linkage="ward", n_clusters=best_k)
y_hc = agg.fit_predict(standardized_df)

# Same 2D PCA projection as the K-Means plot, coloured by hierarchical label.
plt.figure(figsize=(6, 4))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=y_hc,
    palette="Set2",
)
plt.title("Hierarchical Clusters (PCA 2D)")
plt.show()

def clustering_accuracy(y_true, y_pred) -> float:
    """Return the accuracy of cluster labels under the best label matching.

    Cluster ids are arbitrary, so each predicted cluster is matched to at
    most one true class (and vice versa) in the way that maximises the number
    of agreeing samples; the returned value is that number divided by the
    sample count.

    The label sets of ``y_true`` and ``y_pred`` are handled independently:
    labels may be any values ``np.unique`` can sort (non-contiguous ints,
    negative ids, strings), and the two sides may have different numbers of
    distinct labels. Samples in clusters left unmatched count as errors.

    Args:
        y_true: 1-D array-like of reference labels.
        y_pred: 1-D array-like of predicted cluster labels, same length.

    Returns:
        Fraction of samples that agree under the optimal matching, in [0, 1].

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    true_arr = np.asarray(y_true).ravel()
    pred_arr = np.asarray(y_pred).ravel()
    if true_arr.size != pred_arr.size:
        raise ValueError(
            "y_true and y_pred must have the same length, got "
            f"{true_arr.size} and {pred_arr.size}"
        )
    if true_arr.size == 0:
        raise ValueError("y_true and y_pred must not be empty")

    # Re-encode each side as dense 0..n-1 codes so arbitrary label values
    # can index the contingency table.
    true_labels, true_codes = np.unique(true_arr, return_inverse=True)
    pred_labels, pred_codes = np.unique(pred_arr, return_inverse=True)

    # contingency[i, j] = number of samples in predicted cluster i whose
    # true class is j. The table is rectangular when the label counts differ.
    contingency = np.zeros((pred_labels.size, true_labels.size), dtype=np.int64)
    np.add.at(contingency, (pred_codes, true_codes), 1)

    # linear_sum_assignment minimises cost, so negate to maximise agreement.
    row_ind, col_ind = linear_sum_assignment(-contingency)

    # The matched cells are exactly the samples counted as correct.
    return float(contingency[row_ind, col_ind].sum() / true_arr.size)


# Label vectors to score, keyed by the name shown in the results table.
models = {
    "KMeans": y_kmeans,
    "Hierarchical": y_hc,
}

# NOTE(review): y_true is derived from the row index modulo best_k, not from
# real class labels -- confirm this is an intentional placeholder; the scores
# below are only meaningful against genuine ground truth.
y_true = standardized_df.index % best_k

# One row of scores per model, comparing its labels against y_true.
results = [
    {
        "Model": model_name,
        "ARI": adjusted_rand_score(y_true, labels),
        "Homogeneity": homogeneity_score(y_true, labels),
        "Completeness": completeness_score(y_true, labels),
        "Accuracy": clustering_accuracy(y_true, labels),
    }
    for model_name, labels in models.items()
]

results_df = pd.DataFrame(results)
print(results_df)