# Evaluation

We now evaluate the two best models from the model optimization phase, K-Means and Hierarchical Clustering, using internal cluster-quality metrics and a bootstrap stability check.

In [6]:
import numpy as np
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    calinski_harabasz_score,
    adjusted_rand_score,
    normalized_mutual_info_score,
    v_measure_score,
)
from sklearn.utils import resample
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering


# Load the feature matrix to cluster. The file name suggests UMAP-reduced
# features written by an earlier processing step (assumption - confirm).
df = pd.read_csv("../data/processed/X_umap.csv")

In [7]:
def comprehensive_cluster_evaluation(data, labels, external_labels=None):
    """Print and return a set of clustering quality metrics.

    Parameters
    ----------
    data : pandas.DataFrame, numpy.ndarray, list or tuple
        Samples that were clustered, one per row.
    labels : array-like
        Cluster label of each row. The label ``-1`` is treated as noise
        (DBSCAN convention) and is excluded from the internal metrics.
    external_labels : array-like, optional
        Reference labels. When given (and more than one cluster was found),
        ARI, NMI and V-measure against ``labels`` are reported as well.

    Returns
    -------
    dict
        Always contains ``n_clusters`` and ``n_noise`` plus ``silhouette``,
        ``davies_bouldin`` and ``calinski_harabasz``; with fewer than two
        clusters the latter three are the placeholders ``-1``, ``np.inf``
        and ``0``. ``ari``, ``nmi`` and ``v_measure`` are added when the
        external metrics are computed.
    """
    # Element-wise comparisons (labels == -1) only work on arrays; a plain
    # list would compare as a whole and silently report zero noise points.
    labels = np.asarray(labels)

    print("COMPREHENSIVE CLUSTER EVALUATION")
    print("=" * 33)

    # Basic cluster information
    unique_labels = np.unique(labels)
    has_noise = -1 in unique_labels
    n_clusters = len(unique_labels) - (1 if has_noise else 0)
    n_noise = int(np.sum(labels == -1)) if has_noise else 0

    print(f"Clusters found: {n_clusters}")
    print(f"Noise points: {n_noise}")
    print(f"Total samples: {len(labels)}")

    results = {"n_clusters": n_clusters, "n_noise": n_noise}

    # Internal metrics need at least two real clusters.
    if n_clusters > 1:
        print("\nINTERNAL METRICS:")
        print("-" * 16)

        # Noise points belong to no cluster, so drop them before scoring.
        if has_noise:
            non_noise_mask = labels != -1
            # Python sequences cannot be indexed with a boolean mask.
            rows = np.asarray(data) if isinstance(data, (list, tuple)) else data
            eval_data = rows[non_noise_mask]
            eval_labels = labels[non_noise_mask]
        else:
            eval_data = data
            eval_labels = labels

        silhouette = silhouette_score(eval_data, eval_labels)
        davies_bouldin = davies_bouldin_score(eval_data, eval_labels)
        calinski_harabasz = calinski_harabasz_score(eval_data, eval_labels)

        print(f"Silhouette Score: {silhouette:.3f} (higher = better)")
        print(f"Davies-Bouldin Index: {davies_bouldin:.3f} (lower = better)")
        print(f"Calinski-Harabasz Index: {calinski_harabasz:.1f} (higher = better)")

        results.update(
            {
                "silhouette": silhouette,
                "davies_bouldin": davies_bouldin,
                "calinski_harabasz": calinski_harabasz,
            }
        )

        # Verbal rating derived from the silhouette score alone.
        if silhouette > 0.7:
            sil_quality = "Excellent"
        elif silhouette > 0.5:
            sil_quality = "Good"
        elif silhouette > 0.25:
            sil_quality = "Fair"
        else:
            sil_quality = "Poor"
        print(f"Overall quality assessment: {sil_quality}")

    else:
        print("\n⚠️ Insufficient clusters for internal metrics")
        results.update(
            {"silhouette": -1, "davies_bouldin": np.inf, "calinski_harabasz": 0}
        )

    # External metrics (if external labels provided). These are computed on
    # all rows, noise included, exactly as the labels were passed in.
    if external_labels is not None and n_clusters > 1:
        print("\nEXTERNAL METRICS:")
        print("-" * 16)

        ari = adjusted_rand_score(external_labels, labels)
        nmi = normalized_mutual_info_score(external_labels, labels)
        v_measure = v_measure_score(external_labels, labels)

        print(f"Adjusted Rand Index: {ari:.3f} (higher = better)")
        print(f"Normalized Mutual Information: {nmi:.3f} (higher = better)")
        print(f"V-Measure: {v_measure:.3f} (higher = better)")

        results.update({"ari": ari, "nmi": nmi, "v_measure": v_measure})

        # Verbal rating derived from the ARI alone.
        if ari > 0.7:
            ext_quality = "Strong"
        elif ari > 0.4:
            ext_quality = "Moderate"
        else:
            ext_quality = "Weak"
        print(f"External validation: {ext_quality} agreement")

    return results


def stability_evaluation(data, clustering_func, n_trials=20):
    """Evaluate clustering stability via bootstrap resampling.

    The full dataset is clustered once to obtain reference labels. In each
    trial, rows are drawn with replacement, the sample is re-clustered, and
    the Adjusted Rand Index (ARI) between the new labels and the reference
    labels of the *same rows* is recorded.

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        Samples to cluster, one per row.
    clustering_func : callable
        Takes a dataset and returns one cluster label per row.
    n_trials : int, default=20
        Number of bootstrap resamples. The trial number is used as the
        random seed of the resample, so results are repeatable.

    Returns
    -------
    dict or None
        ``mean_stability``, ``std_stability`` and a textual ``assessment``;
        ``None`` when no trial produced a score.
    """
    print(f"\nSTABILITY EVALUATION ({n_trials} trials)")
    print("=" * 30)

    # Reference clustering of the full dataset; an ndarray so it can be
    # indexed with the sampled row positions below.
    reference_labels = np.asarray(clustering_func(data))

    row_positions = np.arange(len(data))
    is_pandas = hasattr(data, "iloc")
    rows = data if is_pandas else np.asarray(data)

    stability_scores = []
    for trial in range(n_trials):
        # Resample row *positions* rather than the data itself, so the rows
        # that were drawn are known regardless of the container type or of
        # any non-default DataFrame index.
        sampled = resample(row_positions, random_state=trial)
        bootstrap_data = rows.iloc[sampled] if is_pandas else rows[sampled]
        bootstrap_labels = clustering_func(bootstrap_data)

        # Skip trials where the clustering did not label every sampled row.
        if len(bootstrap_labels) != len(reference_labels):
            continue

        aligned_reference_labels = reference_labels[sampled]
        stability_scores.append(
            adjusted_rand_score(aligned_reference_labels, bootstrap_labels)
        )

    if not stability_scores:
        print("❌ Stability evaluation failed")
        return None

    mean_stability = np.mean(stability_scores)
    std_stability = np.std(stability_scores)

    print(f"Mean stability (ARI): {mean_stability:.3f} ± {std_stability:.3f}")

    if mean_stability > 0.8:
        stability_assessment = "✅ Highly stable"
    elif mean_stability > 0.6:
        stability_assessment = "⚡ Moderately stable"
    else:
        stability_assessment = "❌ Unstable"

    print(f"Stability assessment: {stability_assessment}")

    return {
        "mean_stability": mean_stability,
        "std_stability": std_stability,
        "assessment": stability_assessment,
    }


def kmean_clustering_func(data, n_clusters=4, random_state=42):
    """Fit K-Means on ``data`` and return one cluster label per row.

    Parameters
    ----------
    data : array-like
        Samples to cluster, one per row.
    n_clusters : int, default=4
        Number of clusters to form.
    random_state : int, default=42
        Seed passed to ``KMeans`` so repeated calls give the same labels.
    """
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    return model.fit_predict(data)


# K-Means labels for the full dataset (scored by the evaluation calls below).
kmean_labels = kmean_clustering_func(df)

def hierarchical_clustering_func(data, n_clusters=4, linkage="average"):
    """Fit agglomerative clustering on ``data`` and return one label per row.

    Parameters
    ----------
    data : array-like
        Samples to cluster, one per row.
    n_clusters : int, default=4
        Number of clusters to form.
    linkage : str, default="average"
        Linkage criterion passed to ``AgglomerativeClustering``.
    """
    model = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters)
    return model.fit_predict(data)


# Hierarchical-clustering labels for the full dataset.
hierarchical_labels = hierarchical_clustering_func(df)


# K-Means: cluster-quality metrics, then bootstrap stability.
kmean_eval_results = comprehensive_cluster_evaluation(df, kmean_labels)
kmean_stability_results = stability_evaluation(df, kmean_clustering_func)

# Hierarchical clustering: the same two evaluations, for comparison.
hierarchical_eval_results = comprehensive_cluster_evaluation(
    df, hierarchical_labels
)
hierarchical_stability_results = stability_evaluation(
    df, hierarchical_clustering_func
)

COMPREHENSIVE CLUSTER EVALUATION
Clusters found: 4
Noise points: 0
Total samples: 185

INTERNAL METRICS:
----------------
Silhouette Score: 0.563 (higher = better)
Davies-Bouldin Index: 0.650 (lower = better)
Calinski-Harabasz Index: 748.0 (higher = better)
Overall quality assessment: Good

STABILITY EVALUATION (20 trials)
Mean stability (ARI): 0.914 ± 0.060
Stability assessment: ✅ Highly stable
COMPREHENSIVE CLUSTER EVALUATION
Clusters found: 4
Noise points: 0
Total samples: 185

INTERNAL METRICS:
----------------
Silhouette Score: 0.565 (higher = better)
Davies-Bouldin Index: 0.676 (lower = better)
Calinski-Harabasz Index: 724.0 (higher = better)
Overall quality assessment: Good

STABILITY EVALUATION (20 trials)
Mean stability (ARI): 0.809 ± 0.095
Stability assessment: ✅ Highly stable
