# 05 - Unsupervised Learning - Clustering

This notebook covers:
1. Loading preprocessed data
2. K-Means Clustering with elbow method
3. Hierarchical Clustering with dendrogram
4. Cluster analysis and visualization
5. Comparing clusters with actual disease labels


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, adjusted_rand_score
from scipy.cluster.hierarchy import dendrogram, linkage
import joblib
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and numerical
# warnings raised by the libraries used below - confirm this is intended.
warnings.filterwarnings('ignore')

# Set style for better plots
# 'seaborn-v0_8' is passed to matplotlib as a style name; "husl" is passed to
# seaborn as the default colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Quick confirmation that the import/config cell ran to completion
print("Libraries imported successfully!")


In [None]:
# Load preprocessed data
# NOTE: joblib.load unpickles arbitrary Python objects, so only point these
# paths at files you trust.
print("Loading preprocessed data...")
X_train = joblib.load('../data/X_train.pkl')
X_test = joblib.load('../data/X_test.pkl')
y_train = joblib.load('../data/y_train.pkl')
y_test = joblib.load('../data/y_test.pkl')

# Features and targets must line up row-for-row before they are stacked,
# because everything downstream pairs them by position.
assert len(X_train) == len(y_train), "X_train and y_train have different lengths"
assert len(X_test) == len(y_test), "X_test and y_test have different lengths"

# Combine train and test for clustering
# ignore_index=True gives both objects a fresh 0..n-1 index, so row labels are
# unique and X/y share the same index even if the two splits were saved with
# overlapping (e.g. reset) indices.
X_combined = pd.concat([X_train, X_test], axis=0, ignore_index=True)
y_combined = pd.concat([y_train, y_test], axis=0, ignore_index=True)
assert len(X_combined) == len(y_combined), "combined features and targets are misaligned"

print(f"Combined data shape: {X_combined.shape}")
print(f"Target distribution: {y_combined.value_counts().to_dict()}")

# Display first few rows
print("\nFirst 5 rows of combined data:")
print(X_combined.head())


In [None]:
# K-Means Clustering with Elbow Method
print("K-Means Clustering Analysis")
print("=" * 40)

# Sweep the candidate cluster counts, recording for each k the inertia
# (drawn as the elbow curve) and the silhouette score (used to pick k).
inertias = []
silhouette_scores = []
K_range = range(2, 11)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    candidate_labels = kmeans.fit_predict(X_combined)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_combined, candidate_labels))

# Find optimal k based on silhouette score
optimal_k = K_range[np.argmax(silhouette_scores)]

# Plot elbow method and silhouette scores side by side.
# Exactly two panels are drawn, so the grid is 1x2 (a 1x3 grid would leave an
# empty slot), and the figure is laid out and shown explicitly.
fig, (elbow_ax, silhouette_ax) = plt.subplots(1, 2, figsize=(15, 5))

elbow_ax.plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
elbow_ax.set_xlabel('Number of Clusters (k)')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method for Optimal k')
elbow_ax.grid(True, alpha=0.3)

silhouette_ax.plot(K_range, silhouette_scores, 'ro-', linewidth=2, markersize=8)
silhouette_ax.set_xlabel('Number of Clusters (k)')
silhouette_ax.set_ylabel('Silhouette Score')
silhouette_ax.set_title('Silhouette Score vs Number of Clusters')
silhouette_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

print(f"Optimal number of clusters based on silhouette score: {optimal_k}")
print(f"Best silhouette score: {max(silhouette_scores):.4f}")

# Apply K-Means with optimal k
# Same random_state and n_init as in the sweep, so this reproduces the
# clustering that achieved the best silhouette score.
kmeans_optimal = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmeans_labels = kmeans_optimal.fit_predict(X_combined)

print(f"\nK-Means clustering completed with {optimal_k} clusters")
print(f"Cluster distribution: {np.bincount(kmeans_labels)}")


In [None]:
# Hierarchical Clustering
print("\nHierarchical Clustering Analysis", "=" * 40, sep="\n")

# Ward linkage over all samples; kept for the dendrogram below
linkage_matrix = linkage(X_combined, method='ward')

# Draw the dendrogram, truncated to the top 5 levels of the tree
dendro_fig, dendro_ax = plt.subplots(figsize=(15, 8))
dendrogram(linkage_matrix, truncate_mode='level', p=5, ax=dendro_ax)
dendro_ax.set_title('Hierarchical Clustering Dendrogram')
dendro_ax.set_xlabel('Sample Index')
dendro_ax.set_ylabel('Distance')
plt.show()

# Cut into the same number of clusters that was selected for K-Means
hierarchical = AgglomerativeClustering(n_clusters=optimal_k, linkage='ward')
hierarchical_labels = hierarchical.fit_predict(X_combined)

print(
    f"Hierarchical clustering completed with {optimal_k} clusters",
    f"Cluster distribution: {np.bincount(hierarchical_labels)}",
    sep="\n",
)

# Silhouette score of each method's final labelling, computed on the same data
kmeans_silhouette, hierarchical_silhouette = (
    silhouette_score(X_combined, assignment)
    for assignment in (kmeans_labels, hierarchical_labels)
)

print(
    "\nSilhouette Scores:",
    f"K-Means: {kmeans_silhouette:.4f}",
    f"Hierarchical: {hierarchical_silhouette:.4f}",
    sep="\n",
)


In [None]:
# Cluster Analysis and Visualization
# Section banner: a blank line, the title, then a 40-character rule
print("\nCluster Analysis", "=" * 40, sep="\n")

# Compare clusters with actual disease labels
def analyze_clusters(labels, true_labels, method_name):
    """Compare a clustering against known labels and print a short report.

    The two label vectors are paired by position (row order), never by pandas
    index, and the clusters analysed are the ones actually present in
    ``labels`` - the function does not depend on any notebook-level variable.

    Parameters
    ----------
    labels : array-like of shape (n_samples,)
        Cluster id assigned to each sample.
    true_labels : array-like of shape (n_samples,)
        Reference label of each sample, in the same row order as ``labels``.
        If it is a pandas Series/DataFrame, its index is reused as the index
        of the returned ``cluster_df``.
    method_name : str
        Name of the clustering method; used only in the printed heading.

    Returns
    -------
    tuple
        ``(cluster_df, cross_tab, avg_purity, ari)``:
        ``cluster_df`` has columns ``'Cluster'`` and ``'True_Label'``;
        ``cross_tab`` is the cluster-by-label count table including the
        ``'All'`` margins; ``avg_purity`` is the unweighted mean over clusters
        of the share of samples carrying the cluster's most common label;
        ``ari`` is the adjusted Rand index between the two labellings.

    Raises
    ------
    ValueError
        If ``labels`` and ``true_labels`` do not have the same length.
    """
    # Flatten to plain arrays so that two pandas objects with different
    # indices are still matched row-by-row instead of being index-aligned.
    cluster_ids = np.asarray(labels).ravel()
    truth = np.asarray(true_labels).ravel()
    if len(cluster_ids) != len(truth):
        raise ValueError(
            f"labels has {len(cluster_ids)} entries but true_labels has {len(truth)}"
        )

    print(f"\n{method_name} Cluster Analysis:")

    # Keep the caller's sample index (when there is one) on the result frame
    if isinstance(true_labels, (pd.Series, pd.DataFrame)):
        row_index = true_labels.index
    else:
        row_index = None
    cluster_df = pd.DataFrame(
        {'Cluster': cluster_ids, 'True_Label': truth},
        index=row_index,
    )

    # Create cross-tabulation
    cross_tab = pd.crosstab(cluster_df['Cluster'], cluster_df['True_Label'], margins=True)
    print("Cluster vs True Label Cross-tabulation:")
    print(cross_tab)

    # Calculate cluster purity: for every cluster present in `labels`, the
    # fraction of its members that carry the cluster's most common true label.
    per_cluster_purity = cluster_df.groupby('Cluster')['True_Label'].agg(
        lambda members: members.value_counts().max() / len(members)
    )
    avg_purity = per_cluster_purity.mean()
    print(f"Average cluster purity: {avg_purity:.4f}")

    # Calculate adjusted rand index
    ari = adjusted_rand_score(truth, cluster_ids)
    print(f"Adjusted Rand Index: {ari:.4f}")

    return cluster_df, cross_tab, avg_purity, ari

# Run the label comparison for each clustering result, K-Means first
kmeans_analysis, hierarchical_analysis = (
    analyze_clusters(assignment, y_combined, title)
    for assignment, title in (
        (kmeans_labels, "K-Means"),
        (hierarchical_labels, "Hierarchical"),
    )
)


In [None]:
# Visualize clustering results
# Eight-panel dashboard (2 rows x 4 columns) comparing both clusterings with
# each other and with the true labels.

# Use PCA for 2D visualization
from sklearn.decomposition import PCA
# random_state pins the projection so the figure is reproducible on re-run
pca_2d = PCA(n_components=2, random_state=42)
X_pca_2d = pca_2d.fit_transform(X_combined)
pc1_label = f'PC1 ({pca_2d.explained_variance_ratio_[0]:.2%} variance)'
pc2_label = f'PC2 ({pca_2d.explained_variance_ratio_[1]:.2%} variance)'

fig, axes = plt.subplots(2, 4, figsize=(20, 12))
panels = axes.ravel()


def _pca_scatter(ax, colors, cmap, title, colorbar_label):
    """Scatter the 2-D PCA projection on ``ax``, coloured by ``colors``."""
    points = ax.scatter(X_pca_2d[:, 0], X_pca_2d[:, 1], c=colors, cmap=cmap, alpha=0.7)
    ax.set_xlabel(pc1_label)
    ax.set_ylabel(pc2_label)
    ax.set_title(title)
    fig.colorbar(points, ax=ax, label=colorbar_label)


# 1. K-Means clusters
_pca_scatter(panels[0], kmeans_labels, 'viridis', 'K-Means Clustering', 'Cluster')

# 2. Hierarchical clusters
_pca_scatter(panels[1], hierarchical_labels, 'viridis', 'Hierarchical Clustering', 'Cluster')

# 3. True labels
_pca_scatter(panels[2], y_combined, 'coolwarm', 'True Labels', 'Heart Disease')

# Unpack the analysis tuples once: (frame, cross-tab, average purity, ARI)
kmeans_df, kmeans_cross_tab, kmeans_purity, kmeans_ari = kmeans_analysis
hier_df, hier_cross_tab, hier_purity, hier_ari = hierarchical_analysis

# 4. K-Means vs True labels
# iloc[:-1, :-1] drops the 'All' margin row/column so only real counts are drawn
sns.heatmap(kmeans_cross_tab.iloc[:-1, :-1], annot=True, fmt='d', cmap='Blues', ax=panels[3])
panels[3].set_title('K-Means: Cluster vs True Label')
panels[3].set_xlabel('True Label')
panels[3].set_ylabel('Cluster')

# 5. Hierarchical vs True labels
sns.heatmap(hier_cross_tab.iloc[:-1, :-1], annot=True, fmt='d', cmap='Greens', ax=panels[4])
panels[4].set_title('Hierarchical: Cluster vs True Label')
panels[4].set_xlabel('True Label')
panels[4].set_ylabel('Cluster')

# 6. Performance comparison
# The per-method silhouette list gets its own name so it does not overwrite
# the per-k `silhouette_scores` list built during the K-Means sweep.
methods = ['K-Means', 'Hierarchical']
method_silhouette_scores = [kmeans_silhouette, hierarchical_silhouette]
ari_scores = [kmeans_ari, hier_ari]
purity_scores = [kmeans_purity, hier_purity]

method_pos = np.arange(len(methods))
bar_width = 0.25

panels[5].bar(method_pos - bar_width, method_silhouette_scores, bar_width,
              label='Silhouette Score', alpha=0.8)
panels[5].bar(method_pos, ari_scores, bar_width, label='ARI Score', alpha=0.8)
panels[5].bar(method_pos + bar_width, purity_scores, bar_width,
              label='Purity Score', alpha=0.8)

panels[5].set_xlabel('Clustering Method')
panels[5].set_ylabel('Score')
panels[5].set_title('Clustering Performance Comparison')
panels[5].set_xticks(method_pos)
panels[5].set_xticklabels(methods)
panels[5].legend()
# Silhouette and ARI can be negative; extend the axis downwards when they are
# so those bars are not clipped away by a fixed lower limit of 0.
lowest_score = min(method_silhouette_scores + ari_scores + purity_scores)
panels[5].set_ylim(min(0.0, 1.1 * lowest_score), 1)

# 7. Cluster size distribution
# minlength guarantees one count per cluster id even if the highest-numbered
# cluster is empty, so the counts always match the bar positions.
kmeans_counts = np.bincount(kmeans_labels, minlength=optimal_k)
hier_counts = np.bincount(hierarchical_labels, minlength=optimal_k)
cluster_pos = np.arange(optimal_k)

panels[6].bar(cluster_pos - 0.2, kmeans_counts, 0.4, label='K-Means', alpha=0.8)
panels[6].bar(cluster_pos + 0.2, hier_counts, 0.4, label='Hierarchical', alpha=0.8)

panels[6].set_xlabel('Cluster')
panels[6].set_ylabel('Number of Samples')
panels[6].set_title('Cluster Size Distribution')
panels[6].set_xticks(cluster_pos)
panels[6].legend()

# 8. Mean of the first few features within each K-Means cluster
feature_names = X_combined.columns[:5]  # First 5 features
cluster_means_df = (
    X_combined[feature_names]
    .groupby(kmeans_labels)
    .mean()
    .reindex(range(optimal_k))  # keep a (NaN) row for any empty cluster
)
sns.heatmap(cluster_means_df.T, annot=True, fmt='.2f', cmap='RdYlBu_r', ax=panels[7])
panels[7].set_title('K-Means: Feature Means by Cluster')
panels[7].set_xlabel('Cluster')
panels[7].set_ylabel('Feature')

fig.tight_layout()
plt.show()


In [None]:
# Save clustering models and results
import os

models_dir = '../models'
results_dir = '../results'

# Make sure both output folders exist before anything is written
for folder in (models_dir, results_dir):
    os.makedirs(folder, exist_ok=True)

# Fitted estimators, each pickled to its own file (path -> object)
model_files = {
    f'{models_dir}/kmeans_model.pkl': kmeans_optimal,
    f'{models_dir}/hierarchical_model.pkl': hierarchical,
    f'{models_dir}/pca_2d_model.pkl': pca_2d,
}
for target_path, fitted in model_files.items():
    joblib.dump(fitted, target_path)


def _method_summary(model, labels, silhouette, analysis):
    """Bundle one method's model, labels and metrics into a plain dict."""
    _frame, cross_tab, purity, ari = analysis
    return {
        'model': model,
        'labels': labels,
        'silhouette_score': silhouette,
        'ari_score': ari,
        'purity_score': purity,
        'cross_tab': cross_tab,
    }


def _metric_lines(heading, silhouette, analysis, labels):
    """Format the text-report paragraph for one clustering method."""
    return (
        f"{heading} Clustering:\n"
        f"  Silhouette Score: {silhouette:.4f}\n"
        f"  Adjusted Rand Index: {analysis[3]:.4f}\n"
        f"  Average Purity: {analysis[2]:.4f}\n"
        f"  Cluster distribution: {np.bincount(labels).tolist()}\n\n"
    )


# One pickle holding the metrics of both methods plus the shared artefacts
clustering_results = {
    'kmeans': _method_summary(
        kmeans_optimal, kmeans_labels, kmeans_silhouette, kmeans_analysis
    ),
    'hierarchical': _method_summary(
        hierarchical, hierarchical_labels, hierarchical_silhouette, hierarchical_analysis
    ),
    'optimal_k': optimal_k,
    'linkage_matrix': linkage_matrix,
}
results_file = f'{models_dir}/clustering_results.pkl'
joblib.dump(clustering_results, results_file)

# Human-readable evaluation report, assembled first and written in one go
report_text = "".join([
    "Heart Disease Prediction - Clustering Evaluation Results\n",
    "=" * 60 + "\n\n",
    f"Optimal number of clusters: {optimal_k}\n\n",
    _metric_lines("K-Means", kmeans_silhouette, kmeans_analysis, kmeans_labels),
    _metric_lines("Hierarchical", hierarchical_silhouette, hierarchical_analysis,
                  hierarchical_labels),
    "Cross-tabulation Tables:\n",
    "K-Means:\n",
    str(kmeans_analysis[1]),
    "\n\nHierarchical:\n",
    str(hierarchical_analysis[1]),
])
evaluation_file = f'{results_dir}/clustering_evaluation.txt'
with open(evaluation_file, 'w') as f:
    f.write(report_text)

print("Unsupervised learning completed and models saved!")
print("Files saved:")
for saved_path in (*model_files, results_file, evaluation_file):
    print(f"- {saved_path}")

# Display final summary
# The method with the strictly higher silhouette score wins; a tie goes to
# Hierarchical.
if kmeans_silhouette > hierarchical_silhouette:
    best_method = 'K-Means'
else:
    best_method = 'Hierarchical'
best_silhouette = max(kmeans_silhouette, hierarchical_silhouette)

print("\nClustering Summary:")
print(f"- Optimal number of clusters: {optimal_k}")
print(f"- Best clustering method: {best_method}")
print(f"- Best silhouette score: {best_silhouette:.4f}")
print("- All clustering models trained and saved successfully!")
