# 03. Clustering Analysis

This notebook performs clustering analysis to identify groups of users with similar music tastes using various clustering algorithms.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
import warnings
# Suppress every warning category for the rest of the kernel session so cell
# output stays readable.
# NOTE(review): this is a blanket filter, so library warnings (for example
# deprecation or convergence notices) are hidden as well.
warnings.filterwarnings('ignore')

# Apply the dark-grid plotting style to every figure created below.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require
# Matplotlib >= 3.6 - confirm against the project environment.
plt.style.use('seaborn-v0_8-darkgrid')

## Load Engineered Features

In [None]:
# Read the three engineered feature tables (scaled, PCA-reduced, selected)
# in one pass; the order of the paths matches the order of the targets.
user_features_full, user_features_pca, user_features_selected = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/user_features_scaled.csv',
        '../data/processed/user_features_pca.csv',
        '../data/processed/user_features_selected.csv',
    )
)

# Report the shape of every table so the reader can compare them.
print(
    "Available feature sets:",
    f"1. Full features: {user_features_full.shape}",
    f"2. PCA features: {user_features_pca.shape}",
    f"3. Selected features: {user_features_selected.shape}",
    sep='\n',
)

# Cluster on the selected features. Work on a copy so that the cluster-label
# columns added by later cells do not touch the loaded table.
user_features = user_features_selected.copy()

# Every column except the identifier is a model input.
feature_cols = [name for name in user_features if name != 'user_id']
X = user_features.loc[:, feature_cols].values

## Optimal Number of Clusters

In [None]:
# Sweep candidate cluster counts for K-means, keeping the inertia (elbow
# curve) and the silhouette score of every fit.
k_range = range(2, 15)
inertias = []
silhouette_scores = []

for candidate_k in k_range:
    candidate_model = KMeans(n_clusters=candidate_k, random_state=42, n_init=10)
    candidate_model.fit(X)
    inertias.append(candidate_model.inertia_)
    silhouette_scores.append(silhouette_score(X, candidate_model.labels_))

# Draw both diagnostics side by side: inertia on the left, silhouette on the right.
fig, (elbow_ax, silhouette_ax) = plt.subplots(1, 2, figsize=(15, 5))
curves = (
    (elbow_ax, inertias, 'bo-', 'Inertia', 'Elbow Method for Optimal k'),
    (silhouette_ax, silhouette_scores, 'go-', 'Silhouette Score', 'Silhouette Score vs k'),
)
for ax, values, line_format, y_label, title in curves:
    ax.plot(k_range, values, line_format)
    ax.set_xlabel('Number of Clusters (k)')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True)

plt.tight_layout()
plt.show()

# The k with the highest silhouette score wins (first one on ties, as argmax
# returns the first maximum).
best_position = np.argmax(silhouette_scores)
optimal_k = k_range[best_position]
print(f"Optimal k based on silhouette score: {optimal_k}")

## K-Means Clustering

In [None]:
# Fit the final K-means model with the selected k (more restarts than the sweep).
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=20)
kmeans_labels = kmeans.fit_predict(X)

# Keep the assignment next to the user features for later profiling.
user_features['kmeans_cluster'] = kmeans_labels

# How many users landed in each cluster, ordered by cluster label.
cluster_sizes = pd.Series(kmeans_labels).value_counts().sort_index()
total_users = len(user_features)
print("K-Means Cluster Sizes:")
for cluster, size in cluster_sizes.items():
    share = size / total_users * 100
    print(f"  Cluster {cluster}: {size} users ({share:.1f}%)")

# Internal validity metrics, computed first and reported together.
kmeans_silhouette = silhouette_score(X, kmeans_labels)
kmeans_davies_bouldin = davies_bouldin_score(X, kmeans_labels)
kmeans_calinski_harabasz = calinski_harabasz_score(X, kmeans_labels)

print(f"\nClustering Metrics:")
print(f"Silhouette Score: {kmeans_silhouette:.3f}")
print(f"Davies-Bouldin Score: {kmeans_davies_bouldin:.3f}")
print(f"Calinski-Harabasz Score: {kmeans_calinski_harabasz:.1f}")

## Hierarchical Clustering

In [None]:
from sklearn.metrics import adjusted_rand_score

# Ward-linkage agglomerative clustering with the same number of clusters as
# K-means, so the two partitions are directly comparable.
hierarchical = AgglomerativeClustering(n_clusters=optimal_k, linkage='ward')
hierarchical_labels = hierarchical.fit_predict(X)
user_features['hierarchical_cluster'] = hierarchical_labels

# Agreement between the two partitions, measured by the adjusted Rand index.
ari_score = adjusted_rand_score(kmeans_labels, hierarchical_labels)
print(f"Adjusted Rand Index between K-means and Hierarchical: {ari_score:.3f}")

# Users per hierarchical cluster, ordered by cluster label.
hier_cluster_sizes = pd.Series(hierarchical_labels).value_counts().sort_index()
total_users = len(user_features)
print("\nHierarchical Cluster Sizes:")
for cluster, size in hier_cluster_sizes.items():
    share = size / total_users * 100
    print(f"  Cluster {cluster}: {size} users ({share:.1f}%)")

## DBSCAN Clustering

In [None]:
from sklearn.neighbors import NearestNeighbors

# Build the k-distance graph used to choose DBSCAN's eps.
k = 5  # minPts

nbrs = NearestNeighbors(n_neighbors=k)
nbrs.fit(X)
neighbor_distances, indices = nbrs.kneighbors(X)

# Keep only the farthest of the k returned neighbours (the last column) for
# every point, sorted ascending for plotting.
distances = np.sort(neighbor_distances[:, k - 1])

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(distances)
ax.set_ylabel(f'{k}-NN Distance')
ax.set_xlabel('Points sorted by distance')
ax.set_title('k-distance Graph for DBSCAN')
ax.grid(True)
plt.show()

# Rather than reading the elbow off the plot, take the 95th percentile of the
# sorted k-distances as the eps estimate.
eps = np.percentile(distances, 95)
print(f"Suggested eps: {eps:.3f}")

In [None]:
# Run DBSCAN with the eps / minPts chosen in the previous cell.
dbscan = DBSCAN(eps=eps, min_samples=k)
dbscan_labels = dbscan.fit_predict(X)
user_features['dbscan_cluster'] = dbscan_labels

# DBSCAN marks noise with the label -1; it is counted separately and is not
# treated as a cluster.
noise_mask = dbscan_labels == -1
n_noise = int(noise_mask.sum())
n_clusters = len(set(dbscan_labels)) - (1 if n_noise > 0 else 0)
noise_share = n_noise / len(user_features) * 100

print(f"DBSCAN Results:")
print(f"Number of clusters: {n_clusters}")
print(f"Number of noise points: {n_noise} ({noise_share:.1f}%)")

# The silhouette score needs at least two clusters; noise points are left out.
mask = ~noise_mask
if n_clusters > 1 and mask.any():
    dbscan_silhouette = silhouette_score(X[mask], dbscan_labels[mask])
    print(f"\nDBSCAN Metrics (excluding noise):")
    print(f"Silhouette Score: {dbscan_silhouette:.3f}")

## Cluster Visualization

In [None]:
# Reduce the feature matrix to two dimensions with three different methods
# (PCA, t-SNE, UMAP) so the clusters can be plotted.
print("Computing 2D projections for visualization...")

# PCA projection (linear, all points)
pca_2d = PCA(n_components=2, random_state=42)
X_pca_2d = pca_2d.fit_transform(X)

# t-SNE projection: subsample when there are too many points.
# The subsample is drawn from a dedicated, seeded generator so the plotted
# points are identical on every Restart & Run All; the global NumPy RNG is
# never seeded in this notebook, so drawing from it would change the sample
# on each run even though every estimator uses random_state=42.
TSNE_MAX_POINTS = 5000
if len(X) > TSNE_MAX_POINTS:
    sample_rng = np.random.default_rng(42)
    sample_idx = sample_rng.choice(len(X), TSNE_MAX_POINTS, replace=False)
    X_tsne_sample = X[sample_idx]
    labels_sample = kmeans_labels[sample_idx]
else:
    X_tsne_sample = X
    labels_sample = kmeans_labels

tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne_2d = tsne.fit_transform(X_tsne_sample)

# UMAP projection (all points)
umap_reducer = umap.UMAP(n_components=2, random_state=42)
X_umap_2d = umap_reducer.fit_transform(X)

In [None]:
# 2x3 overview figure: five scatter panels (one per projection / algorithm
# combination) plus a bar chart comparing the number of clusters found.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

pc1_share, pc2_share = pca_2d.explained_variance_ratio_[0], pca_2d.explained_variance_ratio_[1]

# One row per scatter panel:
# (axis, 2D coordinates, point labels, colormap, title, x label, y label)
scatter_panels = [
    (axes[0, 0], X_pca_2d, kmeans_labels, 'viridis', 'K-Means Clusters (PCA)',
     f'PC1 ({pc1_share:.1%} var)', f'PC2 ({pc2_share:.1%} var)'),
    (axes[0, 1], X_tsne_2d, labels_sample, 'viridis', 'K-Means Clusters (t-SNE)',
     't-SNE 1', 't-SNE 2'),
    (axes[0, 2], X_umap_2d, kmeans_labels, 'viridis', 'K-Means Clusters (UMAP)',
     'UMAP 1', 'UMAP 2'),
    (axes[1, 0], X_pca_2d, hierarchical_labels, 'plasma', 'Hierarchical Clusters (PCA)',
     f'PC1', f'PC2'),
    (axes[1, 1], X_pca_2d, dbscan_labels, 'coolwarm', 'DBSCAN Clusters (PCA)',
     f'PC1', f'PC2'),
]

drawn_scatters = []
for ax, coords, point_labels, colormap, title, x_label, y_label in scatter_panels:
    points = ax.scatter(coords[:, 0], coords[:, 1],
                        c=point_labels, cmap=colormap, alpha=0.6)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    drawn_scatters.append((ax, points))

# Last panel: number of clusters per method (DBSCAN's noise label -1 is not
# counted as a cluster).
cluster_methods = ['K-Means', 'Hierarchical', 'DBSCAN']
dbscan_has_noise = -1 in dbscan_labels
cluster_counts = [
    len(set(kmeans_labels)),
    len(set(hierarchical_labels)),
    len(set(dbscan_labels)) - (1 if dbscan_has_noise else 0),
]
count_ax = axes[1, 2]
count_ax.bar(cluster_methods, cluster_counts)
count_ax.set_title('Number of Clusters by Method')
count_ax.set_ylabel('Number of Clusters')

# Attach a colorbar to every scatter panel once all panels are drawn.
for ax, points in drawn_scatters:
    plt.colorbar(points, ax=ax)

plt.tight_layout()
plt.show()

## Cluster Profiling

In [None]:
# Profile the K-means clusters: mean of every feature within each cluster.
cluster_profiles = user_features.groupby('kmeans_cluster')[feature_cols].mean()

# Express each cluster mean as a z-score relative to the whole user
# population: (cluster mean - population mean) / population std.
# The profiles must NOT be standardized across the cluster means themselves:
# that forces every feature column to unit variance, so the variance ranking
# below would be ~1.0 for all features and "top features" would be picked
# from floating-point noise.
population_mean = user_features[feature_cols].mean()
# A constant feature has zero std; map it to NaN so the division yields NaN
# (ignored by nlargest/nsmallest) instead of +/-inf.
population_std = user_features[feature_cols].std().replace(0, np.nan)
cluster_profiles_norm = (cluster_profiles - population_mean) / population_std

# Features whose standardized cluster means spread the most between clusters
# are the ones that best tell the clusters apart.
feature_variance = cluster_profiles_norm.var()
top_features = feature_variance.nlargest(10).index.tolist()

# Heatmap of the standardized profiles (features as rows, clusters as columns).
# Tick labels come from the profile index so they always match the columns.
plt.figure(figsize=(12, 8))
sns.heatmap(cluster_profiles_norm[top_features].T,
            cmap='RdBu_r', center=0,
            cbar_kws={'label': 'Standardized Value'},
            xticklabels=[f'Cluster {label}' for label in cluster_profiles_norm.index],
            yticklabels=top_features)
plt.title('Cluster Profiles - Top Distinguishing Features')
plt.xlabel('Cluster')
plt.tight_layout()
plt.show()

In [None]:
# Print, for every K-means cluster, its three highest and three lowest
# standardized features.


def _format_deviation(value):
    """Render a standardized value as an unsigned magnitude plus a direction.

    The direction is derived from the sign of the value, so a negative value
    reads "1.20 std below average" rather than "-1.20 std below average", and
    a cluster whose "highest" features are still negative is not reported as
    being above average.
    """
    direction = "above" if value >= 0 else "below"
    return f"{abs(value):.2f} std {direction} average"


print("=== CLUSTER DESCRIPTIONS ===")
for cluster in range(optimal_k):
    cluster_data = cluster_profiles_norm.loc[cluster]

    print(f"\nCluster {cluster} ({cluster_sizes[cluster]} users):")

    # Features with the largest standardized values in this cluster
    print("  Highest features:")
    for feat, val in cluster_data.nlargest(3).items():
        print(f"    - {feat}: {_format_deviation(val)}")

    # Features with the smallest standardized values in this cluster
    print("  Lowest features:")
    for feat, val in cluster_data.nsmallest(3).items():
        print(f"    - {feat}: {_format_deviation(val)}")

## Genre Distribution by Cluster

In [None]:
# Genre preferences per K-means cluster (only when genre_* features exist).
genre_cols = [name for name in feature_cols if name.startswith('genre_')]

if not genre_cols:
    print("No genre features available for analysis")
else:
    # Mean value of every genre feature within each cluster
    genre_by_cluster = user_features.groupby('kmeans_cluster')[genre_cols].mean()

    # Two rows of panels, with enough columns to hold one panel per cluster
    n_panel_cols = (optimal_k + 1) // 2
    fig, axes = plt.subplots(2, n_panel_cols, figsize=(15, 10))
    axes = axes.ravel()

    # One bar chart per cluster showing its five strongest genres
    for cluster, ax in zip(range(optimal_k), axes):
        top_genres = genre_by_cluster.loc[cluster].nlargest(5)
        bar_positions = range(len(top_genres))

        ax.bar(bar_positions, top_genres.values)
        ax.set_xticks(bar_positions)
        ax.set_xticklabels([g.replace('genre_', '') for g in top_genres.index],
                           rotation=45, ha='right')
        ax.set_title(f'Cluster {cluster} - Top Genres')
        ax.set_ylabel('Average Preference')

    # Blank out the spare panel left over when the cluster count is odd
    for spare_ax in axes[optimal_k:]:
        spare_ax.axis('off')

    plt.tight_layout()
    plt.show()

## Save Clustering Results

In [None]:
# Persist cluster assignments, cluster profiles, fitted models and the 2D
# projections so downstream notebooks can reuse them.
from pathlib import Path

import joblib

# Make sure both output directories exist; on a fresh checkout '../models'
# may be missing, which would make joblib.dump fail with FileNotFoundError.
for output_dir in ('../data/processed', '../models'):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

# Save user clusters (one row per user, one column per algorithm)
cluster_results = user_features[['user_id', 'kmeans_cluster', 'hierarchical_cluster', 'dbscan_cluster']]
cluster_results.to_csv('../data/processed/user_clusters.csv', index=False)

# Save cluster profiles (index = K-means cluster label, so the index is kept)
cluster_profiles.to_csv('../data/processed/cluster_profiles.csv')

# Save clustering models
# NOTE(review): of these three estimators only KMeans offers predict() for
# unseen users; the other two are stored for inspection of the fitted state.
joblib.dump(kmeans, '../models/kmeans_model.pkl')
joblib.dump(hierarchical, '../models/hierarchical_model.pkl')
joblib.dump(dbscan, '../models/dbscan_model.pkl')

# Save 2D projections for visualization (t-SNE is omitted: it may have been
# computed on a subsample, so it does not align row-for-row with user_id)
projections = pd.DataFrame({
    'user_id': user_features['user_id'],
    'pca_1': X_pca_2d[:, 0],
    'pca_2': X_pca_2d[:, 1],
    'umap_1': X_umap_2d[:, 0],
    'umap_2': X_umap_2d[:, 1]
})
projections.to_csv('../data/processed/cluster_projections.csv', index=False)

print("Clustering results saved!")
print("\nSaved files:")
print("- user_clusters.csv: User cluster assignments")
print("- cluster_profiles.csv: Average feature values by cluster")
print("- cluster_projections.csv: 2D projections for visualization")
print("\nSaved models:")
print("- kmeans_model.pkl")
print("- hierarchical_model.pkl")
print("- dbscan_model.pkl")

## Clustering Summary

In [None]:
# Final report: dataset size, chosen k, a per-method comparison and the
# resulting recommendations.
print("=== CLUSTERING ANALYSIS SUMMARY ===")
print(f"\nDataset: {len(user_features)} users, {len(feature_cols)} features")
print(f"\nOptimal number of clusters: {optimal_k}")

# Each silhouette score is computed exactly once and reused below, because
# the computation is based on pairwise distances and is costly on large data.
kmeans_silhouette = silhouette_score(X, kmeans_labels)
hierarchical_silhouette = silhouette_score(X, hierarchical_labels)

# DBSCAN is scored on non-noise points only, and only when it found at least
# two clusters (the silhouette score is undefined otherwise).
dbscan_non_noise = dbscan_labels != -1
if n_clusters > 1:
    dbscan_silhouette = silhouette_score(X[dbscan_non_noise], dbscan_labels[dbscan_non_noise])
else:
    dbscan_silhouette = np.nan

print("\n=== METHOD COMPARISON ===")
methods_comparison = pd.DataFrame({
    'Method': ['K-Means', 'Hierarchical', 'DBSCAN'],
    'Clusters': [optimal_k, optimal_k, n_clusters],
    'Silhouette': [kmeans_silhouette, hierarchical_silhouette, dbscan_silhouette]
})
print(methods_comparison.to_string(index=False))

print("\n=== CLUSTER CHARACTERISTICS ===")
print("K-Means clusters show distinct patterns in:")
for rank, feat in enumerate(feature_variance.nlargest(5).index, start=1):
    print(f"  {rank}. {feat}")

# Describe the separation according to the measured score instead of always
# claiming the clusters are well-separated. The cut-offs are conventional
# rules of thumb for the silhouette coefficient, not hard limits.
if kmeans_silhouette >= 0.5:
    separation = "well-separated"
elif kmeans_silhouette >= 0.25:
    separation = "moderately separated"
else:
    separation = "weakly separated"

print("\n=== RECOMMENDATIONS ===")
print(f"- Use K-Means with {optimal_k} clusters for production")
print(f"- Clusters are {separation} (silhouette score: {kmeans_silhouette:.3f})")
print(f"- Focus on top {len(top_features)} features for interpretation")