In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import (KMeans, AgglomerativeClustering, DBSCAN, 
                           SpectralClustering, GaussianMixture, MeanShift, 
                           AffinityPropagation, OPTICS, Birch)
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import (silhouette_score, adjusted_rand_score, 
                           normalized_mutual_info_score, calinski_harabasz_score,
                           davies_bouldin_score)
from sklearn.neighbors import NearestNeighbors
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Creating the DataFrame
data = {
    'Car Model': ['Toyota Aygo', 'Mitsubishi Space Star', 'Skoda Citigo', 'Fiat 500', 'Mini Cooper', 'VW Up!', 'Skoda Fabia', 
                  'Mercedes A-Class', 'Ford Fiesta', 'Audi A1', 'Hyundai I20', 'Suzuki Swift', 'Ford Fiesta', 'Honda Civic', 
                  'Hyundai I30', 'Opel Astra', 'BMW 1', 'Mazda 3', 'Skoda Rapid', 'Ford Focus', 'Ford Mondeo', 'Opel Insignia', 
                  'Mercedes C-Class', 'Skoda Octavia', 'Volvo S60', 'Mercedes CLA', 'Audi A4', 'Audi A6', 'Volvo V70', 
                  'BMW 5', 'Mercedes E-Class', 'Volvo XC70', 'Ford B-Max', 'BMW 2', 'Opel Zafira', 'Mercedes SLK'],
    'Volume': [1000, 1200, 1000, 900, 1500, 1000, 1400, 1500, 1500, 1600, 1100, 1300, 1000, 1600, 1600, 1600, 1600, 2200, 1600, 2000,
               1600, 2000, 2100, 1600, 2000, 1500, 2000, 2000, 1600, 2000, 2100, 1600, 1600, 1600, 2500, 1800],
    'Weight': [790, 1160, 929, 865, 1140, 929, 1109, 1365, 1112, 1150, 980, 990, 1112, 1252, 1326, 1330, 1365, 1280, 1119, 1328,
               1584, 1428, 1365, 1415, 1415, 1465, 1490, 1725, 1523, 1705, 1605, 1746, 1235, 1390, 1405, 1395],
    'CO2': [99, 95, 95, 90, 105, 105, 90, 92, 98, 99, 99, 101, 99, 94, 97, 97, 99, 104, 104, 105,
            94, 99, 99, 99, 99, 102, 104, 114, 109, 114, 115, 117, 104, 108, 109, 120]
}

df = pd.DataFrame(data)

print("="*80)
print("COMPREHENSIVE CLUSTERING ANALYSIS FOR CAR DATA")
print("="*80)
print(f"Dataset shape: {df.shape}")
print(f"Features for clustering: Volume, Weight, CO2")
print("\nDataset summary:")
print(df[['Volume', 'Weight', 'CO2']].describe())

# Prepare data for clustering
features = ['Volume', 'Weight', 'CO2']
X = df[features].values

# Data scaling
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()

X_standard = scaler_standard.fit_transform(X)
X_minmax = scaler_minmax.fit_transform(X)

print(f"\nOriginal data shape: {X.shape}")
print("Data preprocessing completed (StandardScaler and MinMaxScaler)")

# =============================================================================
# 1. OPTIMAL NUMBER OF CLUSTERS ANALYSIS
# =============================================================================

print("\n" + "="*80)
print("1. OPTIMAL NUMBER OF CLUSTERS ANALYSIS")
print("="*80)

def plot_elbow_and_silhouette():
    """Plot elbow method and silhouette analysis"""
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # Elbow Method for K-Means
    K_range = range(2, 11)
    inertias = []
    silhouette_scores = []
    
    for k in K_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X_standard)
        inertias.append(kmeans.inertia_)
        silhouette_scores.append(silhouette_score(X_standard, kmeans.labels_))
    
    # Elbow curve
    axes[0, 0].plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
    axes[0, 0].set_xlabel('Number of Clusters (k)')
    axes[0, 0].set_ylabel('Inertia (Within-cluster sum of squares)')
    axes[0, 0].set_title('Elbow Method for Optimal k')
    axes[0, 0].grid(True, alpha=0.3)
    
    # Silhouette scores
    axes[0, 1].plot(K_range, silhouette_scores, 'ro-', linewidth=2, markersize=8)
    axes[0, 1].set_xlabel('Number of Clusters (k)')
    axes[0, 1].set_ylabel('Silhouette Score')
    axes[0, 1].set_title('Silhouette Analysis')
    axes[0, 1].grid(True, alpha=0.3)
    
    # Calinski-Harabasz Index
    ch_scores = []
    db_scores = []
    
    for k in K_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X_standard)
        ch_scores.append(calinski_harabasz_score(X_standard, labels))
        db_scores.append(davies_bouldin_score(X_standard, labels))
    
    axes[1, 0].plot(K_range, ch_scores, 'go-', linewidth=2, markersize=8)
    axes[1, 0].set_xlabel('Number of Clusters (k)')
    axes[1, 0].set_ylabel('Calinski-Harabasz Index')
    axes[1, 0].set_title('Calinski-Harabasz Index (Higher is Better)')
    axes[1, 0].grid(True, alpha=0.3)
    
    # Davies-Bouldin Index
    axes[1, 1].plot(K_range, db_scores, 'mo-', linewidth=2, markersize=8)
    axes[1, 1].set_xlabel('Number of Clusters (k)')
    axes[1, 1].set_ylabel('Davies-Bouldin Index')
    axes[1, 1].set_title('Davies-Bouldin Index (Lower is Better)')
    axes[1, 1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    # Find optimal k
    optimal_k_silhouette = K_range[np.argmax(silhouette_scores)]
    optimal_k_ch = K_range[np.argmax(ch_scores)]
    optimal_k_db = K_range[np.argmin(db_scores)]
    
    print(f"Optimal k based on Silhouette Score: {optimal_k_silhouette}")
    print(f"Optimal k based on Calinski-Harabasz Index: {optimal_k_ch}")
    print(f"Optimal k based on Davies-Bouldin Index: {optimal_k_db}")
    
    return optimal_k_silhouette

optimal_k = plot_elbow_and_silhouette()

# =============================================================================
# 2. CLUSTERING ALGORITHMS IMPLEMENTATION
# =============================================================================

print(f"\n" + "="*80)
print("2. IMPLEMENTING MULTIPLE CLUSTERING ALGORITHMS")
print("="*80)

clustering_results = {}

# Define clustering algorithms
clustering_algorithms = {
    'K-Means': KMeans(n_clusters=optimal_k, random_state=42, n_init=10),
    'K-Means (k=3)': KMeans(n_clusters=3, random_state=42, n_init=10),
    'K-Means (k=4)': KMeans(n_clusters=4, random_state=42, n_init=10),
    'Agglomerative (Ward)': AgglomerativeClustering(n_clusters=optimal_k, linkage='ward'),
    'Agglomerative (Complete)': AgglomerativeClustering(n_clusters=optimal_k, linkage='complete'),
    'Agglomerative (Average)': AgglomerativeClustering(n_clusters=optimal_k, linkage='average'),
    'DBSCAN (eps=0.5)': DBSCAN(eps=0.5, min_samples=3),
    'DBSCAN (eps=0.8)': DBSCAN(eps=0.8, min_samples=3),
    'DBSCAN (eps=1.0)': DBSCAN(eps=1.0, min_samples=3),
    'Spectral Clustering': SpectralClustering(n_clusters=optimal_k, random_state=42),
    'Gaussian Mixture': GaussianMixture(n_components=optimal_k, random_state=42),
    'Mean Shift': MeanShift(),
    'Affinity Propagation': AffinityPropagation(random_state=42),
    'OPTICS': OPTICS(min_samples=3),
    'Birch': Birch(n_clusters=optimal_k)
}

print("Applying clustering algorithms...")

for name, algorithm in clustering_algorithms.items():
    try:
        if 'DBSCAN' in name or name in ['Mean Shift', 'Affinity Propagation', 'OPTICS']:
            labels = algorithm.fit_predict(X_standard)
        else:
            labels = algorithm.fit_predict(X_standard)
        
        # Calculate metrics (skip if only one cluster or noise points)
        unique_labels = np.unique(labels)
        n_clusters = len(unique_labels) - (1 if -1 in labels else 0)
        
        if n_clusters > 1 and len(unique_labels) > 1:
            silhouette = silhouette_score(X_standard, labels)
            if n_clusters > 1:
                ch_score = calinski_harabacz_score(X_standard, labels)
                db_score = davies_bouldin_score(X_standard, labels)
            else:
                ch_score = 0
                db_score = float('inf')
        else:
            silhouette = -1
            ch_score = 0
            db_score = float('inf')
        
        clustering_results[name] = {
            'labels': labels,
            'n_clusters': n_clusters,
            'silhouette_score': silhouette,
            'calinski_harabasz_score': ch_score,
            'davies_bouldin_score': db_score,
            'algorithm': algorithm
        }
        
        print(f"{name:25s} | Clusters: {n_clusters:2d} | Silhouette: {silhouette:6.3f}")
        
    except Exception as e:
        print(f"Error with {name}: {str(e)}")

# =============================================================================
# 3. CLUSTERING RESULTS COMPARISON
# =============================================================================

print(f"\n" + "="*80)
print("3. CLUSTERING RESULTS COMPARISON")
print("="*80)

# Create comparison dataframe
comparison_data = []
for name, results in clustering_results.items():
    comparison_data.append({
        'Algorithm': name,
        'Clusters': results['n_clusters'],
        'Silhouette': results['silhouette_score'],
        'Calinski-Harabasz': results['calinski_harabasz_score'],
        'Davies-Bouldin': results['davies_bouldin_score']
    })

comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.sort_values('Silhouette', ascending=False)

print("Clustering Algorithm Comparison:")
print("="*70)
print(f"{'Algorithm':<25} {'Clusters':<8} {'Silhouette':<12} {'C-H Index':<12} {'D-B Index':<10}")
print("-"*70)
for _, row in comparison_df.iterrows():
    print(f"{row['Algorithm']:<25} {row['Clusters']:<8} {row['Silhouette']:<12.3f} {row['Calinski-Harabasz']:<12.1f} {row['Davies-Bouldin']:<10.3f}")

# =============================================================================
# 4. VISUALIZATION OF CLUSTERING RESULTS
# =============================================================================

print(f"\n" + "="*80)
print("4. CLUSTERING VISUALIZATIONS")
print("="*80)

# Select top performing algorithms for visualization
top_algorithms = comparison_df.head(8)['Algorithm'].tolist()

def plot_clustering_results():
    """Create comprehensive clustering visualizations"""
    
    # 1. 3D Scatter plots
    fig = plt.figure(figsize=(20, 15))
    
    for i, algo_name in enumerate(top_algorithms):
        ax = fig.add_subplot(2, 4, i+1, projection='3d')
        labels = clustering_results[algo_name]['labels']
        
        # Create color map
        unique_labels = np.unique(labels)
        colors = plt.cm.tab10(np.linspace(0, 1, len(unique_labels)))
        
        for label, color in zip(unique_labels, colors):
            if label == -1:  # Noise points
                mask = labels == label
                ax.scatter(X[mask, 0], X[mask, 1], X[mask, 2], 
                          c='black', marker='x', s=50, alpha=0.6, label='Noise')
            else:
                mask = labels == label
                ax.scatter(X[mask, 0], X[mask, 1], X[mask, 2], 
                          c=[color], s=50, alpha=0.7, label=f'Cluster {label}')
        
        ax.set_xlabel('Volume')
        ax.set_ylabel('Weight')
        ax.set_zlabel('CO2')
        ax.set_title(f'{algo_name}\n({clustering_results[algo_name]["n_clusters"]} clusters)')
        
        if len(unique_labels) <= 6:  # Only show legend if not too many clusters
            ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    
    plt.tight_layout()
    plt.show()
    
    # 2. 2D Projections
    fig, axes = plt.subplots(3, 3, figsize=(18, 15))
    axes = axes.flatten()
    
    feature_pairs = [('Volume', 'Weight'), ('Volume', 'CO2'), ('Weight', 'CO2')]
    
    for i, algo_name in enumerate(top_algorithms[:9]):
        ax = axes[i]
        labels = clustering_results[algo_name]['labels']
        
        # Plot Volume vs Weight
        unique_labels = np.unique(labels)
        colors = plt.cm.tab10(np.linspace(0, 1, len(unique_labels)))
        
        for label, color in zip(unique_labels, colors):
            if label == -1:
                mask = labels == label
                ax.scatter(df.loc[mask, 'Volume'], df.loc[mask, 'Weight'], 
                          c='black', marker='x', s=30, alpha=0.6)
            else:
                mask = labels == label
                ax.scatter(df.loc[mask, 'Volume'], df.loc[mask, 'Weight'], 
                          c=[color], s=30, alpha=0.7)
        
        ax.set_xlabel('Volume')
        ax.set_ylabel('Weight')
        ax.set_title(f'{algo_name}')
        ax.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

plot_clustering_results()

# =============================================================================
# 5. HIERARCHICAL CLUSTERING DENDROGRAM
# =============================================================================

print(f"\n" + "="*80)
print("5. HIERARCHICAL CLUSTERING ANALYSIS")
print("="*80)

def plot_dendrograms():
    """Plot dendrograms for different linkage methods"""
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    linkage_methods = ['ward', 'complete', 'average', 'single']
    
    for i, method in enumerate(linkage_methods):
        ax = axes[i//2, i%2]
        
        # Calculate linkage
        if method == 'ward':
            linkage_matrix = linkage(X_standard, method=method)
        else:
            linkage_matrix = linkage(X_standard, method=method, metric='euclidean')
        
        # Plot dendrogram
        dendrogram(linkage_matrix, ax=ax, truncate_mode='level', p=5)
        ax.set_title(f'Dendrogram - {method.capitalize()} Linkage')
        ax.set_xlabel('Sample Index or (Cluster Size)')
        ax.set_ylabel('Distance')
    
    plt.tight_layout()
    plt.show()

plot_dendrograms()

# =============================================================================
# 6. PCA VISUALIZATION
# =============================================================================

print(f"\n" + "="*80)
print("6. PCA-BASED CLUSTERING VISUALIZATION")
print("="*80)

# Apply PCA for 2D visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_standard)

print(f"PCA Explained Variance Ratio: {pca.explained_variance_ratio_}")
print(f"Total Variance Explained: {sum(pca.explained_variance_ratio_):.3f}")

def plot_pca_clusters():
    """Plot clustering results in PCA space"""
    fig, axes = plt.subplots(2, 4, figsize=(20, 10))
    axes = axes.flatten()
    
    for i, algo_name in enumerate(top_algorithms):
        ax = axes[i]
        labels = clustering_results[algo_name]['labels']
        
        unique_labels = np.unique(labels)
        colors = plt.cm.tab10(np.linspace(0, 1, len(unique_labels)))
        
        for label, color in zip(unique_labels, colors):
            if label == -1:
                mask = labels == label
                ax.scatter(X_pca[mask, 0], X_pca[mask, 1], 
                          c='black', marker='x', s=30, alpha=0.6, label='Noise')
            else:
                mask = labels == label
                ax.scatter(X_pca[mask, 0], X_pca[mask, 1], 
                          c=[color], s=30, alpha=0.7, label=f'C{label}')
        
        ax.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2f})')
        ax.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2f})')
        ax.set_title(f'{algo_name}')
        ax.grid(True, alpha=0.3)
        
        if len(unique_labels) <= 5:
            ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    
    plt.tight_layout()
    plt.show()

plot_pca_clusters()

# =============================================================================
# 7. DETAILED ANALYSIS OF BEST CLUSTERING
# =============================================================================

print(f"\n" + "="*80)
print("7. DETAILED ANALYSIS OF BEST CLUSTERING ALGORITHM")
print("="*80)

# Select best algorithm based on silhouette score
best_algo_name = comparison_df.iloc[0]['Algorithm']
best_results = clustering_results[best_algo_name]
best_labels = best_results['labels']

print(f"Best Algorithm: {best_algo_name}")
print(f"Number of Clusters: {best_results['n_clusters']}")
print(f"Silhouette Score: {best_results['silhouette_score']:.3f}")

# Add cluster labels to dataframe
df_clustered = df.copy()
df_clustered['Cluster'] = best_labels

print(f"\nCluster Distribution:")
cluster_counts = pd.Series(best_labels).value_counts().sort_index()
for cluster, count in cluster_counts.items():
    if cluster == -1:
        print(f"Noise Points: {count}")
    else:
        print(f"Cluster {cluster}: {count} cars")

print(f"\nCluster Statistics:")
print("="*50)
for cluster in sorted(np.unique(best_labels)):
    if cluster != -1:
        cluster_data = df_clustered[df_clustered['Cluster'] == cluster]
        print(f"\nCluster {cluster} ({len(cluster_data)} cars):")
        print(f"  Volume: {cluster_data['Volume'].mean():.1f} ± {cluster_data['Volume'].std():.1f}")
        print(f"  Weight: {cluster_data['Weight'].mean():.1f} ± {cluster_data['Weight'].std():.1f}")
        print(f"  CO2: {cluster_data['CO2'].mean():.1f} ± {cluster_data['CO2'].std():.1f}")
        print(f"  Cars: {', '.join(cluster_data['Car Model'].tolist()[:5])}" + 
              ('...' if len(cluster_data) > 5 else ''))

# =============================================================================
# 8. CLUSTER PROFILING AND INSIGHTS
# =============================================================================

print(f"\n" + "="*80)
print("8. CLUSTER PROFILING AND BUSINESS INSIGHTS")
print("="*80)

def analyze_clusters():
    """Provide business insights from clustering"""
    
    # Create cluster profiles
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # Box plots for each feature by cluster
    features_to_plot = ['Volume', 'Weight', 'CO2']
    
    for i, feature in enumerate(features_to_plot):
        if i < 3:
            ax = axes[i//2, i%2]
            df_clustered.boxplot(column=feature, by='Cluster', ax=ax)
            ax.set_title(f'{feature} Distribution by Cluster')
            ax.set_xlabel('Cluster')
            ax.set_ylabel(feature)
    
    # Correlation heatmap for the best clustering
    ax = axes[1, 1]
    cluster_means = df_clustered.groupby('Cluster')[['Volume', 'Weight', 'CO2']].mean()
    sns.heatmap(cluster_means.T, annot=True, cmap='viridis', ax=ax)
    ax.set_title('Cluster Centroids Heatmap')
    
    plt.tight_layout()
    plt.show()
    
    # Business insights
    print("BUSINESS INSIGHTS:")
    print("="*40)
    
    for cluster in sorted(np.unique(best_labels)):
        if cluster != -1:
            cluster_data = df_clustered[df_clustered['Cluster'] == cluster]
            avg_volume = cluster_data['Volume'].mean()
            avg_weight = cluster_data['Weight'].mean()
            avg_co2 = cluster_data['CO2'].mean()
            
            print(f"\nCluster {cluster} Profile:")
            if avg_volume < 1300 and avg_weight < 1100:
                print("  → Compact/Economy Cars: Small engine, lightweight, eco-friendly")
            elif avg_volume > 1800 and avg_weight > 1500:
                print("  → Large/Luxury Cars: High-performance, heavier, higher emissions")
            else:
                print("  → Mid-size Cars: Balanced performance and efficiency")
            
            print(f"  → Target Market: {'Economy-conscious' if avg_co2 < 100 else 'Performance-oriented' if avg_co2 > 110 else 'Mainstream'}")
            print(f"  → Environmental Impact: {'Low' if avg_co2 < 100 else 'High' if avg_co2 > 110 else 'Moderate'}")

analyze_clusters()

print(f"\n" + "="*80)
print("CLUSTERING ANALYSIS COMPLETED!")
print("="*80)
print(f"Total algorithms tested: {len(clustering_algorithms)}")
print(f"Best performing algorithm: {best_algo_name}")
print(f"Optimal number of clusters: {best_results['n_clusters']}")
print(f"Best silhouette score: {best_results['silhouette_score']:.3f}")
print("="*80)

ImportError: cannot import name 'GaussianMixture' from 'sklearn.cluster' (C:\Users\ajaym\AppData\Roaming\Python\Python312\site-packages\sklearn\cluster\__init__.py)