# In [None]:
'''14. Perform hierarchical clustering such as Agglomerative algorithm and Divisive algorithm to group several vehicles. Utilize single, complete, and average linkage to define the cluster. Also draw the dendrogram for this problem.
Dataset: Vehicle dataset
'''
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# --- DATA PREP ---
# Read the vehicle dataset from disk.
vehicle = pd.read_csv('vehicle_dataset.csv')
# Clustering works on numeric features only: keep the numeric columns, then
# discard every row that still contains a missing value.
data = (
    vehicle
    .select_dtypes(include=np.number)
    .dropna()
)

# Fit the scaler on the numeric table and apply it to the same table so all
# features are on a comparable scale before distances are computed.
scaler = StandardScaler()
data_scaled = scaler.fit(data).transform(data)

# --- AGGLOMERATIVE CLUSTERING AND DENDROGRAMS ---
# One wide figure with a dendrogram per linkage criterion, side by side.
link_methods = ['single', 'complete', 'average']
plt.figure(figsize=(18, 5))
for i, method in enumerate(link_methods):
    # Panels are numbered from 1, left to right, in a single row.
    plt.subplot(1, len(link_methods), i + 1)
    # Build the merge tree for this linkage rule and draw it on the current axes.
    Z = linkage(data_scaled, method)
    dendrogram(Z)
    plt.xlabel('Sample Index')
    plt.ylabel('Distance')
    plt.title('Dendrogram ({} Linkage)'.format(method.capitalize()))
plt.tight_layout()
plt.show()

# --- AGGLOMERATIVE CLUSTER STRIP PLOT (average linkage, 2 clusters) ---
# Cut the average-linkage tree into at most two flat clusters, then subtract
# one so the plotted partition ids start at 0 (fcluster ids are 1-based).
Z = linkage(data_scaled, 'average')
labels_agg = fcluster(Z, 2, 'maxclust') - 1
# x = sample position, y = partition id; the colour repeats the partition id.
plt.scatter(np.arange(len(labels_agg)), labels_agg, c=labels_agg, cmap='Accent')
plt.xlabel('Sample Index')
plt.ylabel('Partition')
plt.title('Agglomerative Partition (avg linkage)')
plt.show()

# --- PSEUDO-DIVISIVE CLUSTER STRIP PLOT (simulated DIANA, 2 clusters) ---
def pseudo_divisive(X, n_clusters=2):
    """Approximate top-down (divisive) clustering by repeated 2-means splits.

    Starting from one cluster that holds every sample, the largest current
    cluster is split in two with ``KMeans(n_clusters=2, random_state=0)``
    until ``n_clusters`` clusters exist. This is a bisecting heuristic that
    stands in for DIANA; it is not the exact DIANA algorithm.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. It is converted with ``np.asarray`` so that rows can
        be selected with integer index arrays.
    n_clusters : int, default=2
        Target number of clusters.

    Returns
    -------
    numpy.ndarray of int, shape (n_samples,)
        Cluster label of each sample. The first cluster keeps label 0 and
        every split assigns the next unused label to one of the two halves.
        Fewer than ``n_clusters`` labels are produced when splitting stops
        early: the largest cluster has at most two samples, or 2-means puts
        all of its samples on one side.
    """
    X = np.asarray(X)
    n = len(X)
    partitions = np.zeros(n, dtype=int)
    cluster_indices = [np.arange(n)]
    label = 1
    while len(cluster_indices) < n_clusters:
        # Pick the largest cluster by list position (first one wins on ties).
        # list.remove() is avoided on purpose: it compares the stored NumPy
        # arrays with ==, which raises as soon as the wanted array is not
        # the first element of the list.
        pos = max(
            range(len(cluster_indices)),
            key=lambda k: len(cluster_indices[k]),
        )
        largest = cluster_indices[pos]
        if len(largest) <= 2:
            break
        km = KMeans(n_clusters=2, random_state=0).fit(X[largest])
        mask = km.labels_
        idx0, idx1 = largest[mask == 0], largest[mask == 1]
        if len(idx0) == 0 or len(idx1) == 0:
            # Nothing was actually separated (e.g. all points identical);
            # retrying would pick the same cluster again, so stop here.
            break
        del cluster_indices[pos]
        cluster_indices.extend([idx0, idx1])
        # idx0 keeps the parent's label; idx1 becomes the new cluster.
        partitions[idx1] = label
        label += 1
    return partitions

# Run the top-down splitter on the standardized features and show, for each
# sample position, which of the two partitions it was assigned to.
labels_div = pseudo_divisive(data_scaled, 2)
plt.scatter(np.arange(len(labels_div)), labels_div, c=labels_div, cmap='Accent')
plt.xlabel('Sample Index')
plt.ylabel('Partition')
plt.title('Divisive Partition (pseudo top-down)')
plt.show()
