# Data preparation

In [None]:
import numpy as np
import pandas as pd

In [None]:
############################################ Task 1 ############################################
# Load the data
# ----------------------------------------- start here -----------------------------------------

# Load wine dataset from sklearn
from sklearn.datasets import load_wine
# Placeholders to fill in. `wine` must expose `.feature_names` and `X` is the data handed to
# pd.DataFrame in the DataFrame cell further down.
# `y` is presumably the vector of true class labels — TODO confirm.
wine = ...
X = ...
y = ...

In [None]:
# Import the whiten function
from scipy.cluster.vq import whiten

In [None]:
# Wrap the feature matrix in a DataFrame whose columns carry the wine feature names
df = pd.DataFrame(data=X, columns=wine.feature_names)

In [None]:
############################################ Task 1 ############################################
# Use the whiten function from scipy.cluster.vq to standardize the data
# ----------------------------------------- start here -----------------------------------------

# Copy the feature names (the columns that will be rescaled)
whiten_cols = ...

# Use the whiten() function to standardize the data.
# Note: whiten() divides each feature by its standard deviation (unit variance); it does not
# subtract the mean. The result overwrites the selected columns of `df`.
df.loc[:, whiten_cols] = whiten(...)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress all Python warnings from here on (no category filter is given)
warnings.filterwarnings('ignore')

############################################ Task 1 ############################################
# Display the dataset with pairplot from seaborn, showing only the first three features
# ----------------------------------------- start here -----------------------------------------

# Scatter plots off the diagonal, KDE curves on the diagonal; alpha=0.3 makes the points translucent
sns.pairplot(..., kind='scatter', diag_kind="kde", height=1.5, plot_kws={'alpha':0.3})

# Hierarchical Clustering

## Algorithm

* #### Let each data point be a cluster and compute the proximity matrix
* #### Merge the two closest clusters and update the proximity matrix, until only a single cluster remains

In [None]:
# Import linkage and fcluster functions
from scipy.cluster.hierarchy import linkage, fcluster

#### Linkage types

<img src="linkage_types.png" width="700"/>

* #### Single-Linkage: $dist(C_i,C_j):=\min\{dist(x_i,x_j): x_i \in C_i, x_j \in C_j\}$

* #### Complete-Linkage: $dist(C_i,C_j):=\max\{dist(x_i,x_j): x_i \in C_i, x_j \in C_j\}$

* #### Average-Linkage: $dist(C_i,C_j):=\frac{1}{|C_i||C_j|} \sum_{x_i \in C_i, x_j \in C_j} dist(x_i,x_j)$

* #### Ward: 

Lecture: $dist(C_i,C_j):=\frac{|C_i||C_j|}{|C_i|+|C_j|} dist(\bar{x}_i,\bar{x}_j)^2$, $\bar{x}_i = \frac{1}{|C_i|}\sum_{x_k\in C_i}x_k$

Scipy documentation:
$dist(C_i,C_j):=\sqrt{\frac{|C_j|+|s|}{T}dist(C_j,s)^2 + \frac{|C_j|+|t|}{T}dist(C_j,t)^2 - \frac{|C_j|}{T}dist(s,t)^2}$,

where $C_i$ is the newly joined cluster consisting of clusters $s$ and $t$, $C_j$ is an unused cluster in the forest, $T = |C_j|+|s|+|t|$, and $|*|$ is the cardinality of its argument. 

In [None]:
############################################ Task 2 ############################################
# Perform hierarchical clustering using the 'single' linkage method
# ----------------------------------------- start here -----------------------------------------

# Use the linkage() function to build the linkage matrix (the hierarchical merge tree)
Z_single = linkage(..., ...)

# Generate cluster labels and add them to a copy of the dataframe for the 'single' method.
# criterion='maxclust' with t=3 cuts the tree into at most 3 flat clusters.
df_single = df.copy()
df_single['labels_single'] = fcluster(..., 3, criterion='maxclust')

In [None]:
############################################ Task 2 ############################################
# Perform hierarchical clustering using the 'complete' linkage method
# ----------------------------------------- start here -----------------------------------------

# Use the linkage() function to build the linkage matrix (the hierarchical merge tree)
Z_complete = linkage(..., ...)

# Generate cluster labels and add them to a copy of the dataframe for the 'complete' method.
# criterion='maxclust' with t=3 cuts the tree into at most 3 flat clusters.
df_complete = df.copy()
df_complete['labels_complete'] = fcluster(..., 3, criterion='maxclust')

In [None]:
############################################ Task 2 ############################################
# Perform hierarchical clustering using the 'centroid' linkage method
# ----------------------------------------- start here -----------------------------------------

# Use the linkage() function to build the linkage matrix (the hierarchical merge tree)
Z_centroid = linkage(..., ...)

# Generate cluster labels and add them to a copy of the dataframe for the 'centroid' method.
# criterion='maxclust' with t=3 cuts the tree into at most 3 flat clusters.
df_centroid = df.copy()
df_centroid['labels_centroid'] = fcluster(..., 3, criterion='maxclust')

In [None]:
############################################ Task 2 ############################################
# Perform hierarchical clustering using the 'ward' linkage method
# ----------------------------------------- start here -----------------------------------------

# Use the linkage() function to build the linkage matrix (the hierarchical merge tree)
Z_ward = linkage(..., ...)

# Generate cluster labels and add them to a copy of the dataframe for the 'ward' method.
# criterion='maxclust' with t=3 cuts the tree into at most 3 flat clusters.
df_ward = df.copy()
df_ward['labels_ward'] = fcluster(..., 3, criterion='maxclust')

In [None]:
############################################ Task 2 ############################################
# Visualize the resulting clusters for different linkage methods
# ----------------------------------------- start here -----------------------------------------

# NOTE(review): recent seaborn releases expose the figure as `.figure` and deprecate `.fig`;
# confirm against the installed seaborn version.

# Plot predictions for the 'single' linkage method
columns = ['alcohol','malic_acid','ash',...]
single=sns.pairplot(..., hue=..., height=1.5)
single.fig.suptitle("single method")

# Plot predictions for the 'complete' linkage method
columns = ['alcohol','malic_acid','ash',...]
complete=sns.pairplot(..., hue=..., height=1.5)
complete.fig.suptitle("complete method")

# Plot predictions for the 'centroid' linkage method
columns = ['alcohol','malic_acid','ash',...]
centroid=sns.pairplot(..., hue=..., height=1.5)
centroid.fig.suptitle("centroid method")

# Plot predictions for the 'ward' linkage method
columns = ['alcohol','malic_acid','ash',...]
ward=sns.pairplot(..., hue=..., height=1.5)
ward.fig.suptitle("ward method")

## How many clusters?

### Dendrograms

In [None]:
############################################ Task 2 ############################################
# Display the dendrograms for the linkage methods single, complete, centroid and ward
# ----------------------------------------- start here -----------------------------------------

# Import the dendrogram function
from scipy.cluster.hierarchy import dendrogram
# One tall figure holding four stacked panels (4 rows x 1 column), one per linkage method
fig=plt.figure(figsize=(15,20))

# Create dendrogram for the linkage method single
plt.subplot(4, 1, 1)
# `dn` is overwritten by every dendrogram() call below and is not read again in this cell
dn = dendrogram(...)
plt.grid(axis='y')

# Create dendrogram for the linkage method complete
plt.subplot(4, 1, 2)
dn = dendrogram(...)
plt.grid(axis='y')

# Create dendrogram for the linkage method centroid
plt.subplot(4, 1, 3)
dn = dendrogram(...)
plt.grid(axis='y')

# Create dendrogram for the linkage method ward
plt.subplot(4, 1, 4)
dn = dendrogram(...)
plt.grid(axis='y')

# Display the dendrograms
plt.show()

# Comparison of methods in linkage

In [None]:
from sklearn.metrics import calinski_harabasz_score

In [None]:
############################################ Task 2 ############################################
# Compare the Calinski-Harabasz index for the linkage methods single, complete, centroid and ward
# ----------------------------------------- start here -----------------------------------------

# List of different methods
methods = ['single', 'complete', 'centroid', 'ward'] 
  
# List of the Calinski-Harabasz scores
scores = [] 
  
# Evaluating the performance: one score per method.
# Presumably the scores are plotted against `methods`, so append them in the same order — TODO confirm.
# NOTE(review): the df_<method> frames also hold the label column; pass only the feature columns as data.
scores.append(calinski_harabasz_score(..., ...))
scores.append(calinski_harabasz_score(..., ...))
scores.append(calinski_harabasz_score(..., ...))
scores.append(calinski_harabasz_score(..., ...))
  
# Plotting a Bar Graph to compare the methods
plt.bar(..., ...) 
plt.xlabel('methods') 
plt.ylabel('Calinski Harabasz Score')
plt.grid(alpha=0.5)
plt.title('Comparison of methods in linkage')
plt.show() 
print(scores)

# K-Means Clustering

## Algorithm

* #### Choose the number of clusters k

* #### Randomly select the centroid for each cluster

* #### Assign each point to the closest cluster centroid

* #### Compute the centroids of newly formed clusters

* #### Repeat the previous two steps, until there is no change to the centroids

In [None]:
############################################ Task 3 ############################################
# Perform k-means clustering
# ----------------------------------------- start here -----------------------------------------

# Import kmeans and vq functions
from scipy.cluster.vq import kmeans, vq

# Compute cluster centers.
# The second element of the returned pair is discarded (scipy's kmeans returns the distortion there).
centroids, _ = ...

# Assign cluster labels on a copy of the dataframe.
# The second element of the returned pair is discarded (scipy's vq returns the distances there).
df_kmeans = df.copy()
df_kmeans['pred_labels'], _ = ...

# Plot the data points with seaborn, showing only the first three features
columns = ['alcohol','malic_acid','ash',...]
sns.pairplot(..., hue=..., height=1.5)

In [None]:
############################################ Task 3 ############################################
# Visualize the dataset with true labels, showing only the first three features
# ----------------------------------------- start here -----------------------------------------
# Store the ground-truth classes next to the k-means predictions
df_kmeans['true_labels'] = ...
columns = ['alcohol','malic_acid','ash',...]
# NOTE(review): this rebinds `single`, the name used earlier for the single-linkage pairplot;
# here it holds the true-label plot.
single=sns.pairplot(..., hue=..., height=1.5)

In [None]:
############################################ Task 3 ############################################
# Compare the predicted labels with the true labels.
# ----------------------------------------- start here -----------------------------------------

# Compare the true label and predicted label
import pandas as pd
# Relabel the predicted cluster ids (0 <-> 2, 1 unchanged), presumably to line them up with the true class ids.
# NOTE(review): the mapping is hard-coded; k-means cluster ids are arbitrary and may differ
# between runs unless the initialisation is seeded — verify against the crosstab.
df_kmeans = df_kmeans.replace({'pred_labels': {0:2, 1:1, 2:0}})
ct = pd.crosstab(..., ...)
print(ct)

## How many clusters?

### Elbow method
* #### Elbow plot: plot of the number of clusters and distortion
* #### Elbow plot helps indicate number of clusters present in data

Distortion := $\frac{1}{|s|}\sum_{i=1}^k\sum_{x_j \in C_i} dist(x_j, \mu_i(C_i))$,

where $|s|$ is the number of observations and $\mu_i(C_i)$ is the centroid of cluster $C_i$.

In [None]:
############################################ Task 3 ############################################
# Use the elbow method to determine the optimal number of clusters for k-means clustering
# ----------------------------------------- start here -----------------------------------------

distortions = []
# Candidate numbers of clusters: k = 1, ..., 6
num_clusters = range(1, 7)

# Create a list of distortions from the kmeans function (one entry per k; the centers are not kept)
for i in num_clusters:
    cluster_centers, distortion = kmeans(..., ...)
    distortions.append(distortion)

# Create a DataFrame with two lists - num_clusters, distortions
elbow_plot = pd.DataFrame({'num_clusters': num_clusters, 'distortions': distortions})

# Create a line plot of num_clusters and distortions
plt.figure(figsize=(6,3))
sns.lineplot(x='num_clusters', y='distortions', data = ...)
plt.xticks(num_clusters)
plt.show()

# Spectral Clustering

## Algorithm

* #### Compute unnormalized Graph Laplacian $L$

* #### Compute the eigenvectors $U = [u_1,\cdots,u_k]$ corresponding to the $k$ smallest eigenvalues of $L$

* #### Set $v_i = (U_{i,1},\cdots,U_{i,k}) \in \mathbb{R}^k$ as rows of $U$

* #### Cluster the $v_i$ using $k$-means and output found clusters $C_1,\cdots,C_k$

#### Nearest Neighbors: 
The affinity matrix is constructed using a k-nearest neighbors connectivity matrix.

#### RBF (Radial Basis Function): 
The affinity matrix is constructed using a kernel function such as the Gaussian kernel $W_{i,j}=\exp\left(-dist(x_i,x_j)^2/\sigma^2\right)$.

In [None]:
############################################ Task 4 ############################################
# Perform spectral clustering
# ----------------------------------------- start here -----------------------------------------

# Import SpectralClustering function
from sklearn.cluster import SpectralClustering 

# Building the clustering model 
spectral_model = SpectralClustering(..., ...)

# Training the model and storing the predicted cluster labels on a copy of the dataframe
df_sc = df.copy()
df_sc['pred_labels'] = ...

# Plot the data points with seaborn, showing only the first three features
columns = ['alcohol','malic_acid','ash',...]
sns.pairplot(..., hue=..., height=1.5)

In [None]:
# Compare the true label and predicted label
import pandas as pd
# Store the ground-truth classes under 'true_labels', the same column name used for df_kmeans
# (the original key 'ture_labels' was a typo).
df_sc['true_labels'] = ...
# Relabel the predicted cluster ids (1 <-> 2, 0 unchanged), presumably to line them up with the true class ids.
# NOTE(review): the mapping is hard-coded; spectral-clustering cluster ids are arbitrary and may
# differ between runs unless a random_state is fixed — verify against the crosstab.
df_sc = df_sc.replace({'pred_labels': {0:0, 1:2, 2:1}})
ct = pd.crosstab(..., ...)
print(ct)

## How many clusters?

In [None]:
############################################ Task 4 ############################################
# Use the Calinski-Harabasz scores to determine the optimal number of clusters for spectral clustering
# ----------------------------------------- start here -----------------------------------------

scores = []
# Candidate numbers of clusters: k = 2, ..., 6 (the Calinski-Harabasz index needs at least two clusters)
num_clusters = range(2, 7)
df_sc_temp = df.copy()

# Calculate the Calinski-Harabasz scores of spectral clustering with different numbers of clusters
for i in num_clusters:
    # Building the clustering model
    spectral_model_temp = SpectralClustering(..., ...)
    
    # Training the model and storing the predicted cluster labels (overwritten on every iteration)
    df_sc_temp['pred_labels'] = ...
    
    # Add the Calinski-Harabasz score to scores
    # NOTE(review): df_sc_temp now also contains 'pred_labels'; pass only the feature columns as data.
    scores.append(calinski_harabasz_score(..., ...))

# Create a DataFrame with two lists - num_clusters, scores
df_plot = pd.DataFrame({'num_clusters': num_clusters, 'scores': scores})

# Create a line plot of num_clusters and scores
plt.figure(figsize=(6,3))
sns.lineplot(x='num_clusters', y='scores', data=...)
plt.xticks(num_clusters)
plt.show()

# Comparison of models

In [None]:
############################################ Task 5 ############################################
# Compare the Silhouette scores for hierarchical clustering with the 'ward' linkage method, k-means clustering and spectral clustering.
# ----------------------------------------- start here -----------------------------------------

# Import Silhouette score
from sklearn.metrics import silhouette_score

# List of different models
models = ['Ward Hierarchical Clustering', 'Kmeans', 'Spectral Clustering'] 
  
# List of scores
scores = [] 
  
# Evaluating the performance: one score per model.
# Presumably the scores are plotted against `models`, so append them in the same order — TODO confirm.
# NOTE(review): the per-model dataframes also hold label columns; pass only the feature columns as data.
scores.append(silhouette_score(..., ...))
scores.append(silhouette_score(..., ...))
scores.append(silhouette_score(..., ...))
  
# Plotting a Bar Graph to compare the models 
plt.bar(..., ...) 
plt.xlabel('Models') 
plt.ylabel('Silhouette Score')
plt.grid(alpha=0.5)
plt.title('Comparison of different Clustering Models')
plt.show() 
print(scores)

In [None]:
############################################ Task 5 ############################################
# Compare the Calinski-Harabasz scores for hierarchical clustering with the 'ward' linkage method, k-means clustering and spectral clustering.
# ----------------------------------------- start here -----------------------------------------

# List of different models
models = ['Ward Hierarchical Clustering', 'Kmeans', 'Spectral Clustering'] 
  
# List of scores
scores = [] 
  
# Evaluating the performance: one score per model.
# Presumably the scores are plotted against `models`, so append them in the same order — TODO confirm.
# NOTE(review): the per-model dataframes also hold label columns; pass only the feature columns as data.
# (The second call previously read `(.., ...)`, which is a SyntaxError; it now uses the `...` placeholder.)
scores.append(calinski_harabasz_score(..., ...))
scores.append(calinski_harabasz_score(..., ...))
scores.append(calinski_harabasz_score(..., ...))
  
# Plotting a Bar Graph to compare the models 
plt.bar(..., ...) 
plt.xlabel('Models') 
plt.ylabel('Calinski Harabasz Score')
plt.grid(alpha=0.5)
plt.title('Comparison of different Clustering Models')
plt.show() 
print(scores)