Reading the ARFF file.

In [None]:
from scipy.io.arff import loadarff
import pandas as pd, numpy as np

# loadarff yields a (records, metadata) pair; only the records feed the DataFrame.
arff_path = 'column_diagnosis.arff'
data = loadarff(arff_path)
records = data[0]
df = pd.DataFrame(records)

# Decode the 'class' column from bytes so the labels compare equal to plain str.
df['class'] = df['class'].str.decode('utf-8')


Separate input from output data and normalize the features using sklearn's minmax scaler.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# The diagnosis label is the output; every other column is an input feature.
target = df['class']
features = df.drop(columns='class')

# Learn the per-feature min/max on the data, then rescale that same data.
scaler = MinMaxScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)

**Exercise 1**

Define `purity_score` and create two lists, one to store the purity values and one to store the silhouette values. In a for loop, fit a k-means model for each k in {2, 3, 4, 5} and compute its purity and silhouette scores, appending each value to the corresponding list.

In [None]:
from sklearn import metrics, cluster

def purity_score(y_true, y_pred):
    """Return the purity of a clustering with respect to reference labels.

    Each cluster (a column of the contingency matrix) is credited with the
    count of its most frequent true class; purity is the sum of those counts
    divided by the total number of samples.

    Parameters
    ----------
    y_true : array-like
        Reference class label of each sample.
    y_pred : array-like
        Cluster label assigned to each sample.

    Returns
    -------
    Fraction of samples that fall in the majority class of their cluster.
    """
    contingency = metrics.cluster.contingency_matrix(y_true, y_pred)
    majority_per_cluster = contingency.max(axis=0)
    return majority_per_cluster.sum() / contingency.sum()

# One entry per value of k, in the order the values are tried below.
purities = []
silhouettes = []

for k in (2, 3, 4, 5):
    # The fixed seed keeps the clustering reproducible between runs.
    kmeans_algo = cluster.KMeans(n_clusters=k, n_init='auto', random_state=0)
    kmeans_model = kmeans_algo.fit(features_scaled)
    target_pred = kmeans_model.labels_

    # Purity compares against the known diagnoses; silhouette uses only the features.
    purity = purity_score(target, target_pred)
    silhouette = metrics.silhouette_score(features_scaled, target_pred)
    purities.append(purity)
    silhouettes.append(silhouette)

    print("Purity score for k = ", str(k), " is ", purity)
    print("Silhouette score for k = ", str(k), " is ", silhouette)

Plot the silhouette and purity.

In [None]:
import matplotlib.pyplot as plt

def _plot_score_curve(scores, title, ylabel, filename):
    """Draw one score-versus-k line plot, save it to `filename`, then show it."""
    plt.plot([2, 3, 4, 5], scores, 'o-')
    plt.title(title)
    plt.xlabel('Number of clusters')
    plt.ylabel(ylabel)
    # Save before show(): the figure may be cleared once it has been displayed.
    plt.savefig(filename)
    plt.show()


# Silhouette curve first, then purity; each goes to its own figure and file.
_plot_score_curve(silhouettes, 'Silhouette scores for k-means clustering',
                  'Silhouette score', 'ex1_silhouette.png')
_plot_score_curve(purities, 'Purity scores for k-means clustering',
                  'Purity score', 'ex1_purity.png')

**Exercise 2**

**i)**

Create a PCA object, fit the PCA model to the scaled features and transform the original data into a new dataset (X_pca) with only two dimensions. 

In [None]:
from sklearn.decomposition import PCA

# Keep only the two leading principal components of the scaled features.
pca = PCA(n_components=2).fit(features_scaled)
X_pca = pca.transform(features_scaled)

# Report the fitted directions and how much variance each one captures.
print("Components (eigenvectors):\n", pca.components_)
print("Explained variance (eigenvalues) =", pca.explained_variance_)
print("Explained variance (ratio) =", pca.explained_variance_ratio_)

**ii)**

Extract the eigenvectors of the first and second principal components. These vectors are the directions of maximum variance in the original (scaled) feature space, and together they span the new two-dimensional space. Compute the importance of each original feature in that new space as the Euclidean norm of the feature's two coefficients (one per eigenvector).

In [None]:
import math

# The first two rows of pca.components_ hold the PC1 and PC2 eigenvectors.
xvector, yvector = pca.components_[0], pca.components_[1]

columns = features.columns
# Importance of a feature = Euclidean norm of its (PC1, PC2) coefficients.
impt_features = {
    name: math.sqrt(pc1_coef**2 + pc2_coef**2)
    for name, pc1_coef, pc2_coef in zip(columns, xvector, yvector)
}
# (score, name) pairs sorted in descending order, so the most important feature comes first.
ranked = sorted(((score, name) for name, score in impt_features.items()), reverse=True)
print("Features by importance:\n", ranked)

**Exercise 3**

**i)**

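Visualize the data projected onto the two principal components, coloured by the ground-truth diagnoses (to be compared side by side with the clustering plot below).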

In [None]:
plt.figure(figsize=(12, 10))

# One marker series per diagnosis, drawn in a fixed order so the colours stay stable.
for diagnosis in ('Normal', 'Hernia', 'Spondylolisthesis'):
    is_diagnosis = target == diagnosis
    plt.plot(X_pca[is_diagnosis, 0], X_pca[is_diagnosis, 1], 'o',
             markersize=7, alpha=0.6, label=diagnosis)

plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.title('Ground diagnosis')
plt.show()

**ii)**

Visualize side-by-side the data using the previously learned k = 3 clustering solution.

In [None]:
# Refit the k = 3 model here so this cell does not depend on whatever value
# of `target_pred` the k-sweep loop left behind.
kmeans_algo = cluster.KMeans(n_clusters=3, random_state=0, n_init='auto')
kmeans_model = kmeans_algo.fit(features_scaled)
target_pred = kmeans_model.labels_

plt.figure(figsize=(12, 10))
# Keep the returned collection: it is needed to build the legend below.
cluster_scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=target_pred, alpha=0.6)

plt.xlabel('PC1')
plt.ylabel('PC2')
# A bare plt.legend() finds no labelled artists for a colour-mapped scatter and
# only emits a warning; derive one legend entry per cluster id from the scatter.
plt.legend(*cluster_scatter.legend_elements(), title='Cluster')
plt.title('k = 3 clustering solution')
plt.show()

**Extra/Optional Plot**

Combine the cluster labels with the original class labels into a DataFrame. Calculate the mode (most frequent) class for each cluster and create the scatter plot, labelling each cluster with its mode class.

In [None]:
# Pair every sample's cluster id with its ground-truth class.
cluster_mapping = pd.DataFrame({'Cluster': target_pred, 'Class': target})

# Most frequent class per cluster; on a tie, the first mode returned is used.
cluster_mode = cluster_mapping.groupby('Cluster')['Class'].agg(lambda x: x.mode().iat[0])

# Iterate over the per-cluster table itself so each scatter carries its own
# label and the legend cannot fall out of step with the plotting order.
# The loop variable is deliberately not named `cluster`: that would rebind the
# sklearn `cluster` module and break the KMeans cells when they are re-run.
for cluster_id, mode_class in cluster_mode.items():
    cluster_points = X_pca[target_pred == cluster_id]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1],
                label=f'Cluster {cluster_id}: {mode_class}', alpha=0.6)

plt.title('K-means Clustering')
plt.xlabel('PC1')
plt.ylabel('PC2')

# The legend text comes from the per-scatter labels set above.
plt.legend()

# Show the plot
plt.show()