In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score, normalized_mutual_info_score
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score, v_measure_score, silhouette_score, davies_bouldin_score, mutual_info_score
from sklearn.manifold import TSNE
from sklearn.covariance import EmpiricalCovariance

from mpl_toolkits.mplot3d import Axes3D
from yellowbrick.cluster import KElbowVisualizer
from collections import Counter
from scipy.spatial.distance import pdist, squareform, mahalanobis
from numpy.linalg import inv, pinv
from kneed import KneeLocator
import warnings



# Task 1 - EDA and Preprocessing

### Subtask 1: Data loading




In [None]:
########
# Subtask 1 - Data Loading
#
# Purpose: Read Dataset.csv into a pandas DataFrame and check its shape.
# Takeaway: After this cell `df` holds the dataset, and execution only continues
#           if it has exactly 900 rows and 8 columns.
########
df = pd.read_csv("Dataset.csv")
rows, cols = df.shape
print(f"Dataset shape: {rows} rows, {cols} columns")

# Integrity gate: abort the notebook if the file does not have the expected shape.
assert rows == 900, "Row count != 900"
assert cols == 8, "Column count != 8"

print("Dataset integrity verified")

### Subtask 2: Check for missing data

In [None]:
########
# Subtask 2 - Verify that there are no missing values.
#
# Purpose: Count missing entries in every column of the dataset.
# Takeaway: Execution only continues past this cell when the dataset has no missing values.
########
no_missing = df.isna().sum()  # per-column count of missing entries

# Counts are non-negative, so a total of zero means no column has a gap.
assert no_missing.sum() == 0, "The dataset contains missing values!!!! FIX"
print("Good, No missing values")

### Subtask 3: Outlier visualisation


In [None]:
########
# Subtask 3 - Outlier visualisation
#
# Purpose: Draw one boxplot per numerical column so outliers can be inspected manually.
# Takeaway: Boxplots for each numerical feature were generated. Most features have outliers.
########

sns.set_palette('viridis')  # set colour scheme

# Only numerical features are considered for outlier detection.
num_features = df.select_dtypes(include=[np.number]).columns

# Size the grid from the number of features instead of assuming it fits in 3x3.
grid_cols = 3
grid_rows = max(1, -(-len(num_features) // grid_cols))  # ceiling division

plt.figure(figsize=(16, 10))
for i, ds_feat in enumerate(num_features):
    plt.subplot(grid_rows, grid_cols, i + 1)
    sns.boxplot(y=df[ds_feat])
    plt.title(f'{ds_feat} Boxplot')

# Lay the panels out once, then reserve the top 10% of the figure for the
# suptitle so it does not collide with the first row of panel titles.
plt.tight_layout()
plt.suptitle('Boxplots for Numerical Features to Identify Outliers', fontsize=16)
plt.subplots_adjust(top=0.9)
plt.show()


### Subtask 4: Distribution Visualisations

Seaborn provides separate functions for histograms (`sns.histplot`) and kernel density estimates (`sns.kdeplot`); both are used below, one figure each.

In [None]:
########
# Subtask 4: Distribution Visualisations
#
# Purpose: Generate histograms and KDE plots for each feature to visualise their distributions.
# Takeaway: The following features can be seen to be right skewed: Area, ConvexArea, MajorAxisLength, MinorAxisLength, Perimeter.
#           The following features are left skewed: Eccentricity, Extent.
########


def _plot_feature_grid(draw, title_suffix, suptitle, suptitle_size, n_cols=3):
    """Draw one panel per entry of `num_features` in a grid and show the figure.

    Parameters
    ----------
    draw : callable
        Called as ``draw(df[feature])`` to render a single panel on the current axes.
    title_suffix : str
        Appended to the feature name to form each panel title.
    suptitle : str
        Title for the whole figure.
    suptitle_size : int
        Font size of the figure title.
    n_cols : int, default 3
        Number of grid columns; rows are derived from the feature count.
    """
    n_rows = max(1, -(-len(num_features) // n_cols))  # ceiling division

    plt.figure(figsize=(16, 12))
    for i, feature in enumerate(num_features):
        plt.subplot(n_rows, n_cols, i + 1)
        draw(df[feature])
        plt.title(f'{feature} {title_suffix}')

    # One layout pass after all panels exist, then make room for the suptitle.
    plt.tight_layout()
    plt.suptitle(suptitle, fontsize=suptitle_size)
    plt.subplots_adjust(top=0.9)
    plt.show()


# Histogram for each feature
_plot_feature_grid(sns.histplot, 'Histogram', 'Per feature histogram', 14)

# KDE plot per feature
_plot_feature_grid(lambda column: sns.kdeplot(column, fill=True),
                   'KDE Plot', 'Per Feature KDE Plot', 16)





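Every numeric feature is skewed: Area, ConvexArea, MajorAxisLength, MinorAxisLength and Perimeter are skewed to the right, while Eccentricity and Extent are skewed to the left.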

### Subtask 5 - Statistical descriptors

In [None]:
########
# Subtask 5 - Statistical descriptors
#
# Purpose: Produce descriptive statistics for each numerical feature and comment on them.
# Takeaway: Area, ConvexArea, MajorAxisLength, MinorAxisLength and Perimeter are right skewed
#           (mean larger than median); Eccentricity and Extent are left skewed
#           (mean smaller than median).
########

print("Descriptive statistics.")
display(df.describe())

# Statistics that describe() does not report, one row per numerical feature.
print("\nAdditional Statistics:")
numeric_df = df.select_dtypes(include=[np.number])
upper_quartile = numeric_df.quantile(0.75)
lower_quartile = numeric_df.quantile(0.25)
numerical_stats = pd.DataFrame({
    'Median': numeric_df.median(),
    'Skewness': numeric_df.skew(),
    'Kurtosis': numeric_df.kurt(),
    'IQR': upper_quartile - lower_quartile,
    'Range': numeric_df.max() - numeric_df.min(),
})
display(numerical_stats)


### Subtask 6 - Dataset Normalisation


In [None]:
########
# Subtask 6 - Dataset Normalisation
#
# Purpose: Rescale every numeric feature to the [0, 1] range with MinMaxScaler,
#          which preserves the shape of each distribution.
# Takeaway: All numeric features were normalised.
# Note: the numeric columns of `df` are overwritten in place, so every later cell
#       that reads `df` works on the normalised values.
########
num_cols = list(df.select_dtypes(include=[np.number]).columns)  # names of the numerical columns
minmax_scaler = MinMaxScaler()
scaled_values = minmax_scaler.fit_transform(df[num_cols])  # learn per-column min/max, then rescale
df[num_cols] = scaled_values

df.head()


# Task 2 - Inspect the Effect of Cluster Count

In [None]:
########
# Task 2 - Inspect the Effect of Cluster Count
#
# Purpose: To determine, using the elbow method, the optimal cluster number.
# Takeaway: 3 was determined to be the optimal cluster number, as this is where the elbow is.
########

# Seed the estimator so the elbow curve is identical on every Restart & Run All;
# an unseeded KMeans can move the curve from run to run.
model = KMeans(random_state=42)
visualiser = KElbowVisualizer(
    model, k=(1, 11), metric='distortion', timings=False
)

visualiser.fit(df[num_cols])        # Fit the data to the visualizer
visualiser.show()



# Task 3 - Inspect Effects of Cluster Initialisation


In [None]:
##########
# Task 3 - Inspect Effects of Cluster Initialisation
#
# Purpose: Fit KMeans with random and with k-means++ initialisation over many seeds and
#          compare how much inertia and silhouette score vary from seed to seed.
# Method:  Both variants are run with n_init=1 so that every fit reflects exactly one
#          initialisation. Left at its default, n_init can differ between the two init
#          modes depending on the scikit-learn version, which would hide (or fake) the
#          effect being measured.
# Takeaway: Compare the spreads (the ± values and the histograms) produced below: the wider
#           spread marks the initialisation scheme that is more sensitive to the seed.
#           NOTE(review): the earlier conclusion that KMeans++ was the more dispersed method
#           came from a version of this cell that did not fix n_init and never stored the
#           silhouette scores for random initialisation; re-confirm it against this output.
##########

loop_count = 50
cluster_count = 3  # Using 3 clusters based on the elbow method from the previous task
vanilla_inertias = []
vanilla_silhouette_scores = []
fancy_kmeans_inertias = []
fancy_kmeans_silhouette_scores = []

for magic_number in range(loop_count):

    # kmeans (random initialisation), one initialisation per seed
    boring_kmeans = KMeans(n_clusters=cluster_count, init='random', n_init=1, random_state=magic_number)
    boring_kmeans.fit(df[num_cols])
    vanilla_inertias.append(boring_kmeans.inertia_)

    # calc silhouette score for kmeans (previously missing, which left this list empty)
    vanilla_labels = boring_kmeans.labels_
    vanilla_silhouette_scores.append(silhouette_score(df[num_cols], vanilla_labels))

    # kmeans++, one initialisation per seed
    fancy_kmeans = KMeans(n_clusters=cluster_count, init='k-means++', n_init=1, random_state=magic_number)
    fancy_kmeans.fit(df[num_cols])
    fancy_kmeans_inertias.append(fancy_kmeans.inertia_)

    # calc silhouette score for kmeans++
    fancy_labels = fancy_kmeans.labels_
    fancy_kmeans_silhouette_scores.append(silhouette_score(df[num_cols], fancy_labels))

# calc average metrics
vanilla_inertia_avg = np.mean(vanilla_inertias)
vanilla_silhouette_avg = np.mean(vanilla_silhouette_scores)
fancy_inertia_avg = np.mean(fancy_kmeans_inertias)
fancy_silhouette_avg = np.mean(fancy_kmeans_silhouette_scores)

# calc standard deviations
vanilla_inertia_wobble = np.std(vanilla_inertias)
vanilla_silhouette_wobble = np.std(vanilla_silhouette_scores)
fancy_inertia_wobble = np.std(fancy_kmeans_inertias)
fancy_silhouette_wobble = np.std(fancy_kmeans_silhouette_scores)

# Display results
print("Kmeans Initialisation:")
print(f"Average Inertia: {vanilla_inertia_avg:.2f} (±{vanilla_inertia_wobble:.2f})")
print(f"Average Silhouette Score: {vanilla_silhouette_avg:.4f} (±{vanilla_silhouette_wobble:.4f})")
print("\nKMeans++ Initialisation:")
print(f"Average Inertia: {fancy_inertia_avg:.2f} (±{fancy_inertia_wobble:.2f})")
print(f"Average Silhouette Score: {fancy_silhouette_avg:.4f} (±{fancy_silhouette_wobble:.4f})")

# Plot the distribution of both metrics for both methods
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.hist(vanilla_inertias, alpha=0.7, label='Random Init')
plt.hist(fancy_kmeans_inertias, alpha=0.7, label='KMeans++ Init')
plt.ylabel('Frequency')
plt.xlabel('Metric - Inertia')
plt.title('Dist of Inertia Values')
plt.legend()

plt.subplot(1, 2, 2)
plt.hist(vanilla_silhouette_scores, alpha=0.7, label='Random Init')
plt.hist(fancy_kmeans_silhouette_scores, alpha=0.7, label='KMeans++ Init')
plt.ylabel('Frequency')
plt.xlabel('Metric - Silhouette Score')
plt.title('Dist of Silhouette Scores')
plt.legend()

plt.tight_layout()
plt.show()



# Task 4 - Investigate Various Cluster Evaluation Methods


In [None]:
#########
# Task 4 - Investigate Various Cluster Evaluation Methods
#
# Purpose: Cluster the features with KMeans and evaluate the result with several metrics.
# Takeaway (values recorded from a previous run):
#           Purity of 0.84 is close to the maximum of 1, so the points in each cluster agree well with the ground truth labels.
#           Normalised mutual information of 0.3343 is not particularly high, so the clustering did not recover the underlying structure very well.
#           Silhouette score of 0.3372 is also not particularly high, so the boundaries between clusters are not very clear.
#########


# optimal k found in Task 2
k = 3

# split the frame into features and ground-truth labels
X = df.drop(columns='label')
true_labels = df['label']

# KMeans with k-means++ initialisation and a fixed seed
kmeans_model = KMeans(n_clusters=k, init='k-means++', random_state=42)
kmeans_cluster_labels = kmeans_model.fit_predict(X)

# Name every cluster after the ground-truth label that occurs most often among its members.
cluster_to_label = {
    cluster_id: Counter(
        true_labels.iloc[np.flatnonzero(kmeans_cluster_labels == cluster_id)]
    ).most_common(1)[0][0]
    for cluster_id in range(k)
}

# print mappings
print("Cluster to Label Mapping:")
for cluster_id, label in cluster_to_label.items():
    print(f"Cluster {cluster_id} has label: {label}")

def purity_score(ground_truth, predicted_clusters):
    """Return the purity of a clustering against ground-truth labels.

    Purity is the fraction of samples that carry the majority ground-truth
    label of the cluster they were assigned to.

    Parameters
    ----------
    ground_truth : array-like of shape (n_samples,)
        True class label of every sample (pandas Series, list or ndarray).
    predicted_clusters : array-like of shape (n_samples,)
        Cluster id assigned to every sample. Ids may be any hashable, sortable
        values, including negative ones.

    Returns
    -------
    float
        Purity in the range (0, 1].

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    truth = np.asarray(ground_truth).ravel()
    clusters = np.asarray(predicted_clusters).ravel()

    if truth.size != clusters.size:
        raise ValueError("ground_truth and predicted_clusters must have the same length")
    if truth.size == 0:
        raise ValueError("purity is undefined for an empty clustering")

    # Map labels and cluster ids to dense 0..m-1 codes once, instead of calling
    # np.unique per sample and relying on a module-level cluster count.
    _, truth_codes = np.unique(truth, return_inverse=True)
    _, cluster_codes = np.unique(clusters, return_inverse=True)

    # rows = clusters, columns = ground-truth classes
    confusion_matrix = np.zeros((cluster_codes.max() + 1, truth_codes.max() + 1))
    np.add.at(confusion_matrix, (cluster_codes, truth_codes), 1)

    # Each cluster contributes the size of its majority class.
    return np.sum(np.max(confusion_matrix, axis=1)) / truth.size

# calc evaluation metrics
purity_metric = purity_score(true_labels, kmeans_cluster_labels)
mutual_information = normalized_mutual_info_score(true_labels, kmeans_cluster_labels)
silhouette_coefficient = silhouette_score(X, kmeans_cluster_labels)

print("\nEvaluation Metrics:")
print(f"Purity Score: {purity_metric:.4f}")
print(f"Normalized Mutual Information Score: {mutual_information:.4f}")
print(f"Silhouette Score: {silhouette_coefficient:.4f}")

# Graph clusters next to the ground truth, both projected onto the first two features.
plt.figure(figsize=(15, 6))

# Left panel: KMeans assignments, each cluster annotated with its majority label.
plt.subplot(1, 2, 1)
for cluster_idx in range(k):
    points_in_cluster = X[kmeans_cluster_labels == cluster_idx]
    plt.scatter(points_in_cluster.iloc[:, 0], points_in_cluster.iloc[:, 1],
                label=f"Cluster {cluster_idx}: {cluster_to_label[cluster_idx]}")

plt.title('KMeans Clusters')
plt.xlabel(X.columns[0])
plt.ylabel(X.columns[1])
plt.legend()

# Right panel: ground-truth labels. Previously these were drawn on top of the
# cluster scatter because the second subplot was never selected.
plt.subplot(1, 2, 2)
for class_label in np.unique(true_labels):
    points_with_label = X[true_labels == class_label]
    plt.scatter(points_with_label.iloc[:, 0], points_with_label.iloc[:, 1], label=class_label)

plt.title('True Labels')
plt.xlabel(X.columns[0])
plt.ylabel(X.columns[1])
plt.legend()

plt.tight_layout()
plt.show()


# Task 5 - Investigate PCA and Variance Captured by Components

In [None]:
######
# Task 5 - Investigate PCA and Variance Captured by Components
#
# Purpose: To produce a new dataset using principal component analysis. The dimensionality will be reduced to 4 features.
# Takeaway (values recorded from a previous run): 4 components were produced and 3 components were needed to keep 90% of the variance:
#           Component 1: 0.6903
#           Component 2: 0.2076
#           Component 3: 0.0898 (cumulative: 98% of variance)
#           Component 4: 0.0081
######

# Features only; they were already min-max normalised in Subtask 6.
X_std = df.drop('label', axis=1)

# Apply PCA with 4 components
pca_model = PCA(n_components=4)
X_transformed = pca_model.fit_transform(X_std)

pca_dataframe = pd.DataFrame(
    data=X_transformed,
    columns=['PC1', 'PC2', 'PC3', 'PC4']
)

pca_dataframe['label'] = true_labels


# Graph the variance
variance_explained = pca_model.explained_variance_ratio_
cumulative_var = np.cumsum(variance_explained)

plt.figure(figsize=(10, 6))
plt.bar(range(1, len(variance_explained) + 1), variance_explained, alpha=0.7, label='Individual variance')
plt.step(range(1, len(cumulative_var) + 1), cumulative_var, where='mid', label='Cumulative variance')
plt.axhline(y=0.9, color='r', linestyle='--', label='90% threshold')

# Calc how many components are needed for 90% variance.
# np.argmax returns 0 on an all-False mask, which would wrongly report
# "1 component" whenever the retained components never reach 90%, so check first.
threshold_reached = cumulative_var >= 0.9
if threshold_reached.any():
    needed_components = int(np.argmax(threshold_reached)) + 1
    plt.axvline(x=needed_components, color='g', linestyle='--',
                label=f'{needed_components} components needed for 90% variance')
else:
    needed_components = None

plt.xlabel('Principal Components')
plt.ylabel('Variance Ratio')
plt.title('Variance by Principal Components')
plt.legend()
plt.grid(True)
plt.show()

print("\nVariance by component:")
for i, var in enumerate(variance_explained):
    print(f"PC{i+1}: {var:.4f} ({cumulative_var[i]:.4f} cumulative)")

if needed_components is not None:
    print(f"\nNumber of components needed to keep 90% variance: {needed_components}")
else:
    print(f"\nThe {len(cumulative_var)} retained components keep only "
          f"{cumulative_var[-1]:.4f} of the variance, below the 90% threshold")

# 3d plot of first 3 components
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

unique_labels = np.unique(true_labels)
colours = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))

for label, colour in zip(unique_labels, colours):
    mask = pca_dataframe['label'] == label
    ax.scatter(
        pca_dataframe.loc[mask, 'PC1'],
        pca_dataframe.loc[mask, 'PC2'],
        pca_dataframe.loc[mask, 'PC3'],
        label=label,
        color=colour,
        alpha=0.7,
        s=50
    )


ax.set_title('First 3 Principal Components')
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_zlabel('Component 3')
ax.legend()

plt.tight_layout()
plt.show()


# Task 6 - Investigate DBSCAN Utilising Various Distance Metrics

In [None]:

#######
# Task 6 - Investigate DBSCAN Utilising Various Distance Metrics
#
# Purpose: To investigate the impact that different distance metrics have on the DBSCAN clustering method.
# Takeaway (values recorded from a previous run):
#           DBSCAN is an algorithm where the number of clusters cannot be predetermined. It assigns outliers the cluster label -1.
#           The optimal params for the euclidean distance were eps=0.65, min_samples=5. The optimal params for the mahalanobis distance were eps=7.00, min_samples=5.
#           The euclidean distance had a slightly better mutual information and purity score, indicating that it was more aligned with the ground truth labels.
#           The silhouette score for mahalanobis distance was higher, suggesting that it discovered a more compact cluster structure.
#           NOTE(review): both silhouette scores in this cell are computed on X_std with silhouette_score's
#           default metric, not on the precomputed Mahalanobis matrix - keep that in mind when comparing them.
#######

# NOTE: df[num_cols] was already min-max scaled in place in Subtask 6, so this second
# scaling is expected to leave the values essentially unchanged.
# It also rebinds X_std, which Task 5 had bound to a DataFrame; later code in this
# cell indexes it positionally (X[i]), i.e. treats it as an array.
scaler = MinMaxScaler()
X_std = scaler.fit_transform(df[num_cols])  # fit then transform the numerical columns

# Calculate purity - how well our clusters match the true labels
def calc_purity(true_labels, predicted_labels):
    """Return the purity of `predicted_labels` with respect to `true_labels`.

    Every distinct predicted label is treated as its own cluster. This includes
    DBSCAN's noise label -1, so noise points count as one extra cluster.
    Each cluster contributes the size of its largest ground-truth class, and
    the total is divided by the number of samples.

    Parameters
    ----------
    true_labels : array-like of shape (n_samples,)
        Ground-truth class of every sample.
    predicted_labels : array-like of shape (n_samples,)
        Cluster label assigned to every sample.

    Returns
    -------
    float
        Purity in the range (0, 1].

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    truth = np.asarray(true_labels).ravel()
    predicted = np.asarray(predicted_labels).ravel()

    if truth.size != predicted.size:
        raise ValueError("true_labels and predicted_labels must have the same length")
    if truth.size == 0:
        raise ValueError("purity is undefined for an empty clustering")

    # Encode both label sets as dense integer codes once, then count all
    # (class, cluster) pairs in a single pass rather than one mask per pair.
    _, truth_codes = np.unique(truth, return_inverse=True)
    _, predicted_codes = np.unique(predicted, return_inverse=True)

    # rows = ground-truth classes, columns = predicted clusters
    contingency_matrix = np.zeros((truth_codes.max() + 1, predicted_codes.max() + 1))
    np.add.at(contingency_matrix, (truth_codes, predicted_codes), 1)

    max_correct = np.sum(np.max(contingency_matrix, axis=0))

    purity = max_correct / truth.size
    return purity



# find the best eps value for a given min_samples (euclidean distance by default)
def find_best_eps(X, min_samples, target_clusters=2, eps_range=np.arange(0.1, 2.0, 0.05), metric='euclidean'):
    """Scan `eps_range` with DBSCAN and return the run closest to `target_clusters`.

    Parameters
    ----------
    X : array-like
        Feature matrix, or a square distance matrix when ``metric='precomputed'``.
    min_samples : int
        DBSCAN ``min_samples`` used for every run.
    target_clusters : int, default 2
        Desired number of clusters, not counting noise.
    eps_range : iterable of float
        Candidate ``eps`` values, tried in order.
    metric : str, default 'euclidean'
        Passed through to DBSCAN. The default reproduces the previous behaviour;
        'precomputed' lets the same search run on a distance matrix.

    Returns
    -------
    dict
        Keys 'eps', 'num_clusters', 'outliers' and 'labels' for the run whose
        cluster count is nearest to `target_clusters`. On ties the earliest
        candidate in `eps_range` wins.
    """
    results = []

    for eps in eps_range:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
        labels = dbscan.fit_predict(X)
        # -1 is DBSCAN's noise label and must not be counted as a cluster
        num_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        outliers = np.sum(labels == -1)

        results.append({
            'eps': eps,
            'num_clusters': num_clusters,
            'outliers': outliers,
            'labels': labels
        })

    return min(results, key=lambda x: abs(x['num_clusters'] - target_clusters))

# Candidate min_samples values; the best eps is searched separately for each one.
min_samples_options = [5, 10, 15, 20, 25, 30]
euclidean_results = []

for min_samples in min_samples_options:
    # Record which min_samples produced this search result alongside the result itself.
    result = {**find_best_eps(X_std, min_samples), 'min_samples': min_samples}
    euclidean_results.append(result)
    print(f"Euclidean - min_samples={min_samples}, eps={result['eps']:.2f}, clusters={result['num_clusters']}, outliers={result['outliers']}")

# Keep the configuration whose cluster count is nearest to 2 (first one wins on ties).
best_euclidean = min(euclidean_results, key=lambda entry: abs(entry['num_clusters'] - 2))
print(f"\nBest Euclidean parameters: eps={best_euclidean['eps']:.2f}, min_samples={best_euclidean['min_samples']}")

# Feature covariance (columns are variables) and its inverse, needed for Mahalanobis distances.
cov = np.cov(X_std, rowvar=False)
inv_cov = inv(cov)

def create_mahalanobis_dist_matrix(X, inv_cov):
    """Return the symmetric matrix of pairwise Mahalanobis distances between rows of X.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points to compare.
    inv_cov : array-like of shape (n_features, n_features)
        Inverse covariance matrix that defines the Mahalanobis metric.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_samples)
        Entry [i, j] is the distance between rows i and j; the diagonal is zero.
    """
    points = np.asarray(X, dtype=float)
    n = points.shape[0]

    # With fewer than two points there are no pairs to measure.
    if n < 2:
        return np.zeros((n, n))

    # pdist computes every pair in compiled code (replacing a Python double loop
    # over all pairs); squareform expands the condensed result into the full
    # symmetric matrix with a zero diagonal.
    return squareform(pdist(points, metric='mahalanobis', VI=np.asarray(inv_cov, dtype=float)))

# Pairwise Mahalanobis distances between all rows of X_std; this matrix is what
# DBSCAN receives below when it is run with metric='precomputed'.
mahalanobis_dist_matrix = create_mahalanobis_dist_matrix(X_std, inv_cov)

# find the best eps for a precomputed (here: Mahalanobis) distance matrix
def find_best_eps_precomputed(dist_matrix, min_samples, target_clusters=2, eps_range=np.arange(0.5, 10.0, 0.5)):
    """Scan `eps_range` with DBSCAN on a precomputed distance matrix.

    Returns the dict ('eps', 'num_clusters', 'outliers', 'labels') of the run whose
    number of clusters, noise excluded, is closest to `target_clusters`; the
    earliest candidate wins on ties.
    """
    candidates = []

    for eps in eps_range:
        assignments = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed').fit_predict(dist_matrix)
        has_noise = -1 in assignments  # -1 marks noise points, not a cluster

        candidates.append({
            'eps': eps,
            'num_clusters': len(set(assignments)) - int(has_noise),
            'outliers': np.sum(assignments == -1),
            'labels': assignments,
        })

    def _gap_to_target(candidate):
        return abs(candidate['num_clusters'] - target_clusters)

    return min(candidates, key=_gap_to_target)

# Repeat the eps search for every min_samples candidate, this time on the Mahalanobis matrix.
mahalanobis_results = []

for min_samples in min_samples_options:
    result = {**find_best_eps_precomputed(mahalanobis_dist_matrix, min_samples), 'min_samples': min_samples}
    mahalanobis_results.append(result)
    print(f"Mahalanobis - min_samples={min_samples}, eps={result['eps']:.2f}, clusters={result['num_clusters']}, outliers={result['outliers']}")

# Keep the configuration whose cluster count is nearest to 2 (first one wins on ties).
best_mahalanobis = min(mahalanobis_results, key=lambda entry: abs(entry['num_clusters'] - 2))
print(f"\nBest Mahalanobis parameters: eps={best_mahalanobis['eps']:.2f}, min_samples={best_mahalanobis['min_samples']}")

# run dbscan with best parameters for euclidean
euclidean_dbscan = DBSCAN(eps=best_euclidean['eps'], min_samples=best_euclidean['min_samples'])
euclidean_labels = euclidean_dbscan.fit_predict(X_std)

# run dbscan with best parameters for mahalanobis
mahalanobis_dbscan = DBSCAN(eps=best_mahalanobis['eps'], min_samples=best_mahalanobis['min_samples'], metric='precomputed')
mahalanobis_labels = mahalanobis_dbscan.fit_predict(mahalanobis_dist_matrix)

# calculate metrics for Euclidean distance (noise points count as their own group here)
euclidean_purity = calc_purity(true_labels, euclidean_labels)
euclidean_mi = mutual_info_score(true_labels, euclidean_labels)

# filter out noise cluster (-1 label)
euclidean_filtered_idx = euclidean_labels != -1
euclidean_filtered_labels = euclidean_labels[euclidean_filtered_idx]
true_labels_euclidean_filtered = true_labels[euclidean_filtered_idx]

# calculate silhouette score on the non-noise points only
if len(set(euclidean_filtered_labels)) > 1:
    euclidean_silhouette = silhouette_score(X_std[euclidean_filtered_idx], euclidean_filtered_labels)
else:
    euclidean_silhouette = "N/A"  # Not applicable if only one cluster remains

# calculate metrics for Mahalanobis distance (noise points count as their own group here)
mahalanobis_purity = calc_purity(true_labels, mahalanobis_labels)
mahalanobis_mi = mutual_info_score(true_labels, mahalanobis_labels)

# filter out noise cluster (-1 label)
mahalanobis_filtered_idx = mahalanobis_labels != -1
mahalanobis_filtered_labels = mahalanobis_labels[mahalanobis_filtered_idx]
true_labels_mahalanobis_filtered = true_labels[mahalanobis_filtered_idx]

# NOTE: this silhouette is measured on X_std with silhouette_score's default metric,
# i.e. in the same space as the Euclidean run, not on the Mahalanobis distance matrix.
if len(set(mahalanobis_filtered_labels)) > 1:
    mahalanobis_silhouette = silhouette_score(X_std[mahalanobis_filtered_idx], mahalanobis_filtered_labels)
else:
    mahalanobis_silhouette = "N/A"  # Not applicable if only one cluster remains

# Count real clusters only: the noise label -1 is not a cluster, which matches
# how the eps search above counts clusters.
euclidean_cluster_count = len(set(euclidean_filtered_labels))
mahalanobis_cluster_count = len(set(mahalanobis_filtered_labels))

print("\nMetrics results:")
print(f"Euclidean - Purity: {euclidean_purity:.4f}, Mutual Information: {euclidean_mi:.4f}, Silhouette: {euclidean_silhouette}, Cluster Count: {euclidean_cluster_count}")
print(f"Mahalanobis - Purity: {mahalanobis_purity:.4f}, Mutual Information: {mahalanobis_mi:.4f}, Silhouette: {mahalanobis_silhouette}, Cluster Count: {mahalanobis_cluster_count}")


# Task 7 - Comparative Performance of Original vs Dimensionality Reduced Datasets

In [None]:
#########
# Task 7 - Comparative Performance of Original vs Dimensionality Reduced Datasets
#
# Purpose: Compare the original and PCA-transformed datasets using KMeans clustering.
# Takeaway (values recorded from a previous run):
#           PCA: Silhouette=0.484
#           Original: Silhouette=0.441
#           The PCA dataset produced the higher silhouette score under KMeans, suggesting a better defined cluster structure.
#########

# PCA features only: strip the ground-truth column before clustering.
X_pca = pca_dataframe.drop(columns='label')

# run KMeans over a range of k and collect silhouette score and inertia for each
def run_kmeans_and_check_quality(data_points, dataset_name, n_clusters_range=range(2, 11)):
    """Fit KMeans for every k in `n_clusters_range` and report clustering quality.

    Prints one summary line per k and returns a list of dicts with the keys
    'k', 'silhouette', 'inertia' and 'labels', in the order the k values were tried.
    """
    records = []

    for k in n_clusters_range:
        # Fixed seed and 10 restarts for every k
        clusterer = KMeans(n_clusters=k, random_state=42, n_init=10).fit(data_points)
        assignments = clusterer.labels_

        # Mean silhouette over all points for this k
        silhouette_val = silhouette_score(data_points, assignments)

        # Inertia: sum of squared distances to the closest centroid
        wobbliness = clusterer.inertia_

        records.append(dict(k=k, silhouette=silhouette_val, inertia=wobbliness, labels=assignments))

        print(f"{dataset_name} with {k} clusters - Silhouette Score: {silhouette_val:.3f}, Inertia: {wobbliness:.2f}")

    return records

# original dataset
raw_data_results = run_kmeans_and_check_quality(X_std, "Original Dataset")

# pca dataset
pca_data_results = run_kmeans_and_check_quality(X_pca, "PCA Dataset")

# Plot results: silhouette on the left, inertia on the right.
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot([r['k'] for r in raw_data_results], [r['silhouette'] for r in raw_data_results], 'o-', label='Original Dataset')
plt.plot([r['k'] for r in pca_data_results], [r['silhouette'] for r in pca_data_results], 'o-', label='PCA Dataset')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette')
plt.title('Silhouette vs. Cluster Number')
plt.legend()
plt.grid(True)

# The figure was created as a 1x2 grid but the inertia panel was never drawn.
# Inertia depends on the number of dimensions, so compare the shape of the two
# curves rather than their absolute levels.
plt.subplot(1, 2, 2)
plt.plot([r['k'] for r in raw_data_results], [r['inertia'] for r in raw_data_results], 'o-', label='Original Dataset')
plt.plot([r['k'] for r in pca_data_results], [r['inertia'] for r in pca_data_results], 'o-', label='PCA Dataset')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Inertia vs. Cluster Number')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

# find the best on silhouette score
best_k_raw = max(raw_data_results, key=lambda x: x['silhouette'])
best_k_pca = max(pca_data_results, key=lambda x: x['silhouette'])

print("\nBest results:")
print(f"Original Dataset: k={best_k_raw['k']}, Silhouette Score={best_k_raw['silhouette']:.3f}")
print(f"PCA Dataset: k={best_k_pca['k']}, Silhouette Score={best_k_pca['silhouette']:.3f}")

# determine which is better
if best_k_pca['silhouette'] > best_k_raw['silhouette']:
    print("\nPCA dataset is better, innit.")
else:
    print("\nOriginal dataset is better, mate.")



# Task 8 - Dimensionality Reduction Using t-SNE

In [None]:
#########
# Task 8 - Clustering Using t-SNE
#
# Purpose: Create a new dataset with only 4 features using t-SNE, then compare its performance using KMeans clustering.
# Takeaway (values recorded from a previous run):
#           t-SNE Dataset: Silhouette=0.400
#           Original Dataset: Silhouette=0.441
#           PCA Dataset: Silhouette=0.484
#           The t-SNE dataset had the lowest silhouette score, suggesting that KMeans could not find a good cluster structure in it.
#########


# compute a 4-component t-SNE embedding with a fixed seed
# NOTE(review): method='exact' is presumably needed because the default Barnes-Hut
# solver only supports fewer than 4 components - confirm against the scikit-learn docs.
t_sne = TSNE(n_components=4, method='exact', random_state=42)
X_t_sne = t_sne.fit_transform(X_std)

# 3D scatter of the first three embedding components
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

if 'label' not in df.columns:
    # no ground truth available: draw every point in a single series
    ax.scatter(X_t_sne[:, 0], X_t_sne[:, 1], X_t_sne[:, 2], alpha=0.7)
else:
    # one series per ground-truth label so the legend can tell them apart
    labels = df['label'].unique()
    for label in labels:
        indices = df['label'] == label
        members = X_t_sne[indices]
        ax.scatter(members[:, 0], members[:, 1], members[:, 2],
                   label=label, alpha=0.7)

ax.set_title('3D t-SNE Visualisation')
ax.set_xlabel('Comp. 1')
ax.set_ylabel('Comp. 2')
ax.set_zlabel('Comp. 3')
ax.legend()
plt.tight_layout()
plt.show()


# Evaluate KMeans on the t-SNE embedding for every candidate cluster count.
# Each k is fitted once (the sweep was previously run twice with identical settings).
# The loop uses its own variable names so it does not overwrite the `k` and
# `kmeans_cluster_labels` that belong to the Task 4 results.
distortions_list = []
silhouette_vals = []
t_sne_results = []
K_range = range(2, 11)

for n_clusters in K_range:
    kmeans_model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    tsne_cluster_labels = kmeans_model.fit_predict(X_t_sne)
    tsne_silhouette_avg = silhouette_score(X_t_sne, tsne_cluster_labels)

    distortions_list.append(kmeans_model.inertia_)
    silhouette_vals.append(tsne_silhouette_avg)
    t_sne_results.append({'k': n_clusters, 'silhouette': tsne_silhouette_avg})

# best k for the t-SNE dataset, judged by silhouette score
best_k_t_sne = max(t_sne_results, key=lambda x: x['silhouette'])


print("\nResults:")
print(f"Original Dataset: k={best_k_raw['k']}, Silhouette Score={best_k_raw['silhouette']:.3f}")
print(f"PCA Dataset: k={best_k_pca['k']}, Silhouette Score={best_k_pca['silhouette']:.3f}")
print(f"t-SNE Dataset: k={best_k_t_sne['k']}, Silhouette Score={best_k_t_sne['silhouette']:.3f}")

# find the best of the three datasets by silhouette score
best_method = max([
    ('Original', best_k_raw['silhouette']),
    ('PCA', best_k_pca['silhouette']),
    ('t-SNE', best_k_t_sne['silhouette'])
], key=lambda x: x[1])

print(f"\nThe {best_method[0]} dataset provides better clustering quality with a silhouette score of {best_method[1]:.3f}, innit.")
