In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans



# Task 1 - Data Preprocessing and Exploratory Data Analysis

We perform the following steps:
1. Load the dataset ("Dataset.csv") and verify its integrity.
2. Confirm that there are no missing values.
3. Identify and analyze outliers using visualizations such as boxplots.
4. Visualize feature distributions with histograms and KDE plots to understand the
overall distribution of each feature.
5. Review feature statistics (e.g., mean, standard deviation) to get insights into the
data.
6. Normalize or standardize the dataset so that all features contribute equally in
distance calculations, which is crucial for clustering.

### Subtask 1: Load the dataset ("Dataset.csv") and verify its integrity.

Manual inspection of the dataset determined that there are 900 rows (excluding the header row) and 8 columns. Therefore, to satisfy the integrity requirement, we check that the row and column counts of the loaded dataframe match these expected values.







In [None]:
# Read the raw CSV into a dataframe and report its dimensions.
df = pd.read_csv("Dataset.csv")
rows, cols = df.shape
print(f"Dataset shape: {rows} rows, {cols} columns")

# Integrity check: the file was inspected by hand and is expected to hold
# exactly 900 data rows and 8 columns. Abort the notebook if the loaded
# dataframe disagrees with either figure.
assert rows == 900, "The number of rows in the dataset is not equal to 900"
assert cols == 8, "The number of columns in the dataset is not equal to 8"

print("Dataset integrity verified")

### Subtask 2: Confirm that there are no missing values.
Count the number of missing values in each column and throw an error if any are found.

In [None]:
# Number of null cells in each column of the loaded dataframe.
missing_values_count = df.isnull().sum()

# Any null anywhere in the frame is treated as a hard failure.
total_missing = missing_values_count.sum()
assert total_missing == 0, "The dataset contains missing values!!!! FIX"
print("Good, No missing values")

### Subtask 3: Identify and analyze outliers using visualizations such as boxplots.
We draw a boxplot for each numerical feature to identify and analyze outliers, then calculate and display statistics about the potential outliers. This is done by computing the interquartile range (IQR) and using it to derive the lower and upper bounds beyond which a value is treated as an outlier.
The label is categorical so not included in outlier detection.


In [None]:

sns.set_palette('viridis') # set colour scheme

# Get numerical features from the dataset
numerical_features = df.select_dtypes(include=[np.number]).columns

# Size the subplot grid from the number of features instead of assuming
# that at most nine features exist (a fixed 3x3 grid raises for a tenth).
n_grid_cols = 3
n_grid_rows = max(1, int(np.ceil(len(numerical_features) / n_grid_cols)))

# Create boxplots for each numerical feature
plt.figure(figsize=(16, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.boxplot(y=df[feature])
    plt.title(f'Boxplot of {feature}')

# Lay the panels out once, after they have all been drawn, then reserve
# space at the top of the figure for the overall title.
plt.tight_layout()
plt.suptitle('Boxplots for Numerical Features to Identify Outliers', fontsize=16)
plt.subplots_adjust(top=0.9)
plt.show()

# Calculate and display statistics about outliers
print("Potential outliers analysis:")
for feature in numerical_features:
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)

    # Standard IQR rule: a value is an outlier when it lies more than
    # 1.5 * IQR below Q1 or more than 1.5 * IQR above Q3.
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outlier_mask = (df[feature] < lower_bound) | (df[feature] > upper_bound)
    outliers = df.loc[outlier_mask, feature]
    print(f"{feature}: {len(outliers)} outliers detected")
    if len(outliers) > 0:
        # Report the thresholds and the observed outlier values separately;
        # the smallest/largest outlier is not the same thing as a boundary.
        print(f"  - IQR bounds: [{lower_bound:.2f}, {upper_bound:.2f}]")
        print(f"  - Outlier values range: [{outliers.min():.2f}, {outliers.max():.2f}]")


### Subtask 4: Visualise feature distributions with histograms and KDE plots to understand the overall distribution of each feature.

Seaborn provides separate functions for histograms (`histplot`) and KDE plots (`kdeplot`); we use both to plot every numerical feature.

In [None]:

def _plot_feature_grid(data, features, plot_fn, title_prefix, suptitle,
                       n_cols=3, figsize=(16, 12)):
    """Draw one panel per feature on a grid sized to fit all of them.

    Parameters
    ----------
    data : DataFrame holding the columns to plot.
    features : iterable of column names, one panel each.
    plot_fn : callable taking a single column and drawing on the current axes.
    title_prefix : text placed before " of <feature>" in every panel title.
    suptitle : overall figure title.
    n_cols : number of panels per row; rows are derived from the feature count.
    figsize : figure size passed to ``plt.figure``.
    """
    features = list(features)
    # Ceiling division so a partially filled last row still gets allocated.
    n_rows = max(1, -(-len(features) // n_cols))

    plt.figure(figsize=figsize)
    for position, feature in enumerate(features, start=1):
        plt.subplot(n_rows, n_cols, position)
        plot_fn(data[feature])
        plt.title(f'{title_prefix} of {feature}')

    # Lay out once after all panels exist, then leave room for the suptitle.
    plt.tight_layout()
    plt.suptitle(suptitle, fontsize=16)
    plt.subplots_adjust(top=0.9)
    plt.show()


numerical_features = df.select_dtypes(include=[np.number]).columns # list of numerical features

# Display histograms for each numerical feature
_plot_feature_grid(
    df,
    numerical_features,
    sns.histplot,
    'Histogram',
    'Feature Distributions with Histograms',
)

# Create KDE plots for each numerical feature
_plot_feature_grid(
    df,
    numerical_features,
    lambda series: sns.kdeplot(series, fill=True),
    'KDE Plot',
    'Feature Distributions with KDE Plots',
)





All features are skewed to either the left or right

### Subtask 5 - Review feature statistics (e.g., mean, standard deviation) to get insights into the data.

In [None]:
# Numeric-only view of the data, reused by every statistic below.
numeric_df = df.select_dtypes(include=[np.number])

print("Basic Statistics for Numerical Features via Pandas Dataframe describe:")
display(df.describe())

# Statistics that describe() does not report, one row per numeric feature.
print("\nAdditional Statistics:")
upper_quartile = numeric_df.quantile(0.75)
lower_quartile = numeric_df.quantile(0.25)
numerical_stats = pd.DataFrame({
    'Median': numeric_df.median(),
    'Skewness': numeric_df.skew(),
    'Kurtosis': numeric_df.kurt(),
    'IQR': upper_quartile - lower_quartile,
    'Range': numeric_df.max() - numeric_df.min(),
})
display(numerical_stats)

# Pairwise correlation between the numeric features.
print("\nCorrelation Matrix:")
correlation_matrix = numeric_df.corr()
display(correlation_matrix)

# Render the correlation matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Features')
fig.tight_layout()
plt.show()


It can be seen that lengths and areas are highly correlated, which is expected as area is a function of length.

### Subtask 6 - Normalize or standardize the dataset so that all features contribute equally in distance calculations, which is crucial for clustering.

For every numeric feature, we will normalize it to a range of 0 to 1.

In [None]:

# Names of every numeric column; these are the columns that get rescaled.
numerical_columns = list(df.select_dtypes(include=[np.number]).columns)

# Fit a min-max scaler on the numeric columns and overwrite them in `df`
# with the scaled values, so later cells read the normalized data from `df`.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[numerical_columns])
df[numerical_columns] = scaled_values

df.head()


# Task 2 - Impact of the Number of Clusters on KMeans Clustering with Euclidean Distance

The subtasks for this task are:
1. Apply KMeans clustering (using Euclidean distance) on the standardized dataset.
2. For a range of cluster numbers (e.g., from 1 to 10), compute the inertia (SSE) and plot
these values to identify the “elbow” point.

In [None]:

inertia_values = []
cluster_range = range(1, 11)

# Fit KMeans for every cluster count from 1 to 10 and record the inertia
# (within-cluster sum of squared distances) of each fitted model.
for k in cluster_range:
    kmeans_model = KMeans(n_clusters=k, random_state=42)
    kmeans_model.fit(df[numerical_columns])
    inertia_values.append(kmeans_model.inertia_)

# Plot inertia against cluster count to look for the "elbow".
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, inertia_values, 'bo-')
plt.xlabel('Cluster Count')  # closing quote was missing, which made the cell a SyntaxError
plt.ylabel('Inertia')
plt.title('Kmeans Inertia for different cluster counts')
plt.grid(True)
plt.xticks(cluster_range)
plt.show()



From the plot above, the elbow appears at 5 clusters: beyond that point the inertia decreases more slowly than it does for smaller cluster counts.

# Task 3 - Evaluating the Stability of KMeans and KMeans++ Initialization

Subtasks are:
1. Run KMeans clustering 50 times using two initialization methods:
    - Standard random initialization.
    - KMeans++ initialization.
2. Compute and compare the average inertia (SSE) and the Silhouette Score for each
method over these iterations.

In [None]:
from sklearn.metrics import silhouette_score

# Set parameters
n_iterations = 50
n_clusters = 5  # Using 5 clusters based on the elbow method from previous task
random_inertias = []
random_silhouette_scores = []
kmeans_plus_inertias = []
kmeans_plus_silhouette_scores = []

# Run both initialisation strategies once per iteration, seeded by the
# iteration index so the experiment is reproducible.
#
# n_init is pinned to 1 for both methods. Left at the library default, the
# number of restarts per fit depends on the scikit-learn version and, with
# n_init='auto', on the init method itself, so the two strategies would not
# be compared on equal terms. Best-of-several restarts would also hide the
# run-to-run variation this experiment is trying to measure.
for i in range(n_iterations):
    # Standard random initialisation
    kmeans_random = KMeans(n_clusters=n_clusters, init='random', n_init=1, random_state=i)
    kmeans_random.fit(df[numerical_columns])
    random_inertias.append(kmeans_random.inertia_)

    # Calculate silhouette score for random init
    labels_random = kmeans_random.labels_
    random_silhouette_scores.append(silhouette_score(df[numerical_columns], labels_random))

    # KMeans++ initialisation
    kmeans_plus = KMeans(n_clusters=n_clusters, init='k-means++', n_init=1, random_state=i)
    kmeans_plus.fit(df[numerical_columns])
    kmeans_plus_inertias.append(kmeans_plus.inertia_)

    # Calculate silhouette score for kmeans++
    labels_plus = kmeans_plus.labels_
    kmeans_plus_silhouette_scores.append(silhouette_score(df[numerical_columns], labels_plus))

# Calculate average metrics
avg_random_inertia = np.mean(random_inertias)
avg_random_silhouette = np.mean(random_silhouette_scores)
avg_kmeans_plus_inertia = np.mean(kmeans_plus_inertias)
avg_kmeans_plus_silhouette = np.mean(kmeans_plus_silhouette_scores)

# Calculate standard deviations to assess stability: a smaller spread across
# the 50 seeds means the method is less sensitive to its starting centroids.
std_random_inertia = np.std(random_inertias)
std_random_silhouette = np.std(random_silhouette_scores)
std_kmeans_plus_inertia = np.std(kmeans_plus_inertias)
std_kmeans_plus_silhouette = np.std(kmeans_plus_silhouette_scores)

# Display results
print("Standard Random Initialisation:")
print(f"Average Inertia: {avg_random_inertia:.2f} (±{std_random_inertia:.2f})")
print(f"Average Silhouette Score: {avg_random_silhouette:.4f} (±{std_random_silhouette:.4f})")
print("\nKMeans++ Initialisation:")
print(f"Average Inertia: {avg_kmeans_plus_inertia:.2f} (±{std_kmeans_plus_inertia:.2f})")
print(f"Average Silhouette Score: {avg_kmeans_plus_silhouette:.4f} (±{std_kmeans_plus_silhouette:.4f})")

# Plot the distribution of both metrics, overlaying the two methods
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.hist(random_inertias, alpha=0.7, label='Random Init')
plt.hist(kmeans_plus_inertias, alpha=0.7, label='KMeans++ Init')
plt.xlabel('Inertia')
plt.ylabel('Frequency')
plt.title('Distribution of Inertia Values')
plt.legend()

plt.subplot(1, 2, 2)
plt.hist(random_silhouette_scores, alpha=0.7, label='Random Init')
plt.hist(kmeans_plus_silhouette_scores, alpha=0.7, label='KMeans++ Init')
plt.xlabel('Silhouette Score')
plt.ylabel('Frequency')
plt.title('Distribution of Silhouette Scores')
plt.legend()

plt.tight_layout()
plt.show()
