In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score
from sklearn.cluster import KMeans, OPTICS
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.cluster import Birch
from tslearn.clustering import KShape
from sklearn.preprocessing import StandardScaler



from kneed import KneeLocator
from sklearn.mixture import GaussianMixture


# Task 6.2 Investigation of Microclimate Sensors Data



Load the data and display the schema

In [None]:


DATA_FILE = 'microclimate-sensors-data.csv'

# Read the raw sensor readings, then print the schema (dtypes and non-null counts).
df = pd.read_csv(DATA_FILE)
df.info()

Most of the features do not agree on the non-null row count, so preprocessing is required to impute missing values and to restrict the dataset to the rows where the SensorLocation target feature is non-null.

In [None]:
print(f"Number of non-null values in SensorLocation: {df['SensorLocation'].count()}")

print("\nMissing values in each column:")
print(df.isnull().sum())

# Restrict to rows that have both a SensorLocation (the target) and a LatLong to parse.
# .copy() makes df_clean an independent frame, so the column assignments below
# modify df_clean itself instead of a temporary slice of df.
df_clean = df.dropna(subset=['SensorLocation', 'LatLong']).copy()
print(f"\nShape after restricting to non-null SensorLocation: {df_clean.shape}")

# Split LatLong ("lat, long" text) into separate float Latitude and Longitude columns
df_clean[['Latitude', 'Longitude']] = df_clean['LatLong'].str.split(',', expand=True).astype(float)
df_clean = df_clean.drop(columns=['LatLong'])

# Column groups reused by later cells (scaling, elbow method, feature selection, PCA).
numeric_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df_clean.select_dtypes(include=['object']).columns.drop('SensorLocation', errors='ignore')

# Impute missing numeric values with the column mean.
# The result is assigned back to df_clean: calling fillna(inplace=True) on
# df_clean[col] acts on an intermediate object and is not guaranteed to
# update df_clean (it is a no-op under pandas copy-on-write).
df_clean[numeric_cols] = df_clean[numeric_cols].fillna(df_clean[numeric_cols].mean())

print("\nMissing values after imputation:")
print(df_clean.isnull().sum())


All missing values have been taken care of

In [None]:
# Print the schema (dtypes and non-null counts) of the cleaned frame.
df_clean.info()

In [None]:
# Preview the first rows of the cleaned frame.
df_clean.head()

### Evaluation Metrics Function

Here we define the evaluation metrics


In [6]:

def evaluate_clustering(true_labels, pred_labels, algorithm_name):
    """Score a clustering against ground-truth labels and print a short report.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth class label of each sample.
    pred_labels : array-like
        Cluster label assigned to each sample. The label -1 is treated as
        noise and is not counted as a cluster.
    algorithm_name : str
        Name shown in the printed report and stored in the returned dict.

    Returns
    -------
    dict
        Keys 'algorithm', 'ari', 'nmi', 'purity' and 'n_clusters'. When at
        least one sample is labelled -1 the dict also contains
        'noise_points' and 'noise_percentage'.
    """
    # Work on a plain array: `-1 in series` on a pandas Series tests the index,
    # not the values, so noise would be mis-detected for Series input.
    pred_labels = np.asarray(pred_labels)
    noise_points = int(np.count_nonzero(pred_labels == -1))
    has_noise = noise_points > 0

    print(f"Clustering Evaluation Results for {algorithm_name}:")

    # ARI
    ari = adjusted_rand_score(true_labels, pred_labels)
    print(f"Adjusted Rand Index (ARI): {ari:.4f}")

    # NMI
    nmi = normalized_mutual_info_score(true_labels, pred_labels)
    print(f"Normalised Mutual Information (NMI): {nmi:.4f}")

    # Distinct labels, minus the noise label when present
    n_clusters = len(np.unique(pred_labels)) - (1 if has_noise else 0)

    # Purity: for every predicted cluster (column) take the count of its most
    # frequent true class, then divide the sum by the number of samples.
    contingency_matrix = metrics.cluster.contingency_matrix(true_labels, pred_labels)
    purity = np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)
    print(f"Purity Score: {purity:.4f}")

    # dictionary of results
    metrics_dict = {
        'algorithm': algorithm_name,
        'ari': ari,
        'nmi': nmi,
        'purity': purity,
        'n_clusters': n_clusters
    }

    # For DBSCAN-like algorithms: report the noise points as well
    if has_noise:
        noise_percentage = noise_points / len(pred_labels) * 100
        print(f"Number of clusters (excluding noise): {n_clusters}")
        print(f"Noise points: {noise_points} ({noise_percentage:.2f}%)")
        metrics_dict['noise_points'] = noise_points
        metrics_dict['noise_percentage'] = noise_percentage
    else:
        print(f"Number of clusters: {n_clusters}")
    print()

    return metrics_dict


### Perform Kmeans Clustering Function

We are going to run k-means several times, so we create a function that runs it, evaluates it and plots the results.

In [7]:
def perform_kmeans_clustering(input_dataframe, n_clusters, algorithm_name, visualise=False, plot_title=None):
    """Cluster the numeric columns with k-means and score the result against SensorLocation.

    Parameters
    ----------
    input_dataframe : pandas.DataFrame
        Frame holding the features. Only float64/int64 columns are clustered.
        True labels are read from the 'SensorLocation' index level when the
        index is a MultiIndex, otherwise from the 'SensorLocation' column.
    n_clusters : int
        Number of clusters passed to KMeans.
    algorithm_name : str
        Label forwarded to evaluate_clustering.
    visualise : bool, default False
        When True, scatter the rows by their 'Latitude' and 'Longitude'
        columns, coloured by assigned cluster.
    plot_title : str, optional
        Title of the scatter plot.

    Returns
    -------
    dict
        The metrics dictionary produced by evaluate_clustering.
    """
    # Cluster on the numeric features only
    feature_frame = input_dataframe.select_dtypes(include=['float64', 'int64'])
    model = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = model.fit_predict(feature_frame)

    # Locate the ground-truth labels (index level or plain column)
    if isinstance(input_dataframe.index, pd.MultiIndex):
        true_labels = input_dataframe.index.get_level_values('SensorLocation')
    else:
        true_labels = input_dataframe['SensorLocation']

    print(f"\nKMeans Clustering Results (k={n_clusters}):")
    print("-" * 40)
    eval_results = evaluate_clustering(true_labels, cluster_labels, algorithm_name)

    # Nothing more to do unless a plot was requested
    if not visualise:
        return eval_results

    plt.figure(figsize=(12, 6))
    scatter = plt.scatter(
        input_dataframe['Latitude'], input_dataframe['Longitude'],
        c=cluster_labels, cmap='viridis', s=50, alpha=0.8,
    )

    # One legend entry per cluster (instead of a colorbar), using the colour
    # the scatter assigned to that cluster id.
    legend_elements = []
    for cluster_id in np.unique(cluster_labels):
        legend_elements.append(
            plt.Line2D([0], [0], marker='o', color='w',
                       markerfacecolor=scatter.cmap(scatter.norm(cluster_id)),
                       markersize=10, label=f'Cluster {cluster_id}')
        )

    plt.legend(handles=legend_elements, title='Clusters', loc='best')
    plt.title(plot_title)
    plt.xlabel('Latitude')
    plt.ylabel('Longitude')
    plt.tight_layout()
    plt.show()

    return eval_results


## Item 1 Optimal Number of Groups

Here we aim to answer what is the optimal number of groups and what effect dimensionality reduction has on clustering.

### Item 1a - Unique Number of Target Classes

Since we have the ground truth values in a categorical value already, the ideal number of groups would be the unique number of 'sensor location' values

In [None]:

# Tally the readings per sensor location and count the distinct locations.
location_counts = df_clean['SensorLocation'].value_counts()
unique_locations = df_clean['SensorLocation'].nunique()

print(f"Number of unique sensor locations: {unique_locations}")
print("\nUnique sensor locations and their counts:")
print(location_counts)

# Bar chart of readings per location
plt.figure(figsize=(12, 6))
location_counts.plot.bar()
plt.title('Distribution of Sensor Locations')
plt.xlabel('Sensor Location')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


It can be seen from the target feature value count that the target is imbalanced. Not a problem in clustering but for classification this would have to be handled

**Dataset Scaling**
As clustering algorithms rely on distance metrics, we need to ensure that all numeric variables are standardised.

In [None]:

# Standardise the numeric columns: learn the scaling from them, then overwrite
# them in df_clean with their scaled values.
scaler = StandardScaler().fit(df_clean[numeric_cols])
df_clean[numeric_cols] = scaler.transform(df_clean[numeric_cols])


In [10]:
# List to hold results for comparison: later cells append one metrics dict per algorithm
results = []


### Item 1a - Optimal Cluster Count via Elbow Method

Here we use the elbow method to find the optimal number of clusters. The elbow method uses the within cluster sum of squares to find the optimal number of clusters.

In [None]:

max_clusters = 15
cluster_range = range(1, max_clusters + 1)

# Start from an empty list on every run so the cell works on a fresh kernel and
# re-running it does not keep appending to an earlier list.
wcss = []
for i in cluster_range:
    kmeans = KMeans(n_clusters=i, init='random', max_iter=300, n_init=10, random_state=42)
    kmeans.fit(df_clean[numeric_cols])
    wcss.append(kmeans.inertia_)
# NOTE: after the loop `kmeans` is the model fitted with k = max_clusters.

# Locate the elbow of the WCSS curve. KneeLocator reports None when it cannot
# find one, so fall back to a simple heuristic in that case (an ImportError can
# never be raised here because kneed is imported at the top of the notebook).
kl = KneeLocator(cluster_range, wcss, curve='convex', direction='decreasing')
optimal_k = kl.elbow
if optimal_k is None:
    # diffs[j] is the WCSS drop when going from j + 1 to j + 2 clusters.
    diffs = [wcss[i - 1] - wcss[i] for i in range(1, len(wcss))]
    threshold = sum(diffs) / len(diffs) / 2
    significant_drops = [d for d in diffs if d > threshold]
    # Pick the k reached by the smallest drop that is still above half the mean drop.
    optimal_k = diffs.index(min(significant_drops)) + 2 if significant_drops else 1

#Plot the results
plt.figure(figsize=(12, 6))
plt.plot(cluster_range, wcss, marker='o', linestyle='-')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.grid(True)
plt.xticks(cluster_range)

plt.axvline(x=unique_locations, color='r', linestyle='--', 
            label=f'Ground Truth: {unique_locations} (SensorLocation)')
plt.axvline(x=optimal_k, color='g', linestyle='--', 
            label=f'Optimal Elbow: {optimal_k}')
plt.legend()
plt.tight_layout()
plt.show()

print(f"Optimal number of clusters determined by elbow method: {optimal_k}")
print(f"Ground truth number of clusters (unique SensorLocation values): {unique_locations}")




The optimal kmeans cluster count obtained by the elbow method was 6. Compared to the ground truth count of 11. (the number of unique classes in sensor location)


Next, perform k-means clustering on all features using both the elbow-method cluster count and the ground-truth count.

In [None]:

# k-means on df_clean with the cluster count found by the elbow method.
print(f"K-means with {optimal_k} clusters (optimal from elbow method):")
perform_kmeans_clustering(df_clean, n_clusters=optimal_k, algorithm_name='K-means')

# Same frame, with k fixed at 11 (the ground-truth count).
print("\nK-means with 11 clusters (ground truth):")
perform_kmeans_clustering(df_clean, n_clusters=11, algorithm_name='K-means 11 clusters')


It can be seen from this that the ground-truth count of 11 clusters performed better than the elbow-method count of 6.

### Item 1b

Feature Reduction's Effect on Clustering Performance

### Automatic feature selection

Perform automatic feature selection to find the best features for clustering.

In [None]:
# Automatic Feature Selection for K-means Clustering

# We'll use SelectKBest with mutual_info_classif to select the most informative features


def evaluate_feature_subset(X, n_features, n_clusters=11):
    """Select the top features, cluster on them with k-means and score the result.

    Features are ranked with SelectKBest using mutual information against the
    labels of the module-level `kmeans` model (``kmeans.labels_``), i.e.
    whichever KMeans model was last assigned to that name before this
    function is called. The clustering is scored against the module-level
    ``df_clean['SensorLocation']``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns, row-aligned with ``df_clean``.
    n_features : int
        Number of top-ranked features to keep.
    n_clusters : int, default 11
        Number of k-means clusters fitted on the selected features.

    Returns
    -------
    tuple
        ``(score, selected_features)``: the Adjusted Rand Index and the list
        of selected column names.
    """
    # mutual_info_classif is randomised; seed it so the feature ranking (and
    # therefore the chosen subset) is the same on every run.
    def seeded_mutual_info(features, target):
        return mutual_info_classif(features, target, random_state=42)

    # Select top n features
    selector = SelectKBest(score_func=seeded_mutual_info, k=n_features)
    X_selected = selector.fit_transform(X, kmeans.labels_)

    # Perform k-means clustering
    kmeans_subset = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans_subset.fit_predict(X_selected)

    # Calculate ARI score
    score = adjusted_rand_score(df_clean['SensorLocation'], labels)

    # Get selected feature names
    selected_features = X.columns[selector.get_support()].tolist()

    return score, selected_features

# Score every subset size from 2 features up to the full numeric feature set.
feature_range = range(2, len(numeric_cols) + 1)
subset_results = [
    evaluate_feature_subset(df_clean[numeric_cols], n_features)
    for n_features in feature_range
]
scores = [subset_score for subset_score, _ in subset_results]

# Keep the first subset that reaches the highest ARI (ties go to the smaller subset).
best_score, best_features = max(subset_results, key=lambda result: result[0], default=(-1, None))

# ARI as a function of the number of selected features
plt.figure(figsize=(12, 6))
plt.plot(feature_range, scores, marker='o')
plt.xlabel('Number of Features')
plt.ylabel('Adjusted Rand Index')
plt.title('Feature Selection Impact on Clustering Performance')
plt.grid(True)
plt.show()

print(f"Best ARI score: {best_score:.3f}")
print(f"Best performing features ({len(best_features)}):")
for feature in best_features:
    print(f"- {feature}")

# Refit k-means on the winning subset and report its ARI.
X_best = df_clean[best_features]
final_kmeans = KMeans(n_clusters=11, random_state=42)
final_labels = final_kmeans.fit_predict(X_best)
final_ari = adjusted_rand_score(df_clean['SensorLocation'], final_labels)
print("\nFinal clustering performance metrics:")
print(f"ARI Score: {final_ari:.3f}")


Automatic feature selection showed that Longitude and Latitude were the most important features for clustering. It achieved an ARI score of 1, the highest possible score using Kmeans clustering.

From here on, we will use only the latitude and longitude features for clustering.

In [None]:
# Coordinate columns that the later clustering cells use as features.
numeric_cols_best = ['Latitude', 'Longitude']

# Selected features plus the target column.
# NOTE(review): later cells index df_best by numeric_cols_best, so this assumes
# best_features contains Latitude and Longitude; verify after re-running the selection.
df_best = df_clean[[*best_features, 'SensorLocation']]
df_best.info()


Perform kmeans on the dataframe with only latitude and longitude as features

In [None]:
# k-means on the reduced frame; keep its metrics for the final comparison table.
kmeans_best_metrics = perform_kmeans_clustering(
    df_best,
    n_clusters=11,
    algorithm_name='K-means 11 clusters',
    visualise=True,
    plot_title='K-means 11 clusters, latitude and longitude to SensorLocation Cluster ',
)
results.append(kmeans_best_metrics)

A perfect score of 1 was achieved across all metrics.

### Principal Component Analysis
Perform PCA to reduce the dimensionality of the data. We want to see how it compares to automatic feature selection (which produced perfect clustering results). 4 Components will be used.

In [None]:

# Project the numeric features onto four principal components.
pca = PCA(n_components=4)
pca_result = pca.fit_transform(df_clean[numeric_cols])

# Wrap the components in a frame and attach the target positionally (.values),
# since the new frame has a fresh index rather than df_clean's.
pca_df = pd.DataFrame(pca_result, columns=[f'PC{k}' for k in range(1, 5)])
pca_df['SensorLocation'] = df_clean['SensorLocation'].values

explained_ratio = pca.explained_variance_ratio_
print("Explained variance ratio:")
print(explained_ratio)
print("\nCumulative explained variance ratio:")
print(explained_ratio.cumsum())


The cumulative explained variance is 61.8% which is on the lower end of the spectrum. Use the principal components to perform kmeans clustering.

In [None]:
# k-means with 11 clusters on the principal components.
perform_kmeans_clustering(pca_df, n_clusters=11, algorithm_name='kmeans pca')

An ARI score of 0.07 is quite poor. PCA did not improve the clustering results and was worse than using the original features.

## Item 2

In this section we will try alternative clustering algorithms, other than k-means and shape-based algorithms, to see how they perform.

### Gaussian Mixture Model

This is a probabilistic model that assumes all the data points are generated from a mixture of several Gaussian distributions.

In [None]:
# Fit an 11-component Gaussian mixture on the coordinate features and label each row.
gmm = GaussianMixture(n_components=11, random_state=42)
gmm_labels = gmm.fit_predict(df_best[numeric_cols_best])

print("Gaussian Mixture Model Clustering Results:")
gmm_metrics = evaluate_clustering(df_best['SensorLocation'], gmm_labels, 'Gaussian Mixture Model')
results.append(gmm_metrics)

unique_clusters = np.unique(gmm_labels)
num_clusters = len(unique_clusters)

plt.figure(figsize=(12, 6))
scatter = plt.scatter(
    df_best['Latitude'], df_best['Longitude'],
    c=gmm_labels, cmap='viridis', s=50, alpha=0.8,
)

# One legend entry per cluster, in the colour the scatter uses for that label.
legend_elements = []
for cluster_id in unique_clusters:
    legend_elements.append(
        plt.Line2D([0], [0], marker='o', color='w',
                   markerfacecolor=scatter.cmap(scatter.norm(cluster_id)),
                   markersize=10, label=f'Cluster {cluster_id}')
    )

plt.legend(handles=legend_elements, title='Clusters', loc='best')
plt.title('Gaussian Mixture Model 11 Clusters, latitude and longitude to SensorLocation Cluster ')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.tight_layout()
plt.show()




The GMM clustering algorithm performed perfectly, as all metrics achieved the highest possible score.

BIRCH

BIRCH is a clustering algorithm that uses a tree structure to cluster data.


In [None]:
# BIRCH on the coordinate features, asking for 11 final clusters.
# NOTE(review): if fewer than 11 clusters come back, threshold=0.5 may be too
# coarse for these features; confirm against the Birch documentation.
birch = Birch(n_clusters=11, threshold=0.5, branching_factor=50)
cluster_labels = birch.fit_predict(df_best[numeric_cols_best])

birch_metrics = evaluate_clustering(df_best['SensorLocation'], cluster_labels, 'Birch')
results.append(birch_metrics)

unique_clusters = np.unique(cluster_labels)
num_clusters = len(unique_clusters)

plt.figure(figsize=(12, 6))
scatter = plt.scatter(
    df_best['Latitude'], df_best['Longitude'],
    c=cluster_labels, cmap='viridis', s=50, alpha=0.8,
)

# One legend entry per cluster, in the colour the scatter uses for that label.
legend_elements = []
for cluster_id in unique_clusters:
    legend_elements.append(
        plt.Line2D([0], [0], marker='o', color='w',
                   markerfacecolor=scatter.cmap(scatter.norm(cluster_id)),
                   markersize=10, label=f'Cluster {cluster_id}')
    )

plt.legend(handles=legend_elements, title='Clusters', loc='best')
plt.title(f'BIRCH {num_clusters} Clusters, latitude and longitude to SensorLocation Cluster ')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.tight_layout()
plt.show()


BIRCH performed poorly, with a low ARI score of 0.3115 and a low cluster count of 4.

## Item 3 Shape Based Clustering

In this section we will try shape based clustering algorithms.

### K-Shape

K-Shape is a clustering algorithm that finds the best match for each cluster by using the shape of the data.


In [None]:
# KShape is given a 3-D array here: add a trailing axis of length 1 so the data
# has shape (rows, number of coordinate features, 1).
X = df_best[numeric_cols_best].to_numpy()[:, :, np.newaxis]

ks = KShape(n_clusters=11, random_state=42)
cluster_labels = ks.fit_predict(X)

kshape_metrics = evaluate_clustering(df_best['SensorLocation'], cluster_labels, 'K-Shape')
results.append(kshape_metrics)

unique_clusters = np.unique(cluster_labels)
num_clusters = len(unique_clusters)

plt.figure(figsize=(12, 6))
scatter = plt.scatter(
    df_best['Latitude'], df_best['Longitude'],
    c=cluster_labels, cmap='viridis', s=50, alpha=0.8,
)

# One legend entry per cluster, in the colour the scatter uses for that label.
legend_elements = []
for cluster_id in unique_clusters:
    legend_elements.append(
        plt.Line2D([0], [0], marker='o', color='w',
                   markerfacecolor=scatter.cmap(scatter.norm(cluster_id)),
                   markersize=10, label=f'Cluster {cluster_id}')
    )

plt.legend(handles=legend_elements, title='Clusters', loc='best')
plt.title(f'K-Shape {num_clusters} Clusters, latitude and longitude to SensorLocation Cluster')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.tight_layout()
plt.show()

K-Shape performed quite well with an ARI score of 0.7619, considering that it only found 8 clusters. Possibly this score would have been higher if more clusters were used.

### OPTICS

OPTICS is a density-based algorithm, an improvement on DBSCAN.


In [None]:

# OPTICS on the coordinate features; samples it cannot assign get the label -1.
optics = OPTICS(min_samples=5, xi=0.05, min_cluster_size=10)
cluster_labels = optics.fit_predict(df_best[numeric_cols_best])

results.append(evaluate_clustering(df_best['SensorLocation'], cluster_labels, 'OPTICS'))


unique_clusters = np.unique(cluster_labels)
num_clusters = len(unique_clusters[unique_clusters >= 0])  # Exclude noise points (-1)

# Draw clustered points and noise points separately. Plotting everything in one
# colormapped scatter would give the noise label (-1) the colormap's lowest
# colour, which would not match the light-grey 'Noise' legend entry below.
is_noise = cluster_labels == -1
latitude = df_best['Latitude'].to_numpy()
longitude = df_best['Longitude'].to_numpy()

plt.figure(figsize=(12, 6))
scatter = plt.scatter(latitude[~is_noise], longitude[~is_noise],
                     c=cluster_labels[~is_noise], cmap='viridis', s=50, alpha=0.8)
if is_noise.any():
    plt.scatter(latitude[is_noise], longitude[is_noise],
                color='lightgrey', s=50, alpha=0.8)

# Add legend for each cluster, plus a dedicated entry for noise if present
legend_elements = []
for i in unique_clusters:
    if i >= 0:
        legend_elements.append(plt.Line2D([0], [0], marker='o', color='w', 
                              markerfacecolor=scatter.cmap(scatter.norm(i)), 
                              markersize=10, label=f'Cluster {i}'))
    elif i == -1:  # Special entry for noise points, matching their plotted colour
        legend_elements.append(plt.Line2D([0], [0], marker='o', color='w', 
                              markerfacecolor='lightgrey', 
                              markersize=10, label='Noise'))

plt.legend(handles=legend_elements, title='Clusters', loc='best')
plt.title('OPTICS Clustering - Geographical Distribution')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.tight_layout()
plt.show()




OPTICS performed extremely well considering it found 12 clusters. Its ARI score was just shy of perfect coming in at 0.9717

## Item 4 Best Solution

Here we will compare the performance of all the clustering algorithms.

In [None]:

# One row per algorithm, built from the metric dictionaries gathered in `results`.
results_df = pd.DataFrame(results)

print("Clustering Algorithm Performance Comparison:")
print(results_df.to_string(index=False))

# Rank the algorithms by ARI, highest first.
ranked_by_ari = results_df.sort_values(by='ari', ascending=False)
print("\nAlgorithms sorted by ARI (best to worst):")
print(ranked_by_ari.to_string(index=False))



Kmeans and GMM performed the best with the highest ARI scores. OPTICS was a close third though.

## Item 5 Relationship Amongst Independent Variables

Produce a correlation matrix to see the relationship between the independent variables.

In [None]:


# Pairwise correlations of everything except the target and identifier/time columns.
feature_frame = df_clean.drop(columns=['SensorLocation', 'Device_id', 'Time'])
correlation_matrix = feature_frame.corr()

# Heatmap of the correlation matrix
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, fmt='.2f')
plt.title('Correlation Matrix of Environmental Variables', fontsize=16)
plt.tight_layout()
plt.show()

print("Correlation Matrix:")
print(correlation_matrix)

# Collect each unordered pair once (upper triangle, diagonal excluded), then
# order the pairs by absolute correlation, strongest first.
column_names = correlation_matrix.columns
n_columns = len(column_names)
corr_pairs = sorted(
    (
        (column_names[row], column_names[col], correlation_matrix.iloc[row, col])
        for row in range(n_columns)
        for col in range(row + 1, n_columns)
    ),
    key=lambda pair: abs(pair[2]),
    reverse=True,
)

print("\nTop 10 Strongest Correlations:")
for var1, var2, corr in corr_pairs[:10]:
    print(f"{var1} and {var2}: {corr:.3f}")


Some of the independent variables had strong positive and negative correlations. This is discussed in the report.