In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.decomposition import PCA


from kneed import KneeLocator


# Task 6.2 Investigation of Microclimate Sensors Data



Load the data and display the schema

In [None]:


# Read the raw sensor readings from a CSV file located in the current working directory
df = pd.read_csv('microclimate-sensors-data.csv')

# Print the schema summary (columns, dtypes and non-null counts) of the raw frame
df.info()

Most features have different non-null row counts, so preprocessing is required: we restrict the dataset to the rows where the SensorLocation target feature is non-null and impute the remaining missing values.

In [None]:
print(f"Number of non-null values in SensorLocation: {df['SensorLocation'].count()}")

print("\nMissing values in each column:")
print(df.isnull().sum())

# Keep only rows that have both a SensorLocation label and a LatLong value
# (LatLong is parsed into coordinates below). The explicit .copy() gives
# df_clean its own data, so the column assignments that follow modify
# df_clean itself rather than a temporary slice of df.
df_clean = df.dropna(subset=['SensorLocation', 'LatLong']).copy()
print(f"\nShape after restricting to non-null SensorLocation: {df_clean.shape}")

# Split LatLong into separate Latitude and Longitude columns
df_clean[['Latitude', 'Longitude']] = df_clean['LatLong'].str.split(',', expand=True).astype(float)
df_clean = df_clean.drop(columns=['LatLong'])

# Drop all columns except SensorLocation, Latitude, and Longitude
#df_clean = df_clean[['SensorLocation', 'Latitude', 'Longitude']]



# Approach for missing values differs for numeric and categorical columns
numeric_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns
# SensorLocation is the target label, so it is excluded from imputation
categorical_cols = df_clean.select_dtypes(include=['object']).columns.drop('SensorLocation', errors='ignore')

# Impute numeric gaps with the column mean. The result is assigned back to the
# column: fillna(..., inplace=True) on a column selection is chained assignment
# and does not update df_clean when pandas Copy-on-Write is active.
for col in numeric_cols:
    if df_clean[col].isnull().any():
        df_clean[col] = df_clean[col].fillna(df_clean[col].mean())

# Impute categorical gaps with the most frequent value
for col in categorical_cols:
    if df_clean[col].isnull().any():
        modes = df_clean[col].mode()
        if not modes.empty:  # mode() is empty when every value in the column is missing
            df_clean[col] = df_clean[col].fillna(modes.iloc[0])

# Check the results
print("\nMissing values after imputation:")
print(df_clean.isnull().sum())


In [None]:
# Schema summary of the cleaned frame (non-null counts should now agree across columns)
df_clean.info()

In [None]:
# Preview the first rows of the cleaned frame
df_clean.head()

### Evaluation Metrics Function

Here we define the evaluation metrics


In [17]:

def evaluate_clustering(true_labels, pred_labels, features=None):
    """
    Print and return external clustering metrics: ARI, NMI and purity.

    Labels equal to -1 in ``pred_labels`` are treated as noise (DBSCAN
    convention): they are excluded from the reported cluster count and
    reported separately.

    Args:
        true_labels: Ground-truth class label for every sample.
        pred_labels: Cluster label assigned to every sample (array-like).
        features: Optional feature matrix the clustering was computed on.
            When given, the silhouette score is computed as well. It is
            skipped by default because it needs pairwise distances between
            all samples, which is expensive on large datasets.

    Returns:
        dict with keys 'ari', 'nmi', 'purity' and 'n_clusters'; plus
        'noise_points' and 'noise_percentage' when noise labels are present,
        and 'silhouette' when ``features`` is given and the score is defined.
    """
    # Work on a plain array: `-1 in series` on a pandas Series tests the
    # index rather than the values, which would hide noise points.
    pred_array = np.asarray(pred_labels)

    print("Clustering Evaluation Results:")

    # ARI
    ari = adjusted_rand_score(true_labels, pred_array)
    print(f"Adjusted Rand Index (ARI): {ari:.4f}")

    # NMI
    nmi = normalized_mutual_info_score(true_labels, pred_array)
    print(f"Normalized Mutual Information (NMI): {nmi:.4f}")

    # Noise bookkeeping is done once and reused below
    noise_points = int(np.sum(pred_array == -1))
    has_noise = noise_points > 0
    distinct_labels = np.unique(pred_array)
    n_clusters = len(distinct_labels) - (1 if has_noise else 0)

    # Purity: each cluster votes for its majority true class
    contingency_matrix = metrics.cluster.contingency_matrix(true_labels, pred_array)
    purity = np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)
    print(f"Purity Score: {purity:.4f}")

    metrics_dict = {
        'ari': ari,
        'nmi': nmi,
        'purity': purity,
        'n_clusters': n_clusters
    }

    # For DBSCAN-like: print number of clusters and noise points
    if has_noise:
        noise_percentage = noise_points / len(pred_array) * 100
        print(f"Number of clusters (excluding noise): {n_clusters}")
        print(f"Noise points: {noise_points} ({noise_percentage:.2f}%)")
        metrics_dict['noise_points'] = noise_points
        metrics_dict['noise_percentage'] = noise_percentage
    else:
        print(f"Number of clusters: {n_clusters}")

    # Silhouette is only defined for 2 <= n_labels <= n_samples - 1
    if features is not None and 2 <= len(distinct_labels) <= len(pred_array) - 1:
        silhouette_avg = silhouette_score(features, pred_array)
        print(f"Silhouette Score: {silhouette_avg:.4f}")
        metrics_dict['silhouette'] = silhouette_avg

    print()

    return metrics_dict


### Perform Kmeans Clustering Function

In [18]:
def perform_kmeans_clustering(input_dataframe, n_clusters, random_state=42, n_init=10):
    """
    Performs KMeans clustering on the input dataframe and evaluates the results.

    Only float64/int64 columns are used as features. The ground-truth labels
    are read from the 'SensorLocation' index level (MultiIndex) or column.

    Args:
        input_dataframe: DataFrame containing the features to cluster
        n_clusters: Number of clusters to create
        random_state: Seed for centroid initialisation
        n_init: Number of KMeans restarts. Set explicitly because the
            scikit-learn default differs between versions; 10 matches the
            elbow-method cell.

    Returns:
        (cluster_labels, kmeans): the label assigned to every row and the
        fitted KMeans estimator.

    Raises:
        ValueError: if the dataframe has no float64/int64 columns.
    """
    # Get numeric columns only
    feature_cols = input_dataframe.select_dtypes(include=['float64', 'int64']).columns
    if len(feature_cols) == 0:
        raise ValueError("input_dataframe has no float64/int64 columns to cluster on")

    # Initialize and fit KMeans on numeric columns only
    kmeans = KMeans(n_clusters=n_clusters, n_init=n_init, random_state=random_state)
    cluster_labels = kmeans.fit_predict(input_dataframe[feature_cols])

    # Get true labels from SensorLocation
    if isinstance(input_dataframe.index, pd.MultiIndex):
        true_labels = input_dataframe.index.get_level_values('SensorLocation')
    else:
        true_labels = input_dataframe['SensorLocation']

    # Evaluate clustering
    print(f"\nKMeans Clustering Results (k={n_clusters}):")
    print("-" * 40)
    evaluate_clustering(true_labels, cluster_labels)

    return cluster_labels, kmeans


## Item 1 Optimal Number of Groups

Here we aim to answer what is the optimal number of groups and what effect dimensionality reduction has on clustering.

### Item 1a - Unique Number of Target Classes

Since the ground-truth labels are already available as a categorical variable, the ideal number of groups is the number of unique 'sensor location' values.

In [None]:

# Count the distinct sensor locations and how many readings each one has
location_counts = df_clean['SensorLocation'].value_counts()
unique_locations = df_clean['SensorLocation'].nunique()
print(f"Number of unique sensor locations: {unique_locations}")

print("\nUnique sensor locations and their counts:")
print(location_counts)

# Bar chart of readings per sensor location
fig, ax = plt.subplots(figsize=(12, 6))
location_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Sensor Locations')
ax.set_xlabel('Sensor Location')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()


**Dataset Scaling**
As clustering algorithms rely on distance metrics, we need to ensure that all numeric variables are standardised.

In [None]:

# Standardise every numeric column (zero mean, unit variance) and write the result back into df_clean
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_clean[numeric_cols])
df_clean[numeric_cols] = scaled_values


### Item 1a - Optimal Cluster Count via Elbow Method

In [None]:
# Calculate the within-cluster sum of squares (WCSS) for different numbers of clusters
wcss = []
max_clusters = 15  # Try up to 15 clusters
k_values = list(range(1, max_clusters + 1))

for k in k_values:
    kmeans = KMeans(n_clusters=k, init='random', max_iter=300, n_init=10, random_state=42)
    kmeans.fit(df_clean[numeric_cols])
    wcss.append(kmeans.inertia_)

# Find the optimal number of clusters using the elbow method.
# KneeLocator is imported at the top of the notebook, so it is always available
# here; what can happen is that it finds no knee and returns None.
kl = KneeLocator(k_values, wcss, curve='convex', direction='decreasing')
optimal_k = kl.elbow

if optimal_k is None:
    # Fallback heuristic: take the last k whose WCSS drop is still larger
    # than half of the average drop.
    drops = [wcss[i - 1] - wcss[i] for i in range(1, len(wcss))]
    threshold = sum(drops) / len(drops) / 2
    significant_drops = [d for d in drops if d > threshold]
    # drops[j] is the improvement gained by moving from k=j+1 to k=j+2
    optimal_k = drops.index(min(significant_drops)) + 2 if significant_drops else 1

optimal_k = int(optimal_k)

# Plot the Elbow Method graph
plt.figure(figsize=(12, 6))
plt.plot(k_values, wcss, marker='o', linestyle='-')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.grid(True)
plt.xticks(k_values)

# Add vertical lines for both ground truth and optimal elbow point
plt.axvline(x=unique_locations, color='r', linestyle='--',
            label=f'Ground Truth: {unique_locations} (SensorLocation)')
plt.axvline(x=optimal_k, color='g', linestyle='--',
            label=f'Optimal Elbow: {optimal_k}')
plt.legend()
plt.tight_layout()
plt.show()

print(f"Optimal number of clusters determined by elbow method: {optimal_k}")
print(f"Ground truth number of clusters (unique SensorLocation values): {unique_locations}")




The optimal k-means cluster count obtained by the elbow method was 5. Compare it to 11 (the number of unique classes in sensor location, as used in the code below).


In [None]:
# Evaluate clustering results for the elbow-derived k and for the ground-truth k.
# The ground-truth k is taken from unique_locations (computed in Item 1a) rather
# than a hard-coded number, so it always matches the data.
print(f"K-means with {optimal_k} clusters (optimal from elbow method):")
labels_elbow, kmeans_elbow = perform_kmeans_clustering(df_clean, optimal_k)

print(f"\nK-means with {unique_locations} clusters (ground truth):")
labels_truth, kmeans_truth = perform_kmeans_clustering(df_clean, unique_locations)


### Item 1b

Feature Reduction's Effect on Clustering Performance

### Automatic feature selection

In [None]:
# Automatic Feature Selection for K-means Clustering

# We'll use SelectKBest with mutual_info_classif to select the most informative features


def evaluate_feature_subset(X, n_features, n_clusters=11, reference_labels=None, true_labels=None):
    """
    Cluster on the n_features most informative columns of X and score the result.

    Features are ranked by mutual information with `reference_labels`
    (cluster assignments used as pseudo-classes).

    Args:
        X: DataFrame of candidate numeric features.
        n_features: Number of top-ranked features to keep.
        n_clusters: Number of KMeans clusters.
        reference_labels: Pseudo-class labels used to rank the features. When
            omitted, they are obtained by running KMeans with `n_clusters` on
            all columns of X, so the function never depends on whichever
            `kmeans` object happens to be left in the notebook namespace.
        true_labels: Ground-truth labels for the ARI score. Defaults to
            df_clean['SensorLocation'].

    Returns:
        (score, selected_features): ARI of the clustering on the selected
        features, and the list of selected column names.
    """
    if reference_labels is None:
        reference_labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit_predict(X)
    if true_labels is None:
        true_labels = df_clean['SensorLocation']

    # Select top n features; mutual_info_classif is randomised, so it is seeded for repeatable rankings
    selector = SelectKBest(
        score_func=lambda feats, target: mutual_info_classif(feats, target, random_state=42),
        k=n_features
    )
    X_selected = selector.fit_transform(X, reference_labels)

    # Perform k-means clustering
    kmeans_subset = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    labels = kmeans_subset.fit_predict(X_selected)

    # Calculate ARI score
    score = adjusted_rand_score(true_labels, labels)

    # Get selected feature names
    selected_features = X.columns[selector.get_support()].tolist()

    return score, selected_features

# Reference clustering on all numeric features, computed once and reused for every subset size
X_numeric = df_clean[numeric_cols]
sensor_labels = df_clean['SensorLocation']
reference_cluster_labels = KMeans(
    n_clusters=unique_locations, n_init=10, random_state=42
).fit_predict(X_numeric)

# Try different numbers of features
feature_range = range(2, len(numeric_cols) + 1)
scores = []
best_score = -1
best_features = None

for n_features in feature_range:
    score, features = evaluate_feature_subset(
        X_numeric,
        n_features,
        n_clusters=unique_locations,
        reference_labels=reference_cluster_labels,
        true_labels=sensor_labels,
    )
    scores.append(score)
    if score > best_score:
        best_score = score
        best_features = features

# Plot results
plt.figure(figsize=(12, 6))
plt.plot(feature_range, scores, marker='o')
plt.xlabel('Number of Features')
plt.ylabel('Adjusted Rand Index')
plt.title('Feature Selection Impact on Clustering Performance')
plt.grid(True)
plt.show()

print(f"Best ARI score: {best_score:.3f}")
print(f"Best performing features ({len(best_features)}):")
for feature in best_features:
    print(f"- {feature}")

# Perform final clustering with best features
X_best = df_clean[best_features]
final_kmeans = KMeans(n_clusters=unique_locations, n_init=10, random_state=42)
final_labels = final_kmeans.fit_predict(X_best)
print("\nFinal clustering performance metrics:")
print(f"ARI Score: {adjusted_rand_score(sensor_labels, final_labels):.3f}")


In [None]:
# Frame restricted to the selected features plus the target label.
# .copy() makes df_best independent of df_clean.
df_best = df_clean[best_features + ['SensorLocation']].copy()
# Feature columns of df_best, derived from the selection result so that later
# cells stay in sync if a re-run selects different features.
numeric_cols_best = list(best_features)
df_best.info()


Perform kmeans on the dataframe with only latitude and longitude as features

In [None]:
# k is the ground-truth number of locations computed in Item 1a (not a hard-coded value);
# binding the result keeps the returned (labels, model) tuple out of the cell output
labels_best, kmeans_best = perform_kmeans_clustering(df_best, unique_locations)

### Principal Component Analysis

In [None]:
# Perform PCA on numeric columns while preserving SensorLocation

# Number of principal components to keep; the PC column names are derived
# from it so the two cannot drift apart
n_components = 4

# random_state makes the result repeatable when scikit-learn uses a randomized SVD solver
pca = PCA(n_components=n_components, random_state=42)

# Fit and transform numeric data
pca_result = pca.fit_transform(df_clean[numeric_cols])

# Create new dataframe with PCA results
pca_df = pd.DataFrame(
    pca_result,
    columns=[f'PC{i}' for i in range(1, n_components + 1)]
)

# Add back SensorLocation (positional: pca_df rows are in the same order as df_clean rows)
pca_df['SensorLocation'] = df_clean['SensorLocation'].values

# Print explained variance ratio
print("Explained variance ratio:")
print(pca.explained_variance_ratio_)
print("\nCumulative explained variance ratio:")
print(np.cumsum(pca.explained_variance_ratio_))


In [None]:
# k is the ground-truth number of locations computed in Item 1a (not a hard-coded value);
# binding the result keeps the returned (labels, model) tuple out of the cell output
labels_pca, kmeans_pca = perform_kmeans_clustering(pca_df, unique_locations)

## Item 2

Alternate solutions

DBSCAN

In [28]:
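# NOTE: the DBSCAN experiment below is currently disabled (commented out). Before re-enabling it:
#   - `ground_truth_labels` is not defined in the cells above; pass df_best['SensorLocation'] instead.
#   - do not assign the result to a variable named `metrics`: that would shadow the
#     `sklearn.metrics` module that evaluate_clustering() relies on.
# # Implementing DBSCAN for cluster discovery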
# from sklearn.cluster import DBSCAN
# from sklearn.neighbors import NearestNeighbors
# import numpy as np
# import matplotlib.pyplot as plt
# from collections import Counter

# # Function to find optimal epsilon using k-distance graph
# def find_optimal_eps(data, k=5):
#     # Calculate distances to k nearest neighbors for each point
#     neigh = NearestNeighbors(n_neighbors=k)
#     neigh.fit(data)
#     distances, _ = neigh.kneighbors(data)
    
#     # Sort distances to kth neighbor in ascending order
#     k_distances = np.sort(distances[:, k-1])
    
#     # Plot k-distance graph
#     plt.figure(figsize=(12, 6))
#     plt.plot(range(len(k_distances)), k_distances)
#     plt.xlabel('Data Points (sorted by distance)')
#     plt.ylabel(f'Distance to {k}th Nearest Neighbor')
#     plt.title('K-Distance Graph for DBSCAN Epsilon Parameter Selection')
    
#     # Add a grid to help identify the "elbow"
#     plt.grid(True)
#     plt.show()
    
#     return k_distances

# # Find optimal epsilon value
# k_distances = find_optimal_eps(df_best[numeric_cols_best])


# # Common eps values to try, ranging from small to large neighborhood sizes
# #eps_values = [0.1, 0.2, 0.4, 0.5, 0.75, 1.0]
# eps_values = [0.75]

# # Common min_samples values to try, representing different density requirements
# #min_samples_values = [10, 15, 20]
# min_samples_values = [20]

# results = []

# for eps in eps_values:
#     for min_samples in min_samples_values:
#         # Apply DBSCAN
#         dbscan = DBSCAN(eps=eps, min_samples=min_samples)
#         cluster_labels = dbscan.fit_predict(df_best[numeric_cols_best])
        
#         # Count number of clusters (excluding noise points labeled as -1)
#         n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
#         noise_points = list(cluster_labels).count(-1)
        
#         # Calculate evaluation metrics using evaluate_clustering function
#         if n_clusters > 0:  # Only calculate metrics if clusters were found
#             metrics = evaluate_clustering(ground_truth_labels, cluster_labels)
#         else:
#             print("No clusters found")
        




Hierarchical Clustering

In [None]:
# Apply BIRCH Clustering (memory efficient hierarchical clustering)
from sklearn.cluster import Birch

# Numbers of clusters to try: the ground-truth number of sensor locations
n_clusters_values = [unique_locations]
# One metrics dict per tried cluster count
results_hierarchical = []

for n_clusters in n_clusters_values:
    # Apply BIRCH clustering
    # threshold and branching_factor control memory usage
    birch = Birch(n_clusters=n_clusters,
                  threshold=0.5,  # Controls subcluster diameter threshold
                  branching_factor=50)  # Controls number of subclusters per node
    cluster_labels = birch.fit_predict(df_best[numeric_cols_best])

    # Calculate evaluation metrics and keep them for later comparison
    birch_metrics = evaluate_clustering(df_best['SensorLocation'], cluster_labels)
    results_hierarchical.append({'n_clusters_requested': n_clusters, **birch_metrics})
    


## Item 3 Shape Based Clustering

K-shape

In [None]:
# Import required libraries
from tslearn.clustering import KShape
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# Prepare time series data
# Select numeric columns for time series analysis
ts_cols = ['AirTemperature', 'RelativeHumidity', 'AtmosphericPressure', 'PM25', 'PM10', 'Noise']

# tslearn expects arrays shaped (n_series, n_timestamps, n_dims).
# Each row becomes one univariate series whose steps are the ts_cols values.
# A layout of (n, 1, n_features) would give series of length 1, and the
# per-series z-normalisation below would then turn every value into 0.
# NOTE(review): this treats the measurements of a single reading as a short
# sequence; if true temporal shapes are intended, build per-sensor series
# ordered by time instead.
ts_data = df_clean[ts_cols].to_numpy().reshape(len(df_clean), -1, 1)

# Scale each series to zero mean / unit variance along its time axis.
# A dedicated name is used so the StandardScaler stored in `scaler` is not overwritten.
ts_scaler = TimeSeriesScalerMeanVariance()
ts_data_scaled = ts_scaler.fit_transform(ts_data)

# Metrics collected for the k-shape runs
results_kshape = []

ks = KShape(n_clusters=unique_locations, random_state=42)
cluster_labels = ks.fit_predict(ts_data_scaled)

# Labels come from df_clean, the same frame the series were built from
kshape_metrics = evaluate_clustering(df_clean['SensorLocation'], cluster_labels)
results_kshape.append(kshape_metrics)




Shapelet-based Time Series Clustering:

In [None]:
# Import required libraries
try:
    # LearningShapelets is the current name of the estimator
    from tslearn.shapelets import LearningShapelets as ShapeletModel
except ImportError:
    # Older tslearn releases only provide the ShapeletModel name
    from tslearn.shapelets import ShapeletModel
from sklearn.preprocessing import LabelEncoder

# Prepare the data: one univariate series per row, shaped (n_series, n_timestamps, 1)
X = df_clean[ts_cols].values.reshape(len(df_clean), -1, 1)
y = df_clean['SensorLocation'].values
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# n_shapelets_per_size maps shapelet length -> number of shapelets of that length.
# Learn 5 shapelets per length, for lengths 3..10, capped at the series length
# because a shapelet cannot be longer than the series it is matched against.
series_length = X.shape[1]
n_shapelets_per_size = {
    length: 5 for length in range(3, min(10, series_length) + 1)
}

# Initialize and train the shapelet model
shapelet_model = ShapeletModel(
    n_shapelets_per_size=n_shapelets_per_size,
    optimizer='sgd',
    weight_regularizer=.01,
    max_iter=1000,
    random_state=42
)

# Fit the model
shapelet_model.fit(X, y_encoded)

# Get the learned shapelets
shapelets = shapelet_model.shapelets_

# Transform the data using learned shapelets (distance of every series to every shapelet)
X_transformed = shapelet_model.transform(X)

# Apply KMeans clustering on the transformed data
n_clusters = unique_locations  # Same number of clusters as before: the ground-truth location count
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
shapelet_clusters = kmeans.fit_predict(X_transformed)

# Labels come from df_clean, the same frame X was built from
shapelet_metrics = evaluate_clustering(df_clean['SensorLocation'], shapelet_clusters)
