# Aircraft Crashes Data Clustering

## Overview

This notebook applies several clustering algorithms (K-Means, hierarchical clustering and DBSCAN) to the aircraft crashes data and compares their results. The data has already been cleaned in the [EDA phase](data_analysis.ipynb).

## Data Preprocessing

In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
import scipy.cluster.hierarchy as sch

In [None]:
# Load the cleaned crashes data from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files you created yourself.
with open('data/crashes_cleaned_data2.pkl', mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

In [None]:
# Preview the first five rows of the loaded data
df.head(n=5)

### Remove unnecessary columns

In [None]:
# The registration number and the manufacturer serial number (msn) identify a
# single aircraft, so they are dropped before clustering
df = df.drop(columns=['registration', 'msn'])

In [None]:
# Replace the datetime column with three numeric columns: year, month, day
crash_date = df['date'].dt
df = df.assign(
    year=crash_date.year,
    month=crash_date.month,
    day=crash_date.day,
).drop(columns='date')

In [None]:
# Reduce cardinality of categorical values: in every object-dtype column keep
# the four most frequent values and replace everything else with 'Other'
for name in df.select_dtypes(include=['object']).columns:
    most_frequent = df[name].value_counts().index[:4].tolist()
    is_frequent = df[name].isin(most_frequent)
    df[name] = np.where(is_frequent, df[name], 'Other')

In [None]:
# aircraft_damage is categorical: inspect its categories (in their stored
# order) before ordinal-encoding the column
df['aircraft_damage'].dtype.categories

In [None]:
# Ordinal-encode aircraft_damage using the order of its pandas categories.
# NOTE(review): assumes the category order set during cleaning reflects
# increasing damage severity - confirm against the EDA notebook.
damage_categories = list(df['aircraft_damage'].cat.categories)

# `categories` expects one list of categories per encoded feature, and the
# encoder expects 2-D input, hence the nested list and the double brackets.
ordinal_encoder = OrdinalEncoder(categories=[damage_categories])
encoded_col = ordinal_encoder.fit_transform(df[['aircraft_damage']])

# Write the codes back so the column is numeric for the clustering models
# (the encoder returns an (n_samples, 1) array).
df['aircraft_damage'] = encoded_col.ravel()

In [None]:
# Use One Hot Encoding for other columns: every remaining string
# (object-dtype) column is expanded into 0/1 indicator columns.
string_columns = df.select_dtypes(include=['object']).columns.tolist()

# Nothing to encode if no string columns are left (the encoder rejects an
# empty frame).
if string_columns:
    onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)
    encoded_cols = onehot_encoder.fit_transform(df[string_columns])
    new_df = pd.DataFrame(
        encoded_cols,
        columns=onehot_encoder.get_feature_names_out(string_columns),
        # Reuse df's index: join aligns on index labels, so a fresh RangeIndex
        # would introduce NaNs whenever df's index is not 0..n-1.
        index=df.index,
    )
    df = df.drop(columns=string_columns).join(new_df)

In [None]:
# Sanity check: after encoding, every remaining column must be numeric
numeric_columns = df.select_dtypes(include=[np.number]).columns
assert len(numeric_columns) == len(df.columns)

In [None]:
# Pairwise scatter plots (and per-feature distributions) of all columns
sns.pairplot(df);

In [None]:
# Correlation between the numeric variables, shown as an annotated heatmap
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(data=corr_matrix, annot=True);

In [None]:
# Same correlation matrix as a clustermap (rows/columns reordered by
# hierarchical clustering)
sns.clustermap(data=corr_matrix, annot=True);

In [None]:
# Save the fully encoded data set, without the index column
df.to_csv(path_or_buf='data/crashes_preprocessed_data.csv', index=False)

## Feature Engineering

In [None]:
# Work on an independent copy so the preprocessed df stays untouched
X = df.copy(deep=True)

In [None]:
# Fit a whitened PCA that keeps the first 10 principal components.
# NOTE(review): the PCA is only fitted here - X is never replaced by its
# projection, so the models below still see every original feature. It is
# also fitted on the unscaled X (scaling happens later, in the Data Modeling
# section) - confirm whether that is intended.
pca = PCA(whiten=True, n_components=10)
pca.fit(X)

In [None]:
# Principal axes of the fitted PCA: one row per component, one column per
# input feature
print(pca.components_)

In [None]:
# Variance explained by each of the retained components
print(pca.explained_variance_)

In [None]:
# Save the feature matrix, without the index column.
# NOTE(review): X is a plain copy of df at this point (the PCA above is not
# applied to it), so this appears to write the same content as
# data/crashes_preprocessed_data.csv - confirm which export is needed.
X.to_csv(path_or_buf='data/preprocessed_data.csv', index=False)

## Data Modeling

In [None]:
# Standardise every feature (zero mean, unit variance) so that distance-based
# clustering is not dominated by large-valued columns
scaler = StandardScaler()
scaled_X = pd.DataFrame(
    data=scaler.fit_transform(X),
    columns=scaler.get_feature_names_out(),
)
scaled_X.sample(n=5)

### K-Means

#### Find clusters

In [None]:
# Fit one K-Means model for each k in 2..30 and record its inertia (sum of
# squared distances to the closest centroid) and its silhouette score
ssd = []
for k in range(2, 31):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(scaled_X)
    ssd.append(dict(
        k=k,
        inertia=kmeans.inertia_,
        silhouette=silhouette_score(scaled_X, kmeans.labels_, metric='euclidean'),
    ))

# 'difference' is the change in inertia relative to the previous k
models = pd.DataFrame(ssd).assign(
    difference=lambda frame: frame['inertia'].diff()
)

In [None]:
# Elbow method plot: inertia as a function of the number of clusters
fig, ax = plt.subplots()
ax.plot(models['k'], models['inertia'], marker='o')
ax.set(
    xlabel='Number of Clusters (K)',
    ylabel='Inertia (SSD)',
    title='Elbow Method for Optimal K',
)
plt.show()

In [None]:
# Silhouette method plot: silhouette score as a function of the number of clusters
fig, ax = plt.subplots()
ax.plot(models['k'], models['silhouette'], marker='o')
ax.set(
    xlabel='Number of Clusters (K)',
    ylabel='Silhouette Score',
    title='Silhouette Method for Optimal K',
)
plt.show()

In [None]:
# Final K-Means model with the chosen number of clusters (k = 13,
# presumably read off the elbow / silhouette plots above)
kmeans = KMeans(random_state=42, n_clusters=13)
kmeans.fit(scaled_X)

In [None]:
# Attach the K-Means label of every row to the human-readable data, as the
# second column.
# NOTE(review): this reads data/crashes_clean_data.csv, not the pickle loaded
# at the top of the notebook - confirm both hold the same rows in the same
# order, since labels are matched by position.
original_df = pd.read_csv('data/crashes_clean_data.csv')
original_df.insert(loc=1, column='Cluster', value=kmeans.labels_)

#### Analyse each cluster

**TODO:** Describe each K-Means cluster (size and the feature values that characterise it).

### Hierarchical Clustering

In [None]:
# Plot dendrogram for optimal cluster determination
n_clusters = 5  # number of clusters highlighted by the horizontal cut line

# Ward linkage is based on Euclidean distances, so it is computed on the
# standardised features - the same input used by the K-Means and DBSCAN
# sections - instead of the raw X, where large-valued columns would dominate.
linkage_matrix = sch.linkage(scaled_X, method='ward')

# Each row of the linkage matrix is one merge, sorted by merge height
# (column 2). A cut placed between the merge that leaves `n_clusters` clusters
# and the next merge above it crosses exactly `n_clusters` vertical lines.
cut_height = (
    linkage_matrix[-n_clusters, 2] + linkage_matrix[-(n_clusters - 1), 2]
) / 2

plt.figure(figsize=(15, 6))
plt.title('Dendrogram')
plt.xlabel('Crashes')
plt.ylabel('Euclidean distances')
dendrogram = sch.dendrogram(linkage_matrix)

# axhline spans the full axes width, so no hard-coded x-range is needed
plt.axhline(y=cut_height, lw=3, linestyle='--')
plt.text(
    x=0.5,
    y=cut_height,
    s=f'Horizontal line crossing {n_clusters} vertical lines',
    fontsize=20,
    ha='center',
    va='bottom',
    # x in axes coordinates (0..1), y in data coordinates
    transform=plt.gca().get_yaxis_transform(),
)
plt.show()

In [None]:
# Model: agglomerative (bottom-up) clustering with Ward linkage, 5 clusters.
# Fitted on the standardised features, like the K-Means and DBSCAN models:
# Ward linkage uses Euclidean distances, which the raw X would distort in
# favour of large-valued columns.
ac = AgglomerativeClustering(n_clusters=5, metric='euclidean', linkage='ward')
ac.fit(scaled_X)

In [None]:
# Store the hierarchical-clustering label of every row.
# NOTE(review): this replaces the K-Means labels previously held in 'Cluster'.
original_df = original_df.assign(Cluster=ac.labels_)

#### Analyse clusters

**TODO:** Describe each hierarchical cluster (size and the feature values that characterise it).

### DBSCAN

In [None]:
# Sweep epsilon and record the share of points DBSCAN labels as noise.
# A separate list name keeps `models` (the K-Means results table) intact.
dbscan_models = []

# min_samples is set to twice the number of features
min_samples = 2 * len(scaled_X.columns)

for eps in np.linspace(0.001, 3, 50):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(scaled_X)

    # DBSCAN marks noise points with the label -1. Taking the mean of the
    # boolean mask gives the noise fraction and is simply 0.0 when an epsilon
    # produces no noise at all (looking up the -1 count in a dict would raise
    # KeyError in that case).
    percentage = np.mean(dbscan.labels_ == -1)

    dbscan_models.append({'epsilon': eps, 'outliers': percentage})

models_df = pd.DataFrame(dbscan_models)

In [None]:
# Fraction of outlier points as a function of the eps value
sns.lineplot(x='epsilon', y='outliers', data=models_df);

In [None]:
# Final DBSCAN model with the chosen epsilon (eps = 2) and the same
# min_samples used in the sweep above
model = DBSCAN(min_samples=min_samples, eps=2)
model.fit(scaled_X)

In [None]:
# Store the DBSCAN label of every row (-1 marks noise points).
# NOTE(review): this replaces whatever labels 'Cluster' held before.
original_df = original_df.assign(Cluster=model.labels_)

#### Analyse clusters

**TODO:** Describe each DBSCAN cluster and the share of points labelled as noise.

### Compare the different clusterings

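**TODO:** Compare the K-Means, hierarchical and DBSCAN results (number of clusters, cluster sizes, agreement between the labelings).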

## End