# Importing the data

In [11]:
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# For clustering
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

In [12]:
# Collect every CSV file in the 'data' directory.
# glob returns matches in arbitrary (filesystem-dependent) order, so the paths
# are sorted to keep the concatenated row order - and everything derived from
# it - reproducible across machines and runs.
csv_files = sorted(glob.glob(os.path.join("data", "*.csv")))

In [13]:
# Load each discovered CSV file into its own DataFrame (one frame per file,
# in the same order as csv_files)
df_all = list(map(pd.read_csv, csv_files))

In [14]:
# Stack the per-file frames row-wise into a single table; the original
# per-file indices are discarded in favour of a fresh 0..n-1 index
df = pd.concat(objs=df_all, axis=0, ignore_index=True)

# Missing values

In [15]:
# Remove the 'tail_number' column entirely: it contains missing values and is
# not considered significant for this analysis
df = df.drop(columns='tail_number')

In [16]:
# The actual departure/arrival timestamps are allowed to be empty (cancelled
# flights); a missing value in any other column removes the row.
optional_columns = ['actual_departure_dt', 'actual_arrival_dt']
required_columns = df.columns.difference(optional_columns)
df = df.dropna(subset=required_columns)

# Data Types

In [17]:
# Parse the date/time columns into pandas datetimes
date_columns = [
    'date',
    'scheduled_departure_dt',
    'scheduled_arrival_dt',
    'actual_departure_dt',
    'actual_arrival_dt',
]

# errors='coerce' turns values that cannot be parsed into NaT instead of raising
parsed_dates = {name: pd.to_datetime(df[name], errors='coerce') for name in date_columns}
df = df.assign(**parsed_dates)

In [18]:
# Derive a category column for each delay measurement using the project helper
from utils import categorise_delay

# target category column -> source delay column
delay_category_sources = {
    'departure_delay_category': 'departure_delay',
    'arrival_delay_category': 'arrival_delay',
}

for category_column, delay_column in delay_category_sources.items():
    df[category_column] = df[delay_column].apply(categorise_delay)

In [19]:
# Derive categorised weather features by running every (station, weather)
# column pairing through the project helper categorise_weather.
# NOTE(review): the recorded output for this cell is a NameError for 'np';
# importing numpy here does not change the names visible inside utils, so
# presumably utils itself needs `import numpy as np` - verify.
from utils import categorise_weather
import numpy as np

station_columns = ['STATION_x', 'STATION_y']
weather_columns = [
    'HourlyDryBulbTemperature_x',
    'HourlyPrecipitation_x',
    'HourlyStationPressure_x',
    'HourlyVisibility_x',
    'HourlyWindSpeed_x',
    'HourlyDryBulbTemperature_y',
    'HourlyPrecipitation_y',
    'HourlyStationPressure_y',
    'HourlyVisibility_y',
    'HourlyWindSpeed_y',
]

# Station-major order: all weather columns for STATION_x, then all for STATION_y.
# NOTE(review): every weather column is paired with both station columns
# (e.g. *_x readings with STATION_y) - confirm this cross-pairing is intended.
column_pairings = [
    (weather_col, station_col)
    for station_col in station_columns
    for weather_col in weather_columns
]
for weather_col, station_col in column_pairings:
    df = categorise_weather(df, weather_col, station_col)

NameError: name 'np' is not defined

In [None]:
# Keep only the first occurrence of each fully identical row
df = df.drop_duplicates()

# Clustering

In [None]:
# Clustering inputs: five hourly weather readings for each of the two station
# suffixes (_x columns first, then _y columns).
weather_feature_columns = [
    'HourlyDryBulbTemperature_x',
    'HourlyPrecipitation_x',
    'HourlyStationPressure_x',
    'HourlyVisibility_x',
    'HourlyWindSpeed_x',
    'HourlyDryBulbTemperature_y',
    'HourlyPrecipitation_y',
    'HourlyStationPressure_y',
    'HourlyVisibility_y',
    'HourlyWindSpeed_y',
]
weather_features = df[weather_feature_columns]

# Standardise the features so no single column dominates the distance computations
scaler = StandardScaler()
scaled_weather_features = scaler.fit(weather_features).transform(weather_features)

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record the within-cluster sum of
# squares (inertia) of each fit; the bend in the curve suggests a cluster count.
candidate_cluster_counts = range(1, 11)
wcss = []
for cluster_count in candidate_cluster_counts:
    kmeans = KMeans(
        n_clusters=cluster_count,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=42,
    )
    wcss.append(kmeans.fit(scaled_weather_features).inertia_)

# Plot WCSS against the number of clusters
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(candidate_cluster_counts, wcss)
ax.set_title('Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()

In [None]:
# K-Means Clustering
# Number of clusters chosen from the elbow plot
optimal_k = 4
# n_init=10 mirrors the settings of the elbow-method fits, so the final model
# is trained the same way the WCSS curve was measured (and does not depend on
# the library's version-specific n_init default).
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
kmeans_labels = kmeans.fit_predict(scaled_weather_features)
df['kmeans_cluster'] = kmeans_labels

# Hierarchical Clustering
# Plot the dendrogram (Ward linkage) to help choose the number of clusters.
# NOTE(review): linkage over every row can be very slow and memory-hungry if
# the flight table is large - consider a random sample; confirm dataset size.
plt.figure(figsize=(10, 7))
# The SciPy function is spelled `dendrogram`; `sch.dendogram` does not exist.
# The result keeps its original variable name so later cells that reference
# it are unaffected.
dendogram = sch.dendrogram(sch.linkage(scaled_weather_features, method='ward'))
plt.title('Dendrogram')
plt.xlabel('Flights')
plt.ylabel('Euclidean distances')
plt.show()

In [None]:
# Hierarchical (agglomerative) clustering with Ward linkage.
# TODO: confirm that n_clusters=3 is what the dendrogram actually suggests.
# Ward linkage only works with Euclidean distances, which is the default, so
# the `affinity` keyword (deprecated and later removed in scikit-learn) is
# not passed.
hierarchical_model = AgglomerativeClustering(n_clusters=3, linkage='ward')
# fit_predict returns one integer cluster label per row; storing the unfitted
# estimator itself in the DataFrame column would not give usable labels.
hc_labels = hierarchical_model.fit_predict(scaled_weather_features)
df['hierarchical_cluster'] = hc_labels

In [None]:
# DBSCAN (density-based clustering)
# eps and min_samples are starting values that still need adjusting
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit(scaled_weather_features).labels_
df['dbscan_cluster'] = dbscan_labels

In [None]:
# Analysis of clustering: per-cluster column means for each algorithm.
# Each report groups by that algorithm's own label column (heading, column).
cluster_reports = [
    ("K-Means Clustering", 'kmeans_cluster'),
    ("\nHierarchical Clustering Results", 'hierarchical_cluster'),
    ("\nDBSCAN Clustering Results", 'dbscan_cluster'),
]

for heading, cluster_column in cluster_reports:
    print(heading)
    # numeric_only=True limits the means to numeric columns; recent pandas
    # versions raise on non-numeric columns instead of silently dropping them.
    print(df.groupby(cluster_column).mean(numeric_only=True))

# NOTE: for DBSCAN '-1' labels indicate noise points