In [None]:
# # Compute correlations and variances
# correlations = pg.pairwise_corr(flights_df[numerical_columns])
# variances = flights_df[numerical_columns].var()

# # Select the feature with lower variance in each pair to drop
# to_drop = set()
# for feature1, feature2 in correlations[['X', 'Y']].values:
#     if variances[feature1] < variances[feature2]:
#         to_drop.add(feature1)
#     else:
#         to_drop.add(feature2)

# # Drop the low-variance correlated features
# flights_df = flights_df.drop(columns=to_drop)

# print(f"Features dropped based on variance: {to_drop}")

# flights_df.info()

In [None]:
import pandas as pd

import lightgbm as lgb
import xgboost as xgb

from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [None]:
# Load the raw flight records from the local CSV file
flights_csv_path = './data/flight-delays/flights.csv'
flights_df = pd.read_csv(flights_csv_path)

# Summarise columns, dtypes and non-null counts
flights_df.info()

In [None]:
# Keep only the columns whose share of missing values is strictly below 20%
missing_share = flights_df.isna().sum() / len(flights_df)
keep_column = (missing_share < 0.2).to_numpy()
flights_df = flights_df.loc[:, keep_column]

# Summarise the remaining columns
flights_df.info()

In [None]:
# Flights arriving at least this many minutes late are labelled as delayed
DELAY_THRESHOLD_MINUTES = 15

# Discard rows whose ORIGIN_AIRPORT or DESTINATION_AIRPORT is a purely numeric code.
# `.str.isnumeric()` yields NaN for entries that are not strings, and
# `.astype(bool)` turns NaN into True, so those entries are discarded as well.
origin_is_numeric = flights_df['ORIGIN_AIRPORT'].str.isnumeric().astype(bool)
destination_is_numeric = flights_df['DESTINATION_AIRPORT'].str.isnumeric().astype(bool)
# `.copy()` gives an independent frame so the column assignments below
# do not write into a slice of the loaded data.
flights_df = flights_df[~(origin_is_numeric | destination_is_numeric)].copy()

# Columns to treat as categorical
categorical_columns = [
    'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK',
    'AIRLINE', 'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
    'DIVERTED', 'CANCELLED'
]
flights_df[categorical_columns] = flights_df[categorical_columns].astype('category')

# Every remaining column is treated as numerical; values that cannot be
# parsed as numbers become NaN (errors='coerce').
numerical_columns = flights_df.columns.difference(categorical_columns).to_list()
flights_df[numerical_columns] = flights_df[numerical_columns].apply(pd.to_numeric, errors='coerce')

# Keep only flights that were not cancelled; the column is then constant, so drop it
flights_df = flights_df[flights_df['CANCELLED'] == 0].drop(columns=['CANCELLED'])
categorical_columns.remove('CANCELLED')

# Rows without a recorded arrival delay have no known outcome. `NaN >= 15` is
# False, so keeping them would silently label them as on time; drop them instead.
flights_df = flights_df.dropna(subset=['ARRIVAL_DELAY'])

# Create the boolean target, then drop ARRIVAL_DELAY, which defines the target
# and would otherwise leak it into the features.
# NOTE(review): other columns only known after landing would leak the target
# in the same way - confirm which of the remaining columns are safe to keep.
flights_df = (
    flights_df
    .assign(DELAYED=flights_df['ARRIVAL_DELAY'] >= DELAY_THRESHOLD_MINUTES)
    .drop(columns=['ARRIVAL_DELAY'])
)
numerical_columns.remove('ARRIVAL_DELAY')

# Show df
flights_df.info()

In [None]:
# # Preprocessing for numerical data
# numerical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with median
#     # ('scaler', StandardScaler())                    # Standardize numerical features
# ])

# # Preprocessing for categorical data
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with mode
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))     # One-hot encode categorical features
# ])

# preprocessor = ColumnTransformer(
#     [
#         ('num', numerical_transformer, numerical_columns),
#         ('cat', categorical_transformer, categorical_columns)
#     ]
# )

# # Assign sparse array
# X = preprocessor.fit_transform(flights_df.drop(columns=['DELAYED']))
# y = flights_df['DELAYED'].values

# # Create train and test split
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size = 0.3 
# )

In [None]:
# print("XGBoost")

# clf = xgb.XGBClassifier()
# clf.fit(X_train, y_train)

# y_pred = clf.predict(X_test)

# roc_auc = roc_auc_score(y_test, y_pred)
# print(f"ROC AUC: {roc_auc}")

# recall = recall_score(y_test, y_pred)
# print(f"Recall: {recall}")

# f1 = f1_score(y_test, y_pred)
# print(f"F1: {f1}")

In [None]:
# Make scikit-learn transformers return pandas DataFrames instead of NumPy arrays
set_config(transform_output="pandas")

# Fixed seed so the train/test split is reproducible on Restart & Run All
RANDOM_STATE = 42

# Separate features and target; flights_df itself is left untouched so this
# cell can be re-run safely.
features_raw = flights_df.drop(columns=['DELAYED'])
y = flights_df['DELAYED']

# Rows are selected by index label below, which requires unique labels
assert features_raw.index.is_unique, "flights_df index must be unique"

# Split BEFORE fitting any imputer, so that imputation statistics are learned
# from training rows only and no test-set information leaks into training.
y_train, y_test = train_test_split(y, test_size=0.3, random_state=RANDOM_STATE)

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with median
    # ('scaler', StandardScaler())                   # Standardize numerical features
])
numerical_transformer.fit(features_raw.loc[y_train.index, numerical_columns])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with mode
    # ('onehot', OneHotEncoder(handle_unknown='ignore'))     # One-hot encode categorical features
])
categorical_transformer.fit(features_raw.loc[y_train.index, categorical_columns])

# Apply the train-fitted imputers to every row
X = features_raw.copy()
X[numerical_columns] = numerical_transformer.transform(features_raw[numerical_columns])
X[categorical_columns] = categorical_transformer.transform(features_raw[categorical_columns])

# Convert to categorical again after imputation. Doing this on the full frame
# before slicing gives train and test the same category sets.
X[categorical_columns] = X[categorical_columns].astype('category')

# Slice the imputed features with the index labels of each target part
X_train = X.loc[y_train.index]
X_test = X.loc[y_test.index]

assert len(X_train) + len(X_test) == len(X)

# Show df
X.info()

In [None]:
# Train a gradient-boosted classifier; enable_categorical lets XGBoost consume
# the pandas 'category' columns directly.
clf = xgb.XGBClassifier(enable_categorical=True)
clf.fit(X_train, y_train)

# Hard class labels for the threshold-based metrics (recall, F1)
y_pred = clf.predict(X_test)
# Probability of the positive class (second column) for the ranking metric.
# ROC AUC computed from hard labels would describe only a single operating point.
y_score = clf.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_score)
print(f"ROC AUC: {roc_auc}")

recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")

f1 = f1_score(y_test, y_pred)
print(f"F1: {f1}")