In [82]:
!pip install imbalanced-learn



In [83]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', None)

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

## Load all datasets

In [84]:
submission_csv = pd.read_csv('test_data/CIS_662 _INITIAL_Predictions.csv', keep_default_na=False)
former_flights_data = pd.read_csv('dataset/merged_data/former_flight_data.csv')
latter_flight_data = pd.read_csv('dataset/merged_data/latter_flight_data.csv')
test_data = pd.read_csv('test_data/initial_test_data_for_prediction.csv')


## Preprocess all datasets

In [85]:
# Drop NaN for latter flight data
if True:
    latter_flight_data.dropna(subset=['FORMER_FLIGHT_STATUS'], inplace=True)

In [86]:
def categorize_delay(delay):
    if delay < -7:
        return 0 #early
    elif delay > 6:
        return 2 #late
    else:
        return 1 #ontime

In [87]:
X_former_flights_data = former_flights_data.drop(columns=['ARR_DELAY'])
y_former_flights_data = former_flights_data['ARR_DELAY'].apply(categorize_delay)

X_latter_flight_data = latter_flight_data.drop(columns=['ARR_DELAY'])
y_latter_flight_data = latter_flight_data['ARR_DELAY'].apply(categorize_delay)

In [88]:
# Handle Categorical Variables
categorical_vars = ['DAY_OF_WEEK', 'MKT_UNIQUE_CARRIER',
                    'OP_UNIQUE_CARRIER', 'ORIGIN',
                    'ORGIN_WTH_precipprob', 'ORGIN_WTH_severerisk',
                    'DEST_WTH_precipprob', 'DEST_WTH_severerisk',
                    'FORMER_FLIGHT_STATUS', 'MONTH']

In [89]:
def preprocess(flight_data: pd.DataFrame):

    # Dealing with date and time
    flight_data['SCH_ARR_TIME'] = pd.to_datetime(flight_data['SCH_ARR_TIME'])
    flight_data['SCH_DEP_TIME'] = pd.to_datetime(flight_data['SCH_DEP_TIME'])

    flight_data['MONTH'] = flight_data['SCH_ARR_TIME'].dt.month
    flight_data['DAY'] = flight_data['SCH_ARR_TIME'].dt.day
    flight_data['DEP_MINUTES'] = flight_data['SCH_DEP_TIME'].dt.hour * 60 + flight_data['SCH_DEP_TIME'].dt.minute
    flight_data['ARR_MINUTES'] = flight_data['SCH_ARR_TIME'].dt.hour * 60 + flight_data['SCH_ARR_TIME'].dt.minute

    flight_data.drop(columns=['SCH_DEP_TIME', 'SCH_ARR_TIME'], inplace=True)

    # Dropping unwanted columns
    cols = [
        'ORGIN_WTH_temp', 'DEST_WTH_temp',
        'DEST_WTH_severerisk', 'ORGIN_WTH_severerisk',
        'DEST_WTH_precipprob', 'ORGIN_WTH_precipprob'
        ]
    flight_data.drop(columns=cols, inplace=True)
    
    cat_col = list(set(flight_data.columns).intersection(categorical_vars))
    flight_data = pd.get_dummies(flight_data, columns = list(cat_col), drop_first = False)

    return flight_data
    

In [90]:
X_former_flights_data = preprocess(X_former_flights_data)
X_latter_flight_data = preprocess(X_latter_flight_data)
test_data = preprocess(test_data)

In [91]:
# Trying SMOTE
if False:
    import imblearn
    from imblearn.over_sampling import SMOTE
    from imblearn.pipeline import Pipeline
    from imblearn.under_sampling import RandomUnderSampler
    from collections import Counter

    over = SMOTE()
    under = RandomUnderSampler()
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)

    X_former_flights_data, y_former_flights_data = pipeline.fit_resample(X_former_flights_data, y_former_flights_data)
    X_latter_flight_data, y_latter_flight_data = pipeline.fit_resample(X_latter_flight_data, y_latter_flight_data)

In [92]:
# Get missing columns in the prediction data
missing_cols = set(X_former_flights_data.columns) - set(test_data.columns)
# Add a zero column for missing columns in prediction data
for c in missing_cols:
    test_data[c] = 0

# Ensure the order of columns in prediction data matches that of flight_data_encoded
test_data = test_data[X_former_flights_data.columns]
test_data.columns
test_data.shape
# Now, prediction_data_encoded should have the same columns as flight_data_encoded

Index(['ORGIN_WTH_precip', 'ORGIN_WTH_snow', 'ORGIN_WTH_windspeed',
       'ORGIN_WTH_winddir', 'ORGIN_WTH_cloudcover', 'ORGIN_WTH_visibility',
       'DEST_WTH_precip', 'DEST_WTH_snow', 'DEST_WTH_windspeed',
       'DEST_WTH_winddir', 'DEST_WTH_cloudcover', 'DEST_WTH_visibility', 'DAY',
       'DEP_MINUTES', 'ARR_MINUTES', 'OP_UNIQUE_CARRIER_9E',
       'OP_UNIQUE_CARRIER_B6', 'OP_UNIQUE_CARRIER_G7', 'OP_UNIQUE_CARRIER_MQ',
       'OP_UNIQUE_CARRIER_OO', 'OP_UNIQUE_CARRIER_PT', 'OP_UNIQUE_CARRIER_UA',
       'OP_UNIQUE_CARRIER_WN', 'OP_UNIQUE_CARRIER_YX', 'OP_UNIQUE_CARRIER_ZW',
       'DAY_OF_WEEK_1', 'DAY_OF_WEEK_2', 'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4',
       'DAY_OF_WEEK_5', 'DAY_OF_WEEK_6', 'DAY_OF_WEEK_7', 'ORIGIN_JFK',
       'ORIGIN_MCO', 'ORIGIN_ORD', 'MKT_UNIQUE_CARRIER_AA',
       'MKT_UNIQUE_CARRIER_B6', 'MKT_UNIQUE_CARRIER_DL',
       'MKT_UNIQUE_CARRIER_UA', 'MKT_UNIQUE_CARRIER_WN', 'MONTH_1', 'MONTH_2',
       'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6', 'MONTH_7', 'MONTH_8

(23, 52)

## Train Former Flight and Latter Flight Model

In [93]:
former_flight_model = xgb.XGBClassifier(eta = '0.08', max_depth=5, min_child_weight=1, reg_lambda=0.007)
former_flight_model = xgb.XGBClassifier(learning_rate=0.01, max_depth=1, min_child_weight=2, n_estimators=600, reg_lambda=0.007)

former_flight_model = former_flight_model.fit(X_former_flights_data, y_former_flights_data)
former_flight_model.score(X_former_flights_data, y_former_flights_data)
feat_imp_former = pd.Series(former_flight_model.feature_importances_, X_former_flights_data.columns.values).sort_values(ascending=False)
feat_imp_former.head(15)



latter_flight_model = xgb.XGBClassifier(eta = '0.08', max_depth=5, min_child_weight=1, reg_lambda=0.007)
latter_flight_model = xgb.XGBClassifier(learning_rate=0.01, max_depth=1, min_child_weight=2, n_estimators=600, reg_lambda=0.007)

latter_flight_model = latter_flight_model.fit(X_latter_flight_data, y_latter_flight_data)
latter_flight_model.score(X_latter_flight_data, y_latter_flight_data)
feat_imp_latter = pd.Series(latter_flight_model.feature_importances_, X_latter_flight_data.columns.values).sort_values(ascending=False)
feat_imp_latter.head(15)


0.47526945223682265

MKT_UNIQUE_CARRIER_DL   0.17
OP_UNIQUE_CARRIER_B6    0.14
ORGIN_WTH_visibility    0.09
ORGIN_WTH_precip        0.08
DEST_WTH_visibility     0.06
ORIGIN_ORD              0.06
DEP_MINUTES             0.06
ORGIN_WTH_snow          0.05
ORGIN_WTH_cloudcover    0.05
DEST_WTH_snow           0.04
OP_UNIQUE_CARRIER_OO    0.03
DEST_WTH_precip         0.03
ARR_MINUTES             0.02
DEST_WTH_winddir        0.02
OP_UNIQUE_CARRIER_MQ    0.02
dtype: float32

0.47485207100591714

OP_UNIQUE_CARRIER_B6         0.14
ORGIN_WTH_visibility         0.11
MKT_UNIQUE_CARRIER_DL        0.10
DEST_WTH_visibility          0.08
ORGIN_WTH_cloudcover         0.07
DEP_MINUTES                  0.07
FORMER_FLIGHT_STATUS_early   0.07
ORGIN_WTH_precip             0.06
ORGIN_WTH_snow               0.05
OP_UNIQUE_CARRIER_MQ         0.04
DEST_WTH_precip              0.04
ARR_MINUTES                  0.04
DEST_WTH_winddir             0.03
DAY                          0.03
MONTH_10                     0.02
dtype: float32

## Make Predictions and write to csv

In [94]:
status_dic = {0: 'early', 1: 'ontime', 2: 'late'}
for index, sub_row in submission_csv.iterrows():
    test_row = test_data.iloc[index].copy()

    # Predict Former
    former = status_dic[former_flight_model.predict([test_row])[0]]
    if sub_row['ARRIVAL STATUS'] != 'NA':
        sub_row['ARRIVAL STATUS'] = former

    # Predict Latter - Former Early
    test_row['FORMER_FLIGHT_STATUS_late'] = 0
    test_row['FORMER_FLIGHT_STATUS_on-time'] = 0
    test_row['FORMER_FLIGHT_STATUS_early'] = 1
    early = status_dic[latter_flight_model.predict([test_row])[0]]
    if sub_row['ARRIVAL STATUS_Prev_flight_early'] != 'NA':
        sub_row['ARRIVAL STATUS_Prev_flight_early'] = early

    # Predict Latter - Former ontime
    test_row['FORMER_FLIGHT_STATUS_late'] = 0
    test_row['FORMER_FLIGHT_STATUS_on-time'] = 1
    test_row['FORMER_FLIGHT_STATUS_early'] = 0
    ontime = status_dic[latter_flight_model.predict([test_row])[0]]
    if sub_row['ARRIVAL STATUS_Prev_flight_ontime'] != 'NA':
        sub_row['ARRIVAL STATUS_Prev_flight_ontime'] = ontime
    
    # Predict Latter - Former late
    test_row['FORMER_FLIGHT_STATUS_late'] = 1
    test_row['FORMER_FLIGHT_STATUS_on-time'] = 0
    test_row['FORMER_FLIGHT_STATUS_early'] = 0
    late = status_dic[latter_flight_model.predict([test_row])[0]]
    if sub_row['ARRIVAL STATUS_Prev_flight_late'] != 'NA':
        sub_row['ARRIVAL STATUS_Prev_flight_late'] = late

submission_csv.head(24)
    

Unnamed: 0,DATE,DAY,FLIGHT NUMBER,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,4/10/24,WEDNESDAY,UA 1400,UA,UA,ORD,6:52 PM,9:47 PM,early,,,
1,4/10/24,WEDNESDAY,AA 3402,AA,MQ,ORD,7:59 PM,10:52 PM,,early,early,early
2,4/10/24,WEDNESDAY,B6 116,B6,B6,JFK,1:33 PM,2:50 PM,late,,,
3,4/10/24,WEDNESDAY,DL 5182,DL,9E,JFK,2:55 PM,4:21 PM,,early,early,early
4,4/10/24,WEDNESDAY,WN 5285,WN,WN,MCO,11:05 AM,1:45 PM,early,,,
5,4/10/24,WEDNESDAY,B6 656,B6,B6,MCO,1:35 PM,4:25 PM,,late,late,late
6,4/11/24,THURSDAY,UA 1400,UA,UA,ORD,6:52 PM,9:47 PM,early,,,
7,4/11/24,THURSDAY,AA 3402,AA,MQ,ORD,7:59 PM,10:52 PM,,late,late,late
8,4/11/24,THURSDAY,B6 116,B6,B6,JFK,1:33 PM,2:50 PM,late,,,
9,4/11/24,THURSDAY,DL 5182,DL,9E,JFK,2:55 PM,4:21 PM,,early,early,early


In [95]:
submission_csv.drop(columns=['MKT_UNIQUE_CARRIER', 'OP_UNIQUE_CARRIER'], inplace=True)

In [96]:
submission_csv.to_csv('test_data/initial_submission_2.csv', index=False)