In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', None)

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

In [2]:
submission_csv = pd.read_csv('test_data/CIS_662 _INITIAL_Predictions.csv', keep_default_na=False)
former_flights_data = pd.read_csv('dataset/merged_data/former_flight_data.csv')
latter_flight_data = pd.read_csv('./dataset/merged_data/latter_flight_data.csv')
test_data = pd.read_csv('test_data/initial_test_data_for_prediction.csv')
latter_flight_data.head()
latter_flight_data.shape

Unnamed: 0,DAY_OF_WEEK,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,ARR_DELAY,SCH_DEP_TIME,SCH_ARR_TIME,ORGIN_WTH_temp,ORGIN_WTH_precip,ORGIN_WTH_precipprob,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,ORGIN_WTH_severerisk,DEST_WTH_temp,DEST_WTH_precip,DEST_WTH_precipprob,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DEST_WTH_severerisk,FORMER_FLIGHT_STATUS
0,6,WN,WN,MCO,-26.0,2022-01-01 10:30:00,2022-01-01 13:20:00,74.0,0.0,0,0.0,6.9,200.0,2.9,9.9,3.0,48.0,0.0,0,0.0,3.6,8.0,100.0,9.8,3.0,
1,6,UA,OO,ORD,-25.0,2022-01-01 10:40:00,2022-01-01 13:32:00,36.1,0.0,0,0.02,21.0,20.0,100.0,9.9,3.0,47.9,0.0,0,0.0,0.4,358.0,100.0,9.7,3.0,
2,6,B6,B6,MCO,22.0,2022-01-01 13:13:00,2022-01-01 15:56:00,83.0,0.0,0,0.0,9.9,199.0,4.3,9.9,3.0,47.7,0.0,0,0.0,7.9,311.0,100.0,7.8,3.0,early
3,6,B6,B6,JFK,36.0,2022-01-01 21:45:00,2022-01-01 22:59:00,52.8,0.15,100,0.0,8.1,40.0,100.0,2.2,3.0,37.9,0.02,100,0.0,6.1,303.0,100.0,6.8,3.0,
4,7,B6,B6,JFK,-12.0,2022-01-02 08:29:00,2022-01-02 09:50:00,52.1,0.0,0,0.0,0.0,0.0,100.0,5.9,3.0,25.0,0.0,0,0.01,13.8,303.0,100.0,1.2,3.0,


(6773, 26)

In [3]:
# Drop NaN for latter flight data
if True:
    latter_flight_data.dropna(subset=['FORMER_FLIGHT_STATUS'], inplace=True)

In [4]:
def delay_categories(delay):
    if delay < -7:
        return 0 #early
    elif delay > 7:
        return 2 #late
    else:
        return 1 #ontime

In [5]:
X_former_flights_data = former_flights_data.drop(columns=['ARR_DELAY'])
y_former_flights_data = former_flights_data['ARR_DELAY'].apply(delay_categories)

X_latter_flight_data = latter_flight_data.drop(columns=['ARR_DELAY'])
y_latter_flight_data = latter_flight_data['ARR_DELAY'].apply(delay_categories)

In [6]:
# Handle Categorical Variables
categorical_vars = ['DAY_OF_WEEK', 'MKT_UNIQUE_CARRIER',
                    'OP_UNIQUE_CARRIER', 'ORIGIN',
                    'ORGIN_WTH_precipprob', 'ORGIN_WTH_severerisk',
                    'DEST_WTH_precipprob', 'DEST_WTH_severerisk',
                    'FORMER_FLIGHT_STATUS', 'MONTH']

# categorical_vars = potential_categorical_vars.keys()

In [7]:
def preprocess(flight_data: pd.DataFrame):

    # Dealing with date and time
    flight_data['SCH_ARR_TIME'] = pd.to_datetime(flight_data['SCH_ARR_TIME'])
    flight_data['SCH_DEP_TIME'] = pd.to_datetime(flight_data['SCH_DEP_TIME'])

    flight_data['MONTH'] = flight_data['SCH_ARR_TIME'].dt.month
    flight_data['DAY'] = flight_data['SCH_ARR_TIME'].dt.day
    flight_data['DEP_MINUTES'] = flight_data['SCH_DEP_TIME'].dt.hour * 60 + flight_data['SCH_DEP_TIME'].dt.minute
    flight_data['ARR_MINUTES'] = flight_data['SCH_ARR_TIME'].dt.hour * 60 + flight_data['SCH_ARR_TIME'].dt.minute

    flight_data.drop(columns=['SCH_DEP_TIME', 'SCH_ARR_TIME'], inplace=True)

    # Dropping unwanted columns
    cols = [
        'ORGIN_WTH_temp', 'DEST_WTH_temp',
        'DEST_WTH_severerisk', 'ORGIN_WTH_severerisk',
        'DEST_WTH_precipprob', 'ORGIN_WTH_precipprob'
        ]
    flight_data.drop(columns=cols, inplace=True)
    
    cat_col = list(set(flight_data.columns).intersection(categorical_vars))
    flight_data = pd.get_dummies(flight_data, columns = list(cat_col), drop_first = False)

    return flight_data
    

In [8]:
latter_flight_data.head()


Unnamed: 0,DAY_OF_WEEK,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,ARR_DELAY,SCH_DEP_TIME,SCH_ARR_TIME,ORGIN_WTH_temp,ORGIN_WTH_precip,ORGIN_WTH_precipprob,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,ORGIN_WTH_severerisk,DEST_WTH_temp,DEST_WTH_precip,DEST_WTH_precipprob,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DEST_WTH_severerisk,FORMER_FLIGHT_STATUS
2,6,B6,B6,MCO,22.0,2022-01-01 13:13:00,2022-01-01 15:56:00,83.0,0.0,0,0.0,9.9,199.0,4.3,9.9,3.0,47.7,0.0,0,0.0,7.9,311.0,100.0,7.8,3.0,early
6,7,UA,OO,ORD,48.0,2022-01-02 10:40:00,2022-01-02 13:32:00,23.5,0.0,0,0.0,11.3,330.0,90.3,6.7,3.0,23.0,0.0,0,0.01,11.2,301.0,100.0,8.5,3.0,late
7,7,DL,9E,JFK,180.0,2022-01-02 12:55:00,2022-01-02 14:12:00,57.2,0.0,0,0.0,10.1,243.0,90.7,9.4,3.0,23.0,0.0,0,0.01,11.2,301.0,100.0,8.5,3.0,early
9,7,AA,MQ,ORD,35.0,2022-01-02 17:25:00,2022-01-02 20:12:00,25.7,0.0,0,0.0,6.4,329.0,24.2,9.9,3.0,24.0,0.0,0,0.01,6.8,338.0,99.9,9.9,3.0,late
10,7,UA,OO,ORD,136.0,2022-01-02 17:55:00,2022-01-02 20:52:00,24.5,0.0,0,0.0,7.7,315.0,24.2,9.9,3.0,21.7,0.0,0,0.01,6.9,343.0,98.6,8.8,3.0,late


In [9]:
# X = preprocess(X)
X_former_flights_data = preprocess(X_former_flights_data)
X_latter_flight_data = preprocess(X_latter_flight_data)
test_data = preprocess(test_data)

In [10]:
# Get missing columns in the prediction data
missing_cols = set(X_former_flights_data.columns) - set(test_data.columns)
# Add a zero column for missing columns in prediction data
for c in missing_cols:
    test_data[c] = 0

# Ensure the order of columns in prediction data matches that of flight_data_encoded
test_data = test_data[X_former_flights_data.columns]
test_data.columns
test_data.shape
# Now, prediction_data_encoded should have the same columns as flight_data_encoded

Index(['ORGIN_WTH_precip', 'ORGIN_WTH_snow', 'ORGIN_WTH_windspeed',
       'ORGIN_WTH_winddir', 'ORGIN_WTH_cloudcover', 'ORGIN_WTH_visibility',
       'DEST_WTH_precip', 'DEST_WTH_snow', 'DEST_WTH_windspeed',
       'DEST_WTH_winddir', 'DEST_WTH_cloudcover', 'DEST_WTH_visibility', 'DAY',
       'DEP_MINUTES', 'ARR_MINUTES', 'OP_UNIQUE_CARRIER_9E',
       'OP_UNIQUE_CARRIER_B6', 'OP_UNIQUE_CARRIER_G7', 'OP_UNIQUE_CARRIER_MQ',
       'OP_UNIQUE_CARRIER_OO', 'OP_UNIQUE_CARRIER_PT', 'OP_UNIQUE_CARRIER_UA',
       'OP_UNIQUE_CARRIER_WN', 'OP_UNIQUE_CARRIER_YX', 'OP_UNIQUE_CARRIER_ZW',
       'DAY_OF_WEEK_1', 'DAY_OF_WEEK_2', 'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4',
       'DAY_OF_WEEK_5', 'DAY_OF_WEEK_6', 'DAY_OF_WEEK_7', 'MONTH_1', 'MONTH_2',
       'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6', 'MONTH_7', 'MONTH_8',
       'MONTH_9', 'MONTH_10', 'MONTH_11', 'MONTH_12', 'ORIGIN_JFK',
       'ORIGIN_MCO', 'ORIGIN_ORD', 'MKT_UNIQUE_CARRIER_AA',
       'MKT_UNIQUE_CARRIER_B6', 'MKT_UNIQUE_CARRIER_DL',


(23, 52)

In [11]:
# Check if PCA using SVD gives same results
from sklearn.decomposition import PCA
# initializing the PCA transformer
pca_former = PCA(n_components = 4)
# dimensionality reduction:
data_pca_former = pd.DataFrame(pca_former.fit_transform(X_former_flights_data), index = X_former_flights_data.index)
data_pca_former.head()

Unnamed: 0,0,1,2,3
0,348.88,-93.41,-42.11,39.29
1,-241.15,-118.15,-88.77,27.7
2,-77.41,-91.47,-45.07,24.56
3,-19.48,-89.07,-43.93,23.8
4,-554.98,-2.98,246.2,49.74


In [12]:
# Check if PCA using SVD gives same results
from sklearn.decomposition import PCA
# initializing the PCA transformer
pca_latter = PCA(n_components = 4)
# dimensionality reduction:
data_pca_latter = pd.DataFrame(pca_latter.fit_transform(X_latter_flight_data), index = X_latter_flight_data.index)
data_pca_latter.head()

Unnamed: 0,0,1,2,3
2,262.41,-79.53,-51.37,36.98
6,474.01,-138.74,60.34,-43.76
7,348.34,-91.85,-8.64,-14.99
9,-90.72,-210.29,14.26,7.41
10,-139.81,-208.43,-1.75,4.57


In [13]:
# Get missing columns in the prediction data
missing_cols = set(X_former_flights_data.columns) - set(test_data.columns)
# Add a zero column for missing columns in prediction data
for c in missing_cols:
    test_data[c] = 0

# Ensure the order of columns in prediction data matches that of flight_data_encoded
test_data = test_data[X_former_flights_data.columns]
test_data.columns
test_data.shape
# Now, prediction_data_encoded should have the same columns as flight_data_encoded

Index(['ORGIN_WTH_precip', 'ORGIN_WTH_snow', 'ORGIN_WTH_windspeed',
       'ORGIN_WTH_winddir', 'ORGIN_WTH_cloudcover', 'ORGIN_WTH_visibility',
       'DEST_WTH_precip', 'DEST_WTH_snow', 'DEST_WTH_windspeed',
       'DEST_WTH_winddir', 'DEST_WTH_cloudcover', 'DEST_WTH_visibility', 'DAY',
       'DEP_MINUTES', 'ARR_MINUTES', 'OP_UNIQUE_CARRIER_9E',
       'OP_UNIQUE_CARRIER_B6', 'OP_UNIQUE_CARRIER_G7', 'OP_UNIQUE_CARRIER_MQ',
       'OP_UNIQUE_CARRIER_OO', 'OP_UNIQUE_CARRIER_PT', 'OP_UNIQUE_CARRIER_UA',
       'OP_UNIQUE_CARRIER_WN', 'OP_UNIQUE_CARRIER_YX', 'OP_UNIQUE_CARRIER_ZW',
       'DAY_OF_WEEK_1', 'DAY_OF_WEEK_2', 'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4',
       'DAY_OF_WEEK_5', 'DAY_OF_WEEK_6', 'DAY_OF_WEEK_7', 'MONTH_1', 'MONTH_2',
       'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6', 'MONTH_7', 'MONTH_8',
       'MONTH_9', 'MONTH_10', 'MONTH_11', 'MONTH_12', 'ORIGIN_JFK',
       'ORIGIN_MCO', 'ORIGIN_ORD', 'MKT_UNIQUE_CARRIER_AA',
       'MKT_UNIQUE_CARRIER_B6', 'MKT_UNIQUE_CARRIER_DL',


(23, 52)

In [14]:
former_flight_model = xgb.XGBClassifier(learning_rate=0.01, max_depth=1, min_child_weight=2, n_estimators=600, reg_lambda=0.007)

former_flight_model = former_flight_model.fit(data_pca_former, y_former_flights_data)
former_flight_model.score(data_pca_former, y_former_flights_data)
feat_imp_former = pd.Series(former_flight_model.feature_importances_, data_pca_former.columns.values).sort_values(ascending=False)
feat_imp_former.head(15)

latter_flight_model = xgb.XGBClassifier(learning_rate=0.01, max_depth=1, min_child_weight=2, n_estimators=600, reg_lambda=0.007)

latter_flight_model = latter_flight_model.fit(data_pca_latter, y_latter_flight_data)
latter_flight_model.score(data_pca_latter, y_latter_flight_data)
feat_imp_latter = pd.Series(latter_flight_model.feature_importances_, data_pca_latter.columns.values).sort_values(ascending=False)
feat_imp_latter.head(15)


0.43909641222501106

3   0.39
0   0.34
1   0.16
2   0.11
dtype: float32

0.4437869822485207

0   0.35
3   0.30
1   0.19
2   0.16
dtype: float32

In [15]:
status_dic = {0: 'early', 1: 'ontime', 2: 'late'}
for index, sub_row in submission_csv.iterrows():
    test_row = test_data.iloc[index].copy()


    # Predict Former    
    test_df = pd.DataFrame(test_row).transpose()
    former_row = pd.DataFrame(pca_former.transform(test_df), index = test_df.index)
    former = status_dic[former_flight_model.predict(former_row)[0]]
    if sub_row['ARRIVAL STATUS'] != 'NA':
        sub_row['ARRIVAL STATUS'] = former

    # Predict Latter - Former Early
    test_row['FORMER_FLIGHT_STATUS_late'] = 0
    test_row['FORMER_FLIGHT_STATUS_on-time'] = 0
    test_row['FORMER_FLIGHT_STATUS_early'] = 1
    test_df = pd.DataFrame(test_row).transpose()
    test_df = test_df[X_latter_flight_data.columns]
    early_row = pd.DataFrame(pca_latter.transform(test_df), index = test_df.index)
    early = status_dic[latter_flight_model.predict(early_row)[0]]
    if sub_row['ARRIVAL STATUS_Prev_flight_early'] != 'NA':
        sub_row['ARRIVAL STATUS_Prev_flight_early'] = early

    # Predict Latter - Former ontime
    test_row['FORMER_FLIGHT_STATUS_late'] = 0
    test_row['FORMER_FLIGHT_STATUS_on-time'] = 1
    test_row['FORMER_FLIGHT_STATUS_early'] = 0
    test_df = pd.DataFrame(test_row).transpose()
    test_df = test_df[X_latter_flight_data.columns]
    ontime_row = pd.DataFrame(pca_latter.transform(test_df), index = test_df.index)
    ontime = status_dic[latter_flight_model.predict(ontime_row)[0]]
    if sub_row['ARRIVAL STATUS_Prev_flight_ontime'] != 'NA':
        sub_row['ARRIVAL STATUS_Prev_flight_ontime'] = ontime
    
    # Predict Latter - Former late
    test_row['FORMER_FLIGHT_STATUS_late'] = 1
    test_row['FORMER_FLIGHT_STATUS_on-time'] = 0
    test_row['FORMER_FLIGHT_STATUS_early'] = 0
    test_df = pd.DataFrame(test_row).transpose()
    test_df = test_df[X_latter_flight_data.columns]
    late_row = pd.DataFrame(pca_latter.transform(test_df), index = test_df.index)
    late = status_dic[latter_flight_model.predict(late_row)[0]]
    if sub_row['ARRIVAL STATUS_Prev_flight_late'] != 'NA':
        sub_row['ARRIVAL STATUS_Prev_flight_late'] = late

submission_csv.head(24)
    

Unnamed: 0,DATE,DAY,FLIGHT NUMBER,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,4/10/24,WEDNESDAY,UA 1400,UA,UA,ORD,6:52 PM,9:47 PM,early,,,
1,4/10/24,WEDNESDAY,AA 3402,AA,MQ,ORD,7:59 PM,10:52 PM,,early,early,early
2,4/10/24,WEDNESDAY,B6 116,B6,B6,JFK,1:33 PM,2:50 PM,early,,,
3,4/10/24,WEDNESDAY,DL 5182,DL,9E,JFK,2:55 PM,4:21 PM,,early,early,early
4,4/10/24,WEDNESDAY,WN 5285,WN,WN,MCO,11:05 AM,1:45 PM,early,,,
5,4/10/24,WEDNESDAY,B6 656,B6,B6,MCO,1:35 PM,4:25 PM,,early,early,early
6,4/11/24,THURSDAY,UA 1400,UA,UA,ORD,6:52 PM,9:47 PM,early,,,
7,4/11/24,THURSDAY,AA 3402,AA,MQ,ORD,7:59 PM,10:52 PM,,early,early,early
8,4/11/24,THURSDAY,B6 116,B6,B6,JFK,1:33 PM,2:50 PM,early,,,
9,4/11/24,THURSDAY,DL 5182,DL,9E,JFK,2:55 PM,4:21 PM,,early,early,early
