In [237]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', None)

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

In [284]:
submission_csv = pd.read_csv('test_data/CIS_662 _INITIAL_Predictions.csv', keep_default_na=False)
former_flights_data = pd.read_csv('dataset/merged_data/former_flight_data.csv')
latter_flight_data = pd.read_csv('./dataset/merged_data/latter_flight_data.csv')
test_data = pd.read_csv('test_data/initial_test_data_for_prediction.csv')
latter_flight_data.head()
latter_flight_data.shape

Unnamed: 0,DAY_OF_WEEK,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,ARR_DELAY,SCH_DEP_TIME,SCH_ARR_TIME,ORGIN_WTH_temp,ORGIN_WTH_precip,ORGIN_WTH_precipprob,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,ORGIN_WTH_severerisk,DEST_WTH_temp,DEST_WTH_precip,DEST_WTH_precipprob,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DEST_WTH_severerisk,FORMER_FLIGHT_STATUS
0,6,WN,WN,MCO,-26.0,2022-01-01 10:30:00,2022-01-01 13:20:00,74.0,0.0,0,0.0,6.9,200.0,2.9,9.9,3.0,48.0,0.0,0,0.0,3.6,8.0,100.0,9.8,3.0,
1,6,UA,OO,ORD,-25.0,2022-01-01 10:40:00,2022-01-01 13:32:00,36.1,0.0,0,0.02,21.0,20.0,100.0,9.9,3.0,47.9,0.0,0,0.0,0.4,358.0,100.0,9.7,3.0,
2,6,B6,B6,MCO,22.0,2022-01-01 13:13:00,2022-01-01 15:56:00,83.0,0.0,0,0.0,9.9,199.0,4.3,9.9,3.0,47.7,0.0,0,0.0,7.9,311.0,100.0,7.8,3.0,early
3,6,B6,B6,JFK,36.0,2022-01-01 21:45:00,2022-01-01 22:59:00,52.8,0.15,100,0.0,8.1,40.0,100.0,2.2,3.0,37.9,0.02,100,0.0,6.1,303.0,100.0,6.8,3.0,
4,7,B6,B6,JFK,-12.0,2022-01-02 08:29:00,2022-01-02 09:50:00,52.1,0.0,0,0.0,0.0,0.0,100.0,5.9,3.0,25.0,0.0,0,0.01,13.8,303.0,100.0,1.2,3.0,


(6773, 26)

In [285]:
# Drop NaN for latter flight data
if True:
    latter_flight_data.dropna(subset=['FORMER_FLIGHT_STATUS'], inplace=True)

In [286]:
def categorize_delay(delay):
    if delay < -7:
        return 0 #early
    elif delay > 6:
        return 2 #late
    else:
        return 1 #ontime

In [287]:
X_former_flights_data = former_flights_data.drop(columns=['ARR_DELAY'])
y_former_flights_data = former_flights_data['ARR_DELAY'].apply(categorize_delay)

X_latter_flight_data = latter_flight_data.drop(columns=['ARR_DELAY'])
y_latter_flight_data = latter_flight_data['ARR_DELAY'].apply(categorize_delay)

In [288]:
# X = flight_data.drop(columns=['ARR_DELAY'])
# y = flight_data['ARR_DELAY']

In [289]:
# potential_categorical_vars = {}
# # For categorical variables if we are considering variables which were coded to integers, but are actually catagorical.
# if True:
#     for col in flight_data.columns:
#         unique_col_vals = flight_data[col].unique()
#         if(len(unique_col_vals) < 15):
#             potential_categorical_vars[col] = unique_col_vals
# potential_categorical_vars

In [290]:
# Handle Categorical Variables
categorical_vars = ['DAY_OF_WEEK', 'MKT_UNIQUE_CARRIER',
                    'OP_UNIQUE_CARRIER', 'ORIGIN',
                    'ORGIN_WTH_precipprob', 'ORGIN_WTH_severerisk',
                    'DEST_WTH_precipprob', 'DEST_WTH_severerisk',
                    'FORMER_FLIGHT_STATUS', 'MONTH']

# categorical_vars = potential_categorical_vars.keys()

In [291]:
def preprocess(flight_data: pd.DataFrame):

    # Dealing with date and time
    flight_data['SCH_ARR_TIME'] = pd.to_datetime(flight_data['SCH_ARR_TIME'])
    flight_data['SCH_DEP_TIME'] = pd.to_datetime(flight_data['SCH_DEP_TIME'])

    flight_data['MONTH'] = flight_data['SCH_ARR_TIME'].dt.month
    flight_data['DAY'] = flight_data['SCH_ARR_TIME'].dt.day
    flight_data['DEP_MINUTES'] = flight_data['SCH_DEP_TIME'].dt.hour * 60 + flight_data['SCH_DEP_TIME'].dt.minute
    flight_data['ARR_MINUTES'] = flight_data['SCH_ARR_TIME'].dt.hour * 60 + flight_data['SCH_ARR_TIME'].dt.minute

    flight_data.drop(columns=['SCH_DEP_TIME', 'SCH_ARR_TIME'], inplace=True)

    # Dropping unwanted columns
    cols = [
        'ORGIN_WTH_temp', 'DEST_WTH_temp',
        'DEST_WTH_severerisk', 'ORGIN_WTH_severerisk',
        'DEST_WTH_precipprob', 'ORGIN_WTH_precipprob'
        ]
    flight_data.drop(columns=cols, inplace=True)
    
    cat_col = list(set(flight_data.columns).intersection(categorical_vars))
    flight_data = pd.get_dummies(flight_data, columns = list(cat_col), drop_first = False)

    return flight_data
    

In [292]:
latter_flight_data.head()


Unnamed: 0,DAY_OF_WEEK,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,ARR_DELAY,SCH_DEP_TIME,SCH_ARR_TIME,ORGIN_WTH_temp,ORGIN_WTH_precip,ORGIN_WTH_precipprob,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,ORGIN_WTH_severerisk,DEST_WTH_temp,DEST_WTH_precip,DEST_WTH_precipprob,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DEST_WTH_severerisk,FORMER_FLIGHT_STATUS
2,6,B6,B6,MCO,22.0,2022-01-01 13:13:00,2022-01-01 15:56:00,83.0,0.0,0,0.0,9.9,199.0,4.3,9.9,3.0,47.7,0.0,0,0.0,7.9,311.0,100.0,7.8,3.0,early
6,7,UA,OO,ORD,48.0,2022-01-02 10:40:00,2022-01-02 13:32:00,23.5,0.0,0,0.0,11.3,330.0,90.3,6.7,3.0,23.0,0.0,0,0.01,11.2,301.0,100.0,8.5,3.0,late
7,7,DL,9E,JFK,180.0,2022-01-02 12:55:00,2022-01-02 14:12:00,57.2,0.0,0,0.0,10.1,243.0,90.7,9.4,3.0,23.0,0.0,0,0.01,11.2,301.0,100.0,8.5,3.0,early
9,7,AA,MQ,ORD,35.0,2022-01-02 17:25:00,2022-01-02 20:12:00,25.7,0.0,0,0.0,6.4,329.0,24.2,9.9,3.0,24.0,0.0,0,0.01,6.8,338.0,99.9,9.9,3.0,late
10,7,UA,OO,ORD,136.0,2022-01-02 17:55:00,2022-01-02 20:52:00,24.5,0.0,0,0.0,7.7,315.0,24.2,9.9,3.0,21.7,0.0,0,0.01,6.9,343.0,98.6,8.8,3.0,late


In [293]:
# X = preprocess(X)
X_former_flights_data = preprocess(X_former_flights_data)
X_latter_flight_data = preprocess(X_latter_flight_data)
test_data = preprocess(test_data)

In [294]:
# Get missing columns in the prediction data
missing_cols = set(X_former_flights_data.columns) - set(test_data.columns)
# Add a zero column for missing columns in prediction data
for c in missing_cols:
    test_data[c] = 0

# Ensure the order of columns in prediction data matches that of flight_data_encoded
test_data = test_data[X_former_flights_data.columns]
test_data.columns
test_data.shape
# Now, prediction_data_encoded should have the same columns as flight_data_encoded

Index(['ORGIN_WTH_precip', 'ORGIN_WTH_snow', 'ORGIN_WTH_windspeed',
       'ORGIN_WTH_winddir', 'ORGIN_WTH_cloudcover', 'ORGIN_WTH_visibility',
       'DEST_WTH_precip', 'DEST_WTH_snow', 'DEST_WTH_windspeed',
       'DEST_WTH_winddir', 'DEST_WTH_cloudcover', 'DEST_WTH_visibility', 'DAY',
       'DEP_MINUTES', 'ARR_MINUTES', 'OP_UNIQUE_CARRIER_9E',
       'OP_UNIQUE_CARRIER_B6', 'OP_UNIQUE_CARRIER_G7', 'OP_UNIQUE_CARRIER_MQ',
       'OP_UNIQUE_CARRIER_OO', 'OP_UNIQUE_CARRIER_PT', 'OP_UNIQUE_CARRIER_UA',
       'OP_UNIQUE_CARRIER_WN', 'OP_UNIQUE_CARRIER_YX', 'OP_UNIQUE_CARRIER_ZW',
       'MONTH_1', 'MONTH_2', 'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6',
       'MONTH_7', 'MONTH_8', 'MONTH_9', 'MONTH_10', 'MONTH_11', 'MONTH_12',
       'MKT_UNIQUE_CARRIER_AA', 'MKT_UNIQUE_CARRIER_B6',
       'MKT_UNIQUE_CARRIER_DL', 'MKT_UNIQUE_CARRIER_UA',
       'MKT_UNIQUE_CARRIER_WN', 'DAY_OF_WEEK_1', 'DAY_OF_WEEK_2',
       'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5', 'DAY_OF_WEEK_6',
       'D

(23, 52)

In [295]:
from sklearn.preprocessing import StandardScaler
sc_former = StandardScaler()
subset_data_former = pd.DataFrame(sc_former.fit_transform(X_former_flights_data[X_former_flights_data.columns]), columns = X_former_flights_data[X_former_flights_data.columns].columns, index = X_former_flights_data.index)
subset_data_former.shape
subset_data_former.columns
subset_data_former.head()

(6773, 52)

Index(['ORGIN_WTH_precip', 'ORGIN_WTH_snow', 'ORGIN_WTH_windspeed',
       'ORGIN_WTH_winddir', 'ORGIN_WTH_cloudcover', 'ORGIN_WTH_visibility',
       'DEST_WTH_precip', 'DEST_WTH_snow', 'DEST_WTH_windspeed',
       'DEST_WTH_winddir', 'DEST_WTH_cloudcover', 'DEST_WTH_visibility', 'DAY',
       'DEP_MINUTES', 'ARR_MINUTES', 'OP_UNIQUE_CARRIER_9E',
       'OP_UNIQUE_CARRIER_B6', 'OP_UNIQUE_CARRIER_G7', 'OP_UNIQUE_CARRIER_MQ',
       'OP_UNIQUE_CARRIER_OO', 'OP_UNIQUE_CARRIER_PT', 'OP_UNIQUE_CARRIER_UA',
       'OP_UNIQUE_CARRIER_WN', 'OP_UNIQUE_CARRIER_YX', 'OP_UNIQUE_CARRIER_ZW',
       'MONTH_1', 'MONTH_2', 'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6',
       'MONTH_7', 'MONTH_8', 'MONTH_9', 'MONTH_10', 'MONTH_11', 'MONTH_12',
       'MKT_UNIQUE_CARRIER_AA', 'MKT_UNIQUE_CARRIER_B6',
       'MKT_UNIQUE_CARRIER_DL', 'MKT_UNIQUE_CARRIER_UA',
       'MKT_UNIQUE_CARRIER_WN', 'DAY_OF_WEEK_1', 'DAY_OF_WEEK_2',
       'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5', 'DAY_OF_WEEK_6',
       'D

Unnamed: 0,ORGIN_WTH_precip,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,DEST_WTH_precip,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DAY,DEP_MINUTES,ARR_MINUTES,OP_UNIQUE_CARRIER_9E,OP_UNIQUE_CARRIER_B6,OP_UNIQUE_CARRIER_G7,OP_UNIQUE_CARRIER_MQ,OP_UNIQUE_CARRIER_OO,OP_UNIQUE_CARRIER_PT,OP_UNIQUE_CARRIER_UA,OP_UNIQUE_CARRIER_WN,OP_UNIQUE_CARRIER_YX,OP_UNIQUE_CARRIER_ZW,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12,MKT_UNIQUE_CARRIER_AA,MKT_UNIQUE_CARRIER_B6,MKT_UNIQUE_CARRIER_DL,MKT_UNIQUE_CARRIER_UA,MKT_UNIQUE_CARRIER_WN,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,ORIGIN_JFK,ORIGIN_MCO,ORIGIN_ORD
0,-0.14,-0.08,-0.3,0.39,-1.43,0.32,-0.14,-0.08,-0.32,1.01,0.27,0.39,-1.46,-1.0,-0.88,-0.51,-0.55,-0.18,2.43,-0.33,-0.09,-0.37,-0.16,-0.34,-0.16,3.39,-0.28,-0.31,-0.31,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,1.76,-0.55,-0.52,-0.63,-0.16,2.42,-0.4,-0.4,-0.41,-0.41,-0.42,-0.41,-0.75,-0.36,0.94
1,-0.14,-0.08,0.08,0.19,-0.81,0.32,-0.14,-0.08,-0.52,1.25,-1.65,0.39,-1.46,0.62,0.73,-0.51,-0.55,-0.18,2.43,-0.33,-0.09,-0.37,-0.16,-0.34,-0.16,3.39,-0.28,-0.31,-0.31,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,1.76,-0.55,-0.52,-0.63,-0.16,2.42,-0.4,-0.4,-0.41,-0.41,-0.42,-0.41,-0.75,-0.36,0.94
2,-0.14,-0.08,0.7,0.31,-0.81,0.32,-0.14,-0.08,-0.54,0.85,-1.03,0.39,-1.46,0.17,0.28,-0.51,-0.55,-0.18,2.43,-0.33,-0.09,-0.37,-0.16,-0.34,-0.16,3.39,-0.28,-0.31,-0.31,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,1.76,-0.55,-0.52,-0.63,-0.16,2.42,-0.4,-0.4,-0.41,-0.41,-0.42,-0.41,-0.75,-0.36,0.94
3,-0.14,-0.08,0.7,0.31,-0.81,0.32,-0.14,-0.08,-0.54,0.85,-1.03,0.39,-1.46,0.0,0.13,-0.51,-0.55,5.5,-0.41,-0.33,-0.09,-0.37,-0.16,-0.34,-0.16,3.39,-0.28,-0.31,-0.31,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,-0.57,-0.55,-0.52,1.58,-0.16,2.42,-0.4,-0.4,-0.41,-0.41,-0.42,-0.41,-0.75,-0.36,0.94
4,-0.14,-0.08,0.08,1.5,-1.25,0.32,-0.14,-0.08,-1.59,-1.86,0.63,-0.15,-1.46,1.61,1.38,-0.51,1.81,-0.18,-0.41,-0.33,-0.09,-0.37,-0.16,-0.34,-0.16,3.39,-0.28,-0.31,-0.31,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,-0.57,1.81,-0.52,-0.63,-0.16,2.42,-0.4,-0.4,-0.41,-0.41,-0.42,-0.41,1.34,-0.36,-1.06


In [296]:
from sklearn.preprocessing import StandardScaler
sc_latter = StandardScaler()
subset_data_latter = pd.DataFrame(sc_latter.fit_transform(X_latter_flight_data[X_latter_flight_data.columns]), columns = X_latter_flight_data[X_latter_flight_data.columns].columns, index = X_latter_flight_data.index)
subset_data_latter.shape
subset_data_latter.columns
subset_data_latter.head()

(4732, 55)

Index(['ORGIN_WTH_precip', 'ORGIN_WTH_snow', 'ORGIN_WTH_windspeed',
       'ORGIN_WTH_winddir', 'ORGIN_WTH_cloudcover', 'ORGIN_WTH_visibility',
       'DEST_WTH_precip', 'DEST_WTH_snow', 'DEST_WTH_windspeed',
       'DEST_WTH_winddir', 'DEST_WTH_cloudcover', 'DEST_WTH_visibility', 'DAY',
       'DEP_MINUTES', 'ARR_MINUTES', 'FORMER_FLIGHT_STATUS_early',
       'FORMER_FLIGHT_STATUS_late', 'FORMER_FLIGHT_STATUS_on-time',
       'OP_UNIQUE_CARRIER_9E', 'OP_UNIQUE_CARRIER_B6', 'OP_UNIQUE_CARRIER_G7',
       'OP_UNIQUE_CARRIER_MQ', 'OP_UNIQUE_CARRIER_OO', 'OP_UNIQUE_CARRIER_PT',
       'OP_UNIQUE_CARRIER_UA', 'OP_UNIQUE_CARRIER_WN', 'OP_UNIQUE_CARRIER_YX',
       'OP_UNIQUE_CARRIER_ZW', 'MKT_UNIQUE_CARRIER_AA',
       'MKT_UNIQUE_CARRIER_B6', 'MKT_UNIQUE_CARRIER_DL',
       'MKT_UNIQUE_CARRIER_UA', 'MKT_UNIQUE_CARRIER_WN', 'MONTH_1', 'MONTH_2',
       'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6', 'MONTH_7', 'MONTH_8',
       'MONTH_9', 'MONTH_10', 'MONTH_11', 'MONTH_12', 'DAY_OF_WEEK_1',
   

Unnamed: 0,ORGIN_WTH_precip,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,DEST_WTH_precip,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DAY,DEP_MINUTES,ARR_MINUTES,FORMER_FLIGHT_STATUS_early,FORMER_FLIGHT_STATUS_late,FORMER_FLIGHT_STATUS_on-time,OP_UNIQUE_CARRIER_9E,OP_UNIQUE_CARRIER_B6,OP_UNIQUE_CARRIER_G7,OP_UNIQUE_CARRIER_MQ,OP_UNIQUE_CARRIER_OO,OP_UNIQUE_CARRIER_PT,OP_UNIQUE_CARRIER_UA,OP_UNIQUE_CARRIER_WN,OP_UNIQUE_CARRIER_YX,OP_UNIQUE_CARRIER_ZW,MKT_UNIQUE_CARRIER_AA,MKT_UNIQUE_CARRIER_B6,MKT_UNIQUE_CARRIER_DL,MKT_UNIQUE_CARRIER_UA,MKT_UNIQUE_CARRIER_WN,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,ORIGIN_JFK,ORIGIN_MCO,ORIGIN_ORD
2,-0.14,-0.08,0.07,0.15,-1.37,0.32,-0.14,-0.07,-0.08,1.04,0.82,-0.7,-1.69,-0.86,-0.77,1.03,-0.64,-0.54,-0.51,2.13,-0.18,-0.43,-0.34,-0.08,-0.46,-0.09,-0.39,-0.07,-0.63,2.13,-0.51,-0.68,-0.09,3.47,-0.28,-0.3,-0.31,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,-0.4,-0.41,-0.41,2.4,-0.41,-0.76,5.42,-1.23
6,-0.14,-0.08,0.36,1.39,0.85,-1.73,-0.14,0.11,0.53,0.94,0.82,-0.34,-1.58,-1.55,-1.43,-0.97,1.56,-0.54,-0.51,-0.47,-0.18,-0.43,2.97,-0.08,-0.46,-0.09,-0.39,-0.07,-0.63,-0.47,-0.51,1.47,-0.09,3.47,-0.28,-0.3,-0.31,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,-0.4,-0.41,-0.41,-0.42,2.46,-0.76,-0.18,0.81
7,-0.14,-0.08,0.11,0.56,0.86,-0.0,-0.14,0.11,0.53,0.94,0.82,-0.34,-1.58,-0.94,-1.25,1.03,-0.64,-0.54,1.96,-0.47,-0.18,-0.43,-0.34,-0.08,-0.46,-0.09,-0.39,-0.07,-0.63,-0.47,1.94,-0.68,-0.09,3.47,-0.28,-0.3,-0.31,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,-0.4,-0.41,-0.41,-0.42,2.46,1.32,-0.18,-1.23
9,-0.14,-0.08,-0.68,1.38,-0.86,0.32,-0.14,0.11,-0.28,1.28,0.82,0.38,-1.58,0.27,0.41,-0.97,1.56,-0.54,-0.51,-0.47,-0.18,2.31,-0.34,-0.08,-0.46,-0.09,-0.39,-0.07,1.58,-0.47,-0.51,-0.68,-0.09,3.47,-0.28,-0.3,-0.31,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,-0.4,-0.41,-0.41,-0.42,2.46,-0.76,-0.18,0.81
10,-0.14,-0.08,-0.4,1.25,-0.86,0.32,-0.14,0.11,-0.26,1.33,0.77,-0.18,-1.58,0.41,0.59,-0.97,1.56,-0.54,-0.51,-0.47,-0.18,-0.43,2.97,-0.08,-0.46,-0.09,-0.39,-0.07,-0.63,-0.47,-0.51,1.47,-0.09,3.47,-0.28,-0.3,-0.31,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,-0.4,-0.41,-0.41,-0.42,2.46,-0.76,-0.18,0.81


In [297]:
# cols1 = [
#     "MKT_UNIQUE_CARRIER_DL",
#     "OP_UNIQUE_CARRIER_B6",
#     "ORGIN_WTH_visibility",
#     "ORGIN_WTH_precip",
#     "DEST_WTH_visibility",
#     "ORIGIN_ORD",
#     "DEP_MINUTES",
#     "ORGIN_WTH_snow",
#     "ORGIN_WTH_cloudcover",
#     "DEST_WTH_snow",
#     "OP_UNIQUE_CARRIER_OO",
#     "DEST_WTH_precip",
#     "ARR_MINUTES",
#     "DEST_WTH_winddir",
#     "OP_UNIQUE_CARRIER_MQ"
# ]

# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# subset_data = pd.DataFrame(sc.fit_transform(X[cols1]), columns = X[cols1].columns, index = X.index)
# subset_data.shape
# subset_data.columns
# subset_data.head()

In [298]:
# X.columns

In [299]:
# # Check if PCA using SVD gives same results
# from sklearn.decomposition import PCA
# # initializing the PCA transformer
# pca = PCA(n_components = 3)
# # dimensionality reduction:
# data_pca = pd.DataFrame(pca.fit_transform(subset_data), index = subset_data.index)
# data_pca.head()

In [300]:
# Check if PCA using SVD gives same results
from sklearn.decomposition import PCA
# initializing the PCA transformer
pca_former = PCA(n_components = 3)
# dimensionality reduction:
data_pca_former = pd.DataFrame(pca_former.fit_transform(subset_data_former), index = subset_data_former.index)
data_pca_former.head()

Unnamed: 0,0,1,2
0,-1.5,0.02,0.43
1,-1.65,-0.37,-1.26
2,-1.77,-0.23,-0.82
3,-1.75,0.01,0.63
4,2.53,0.8,-3.25


In [301]:
# Check if PCA using SVD gives same results
from sklearn.decomposition import PCA
# initializing the PCA transformer
pca_latter = PCA(n_components = 3)
# dimensionality reduction:
data_pca_latter = pd.DataFrame(pca_latter.fit_transform(subset_data_latter), index = subset_data_latter.index)
data_pca_latter.head()

Unnamed: 0,0,1,2
2,2.26,-0.94,4.7
6,-2.09,2.29,0.57
7,2.25,3.09,-0.8
9,-1.19,-0.11,-0.32
10,-1.26,0.31,-0.21


In [302]:
# # Plotting the principal components
# plt.figure(figsize=(10, 6))
# plt.scatter(data_pca[0], data_pca[1], color='red', alpha=0.5)
# for i, txt in enumerate(subset_data.index):
#     plt.text(data_pca.iloc[i, 0], data_pca.iloc[i, 1], txt, fontsize=8)

# plt.title('PCA Results')
# plt.xlabel('Principal Component 1')
# plt.ylabel('Principal Component 2')
# plt.grid(True)
# plt.show()

In [303]:
subset_data_former.head()

Unnamed: 0,ORGIN_WTH_precip,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,DEST_WTH_precip,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DAY,DEP_MINUTES,ARR_MINUTES,OP_UNIQUE_CARRIER_9E,OP_UNIQUE_CARRIER_B6,OP_UNIQUE_CARRIER_G7,OP_UNIQUE_CARRIER_MQ,OP_UNIQUE_CARRIER_OO,OP_UNIQUE_CARRIER_PT,OP_UNIQUE_CARRIER_UA,OP_UNIQUE_CARRIER_WN,OP_UNIQUE_CARRIER_YX,OP_UNIQUE_CARRIER_ZW,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12,MKT_UNIQUE_CARRIER_AA,MKT_UNIQUE_CARRIER_B6,MKT_UNIQUE_CARRIER_DL,MKT_UNIQUE_CARRIER_UA,MKT_UNIQUE_CARRIER_WN,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,ORIGIN_JFK,ORIGIN_MCO,ORIGIN_ORD
0,-0.14,-0.08,-0.3,0.39,-1.43,0.32,-0.14,-0.08,-0.32,1.01,0.27,0.39,-1.46,-1.0,-0.88,-0.51,-0.55,-0.18,2.43,-0.33,-0.09,-0.37,-0.16,-0.34,-0.16,3.39,-0.28,-0.31,-0.31,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,1.76,-0.55,-0.52,-0.63,-0.16,2.42,-0.4,-0.4,-0.41,-0.41,-0.42,-0.41,-0.75,-0.36,0.94
1,-0.14,-0.08,0.08,0.19,-0.81,0.32,-0.14,-0.08,-0.52,1.25,-1.65,0.39,-1.46,0.62,0.73,-0.51,-0.55,-0.18,2.43,-0.33,-0.09,-0.37,-0.16,-0.34,-0.16,3.39,-0.28,-0.31,-0.31,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,1.76,-0.55,-0.52,-0.63,-0.16,2.42,-0.4,-0.4,-0.41,-0.41,-0.42,-0.41,-0.75,-0.36,0.94
2,-0.14,-0.08,0.7,0.31,-0.81,0.32,-0.14,-0.08,-0.54,0.85,-1.03,0.39,-1.46,0.17,0.28,-0.51,-0.55,-0.18,2.43,-0.33,-0.09,-0.37,-0.16,-0.34,-0.16,3.39,-0.28,-0.31,-0.31,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,1.76,-0.55,-0.52,-0.63,-0.16,2.42,-0.4,-0.4,-0.41,-0.41,-0.42,-0.41,-0.75,-0.36,0.94
3,-0.14,-0.08,0.7,0.31,-0.81,0.32,-0.14,-0.08,-0.54,0.85,-1.03,0.39,-1.46,0.0,0.13,-0.51,-0.55,5.5,-0.41,-0.33,-0.09,-0.37,-0.16,-0.34,-0.16,3.39,-0.28,-0.31,-0.31,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,-0.57,-0.55,-0.52,1.58,-0.16,2.42,-0.4,-0.4,-0.41,-0.41,-0.42,-0.41,-0.75,-0.36,0.94
4,-0.14,-0.08,0.08,1.5,-1.25,0.32,-0.14,-0.08,-1.59,-1.86,0.63,-0.15,-1.46,1.61,1.38,-0.51,1.81,-0.18,-0.41,-0.33,-0.09,-0.37,-0.16,-0.34,-0.16,3.39,-0.28,-0.31,-0.31,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,-0.57,1.81,-0.52,-0.63,-0.16,2.42,-0.4,-0.4,-0.41,-0.41,-0.42,-0.41,1.34,-0.36,-1.06


In [304]:
subset_data_latter.head()

Unnamed: 0,ORGIN_WTH_precip,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,DEST_WTH_precip,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DAY,DEP_MINUTES,ARR_MINUTES,FORMER_FLIGHT_STATUS_early,FORMER_FLIGHT_STATUS_late,FORMER_FLIGHT_STATUS_on-time,OP_UNIQUE_CARRIER_9E,OP_UNIQUE_CARRIER_B6,OP_UNIQUE_CARRIER_G7,OP_UNIQUE_CARRIER_MQ,OP_UNIQUE_CARRIER_OO,OP_UNIQUE_CARRIER_PT,OP_UNIQUE_CARRIER_UA,OP_UNIQUE_CARRIER_WN,OP_UNIQUE_CARRIER_YX,OP_UNIQUE_CARRIER_ZW,MKT_UNIQUE_CARRIER_AA,MKT_UNIQUE_CARRIER_B6,MKT_UNIQUE_CARRIER_DL,MKT_UNIQUE_CARRIER_UA,MKT_UNIQUE_CARRIER_WN,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,ORIGIN_JFK,ORIGIN_MCO,ORIGIN_ORD
2,-0.14,-0.08,0.07,0.15,-1.37,0.32,-0.14,-0.07,-0.08,1.04,0.82,-0.7,-1.69,-0.86,-0.77,1.03,-0.64,-0.54,-0.51,2.13,-0.18,-0.43,-0.34,-0.08,-0.46,-0.09,-0.39,-0.07,-0.63,2.13,-0.51,-0.68,-0.09,3.47,-0.28,-0.3,-0.31,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,-0.4,-0.41,-0.41,2.4,-0.41,-0.76,5.42,-1.23
6,-0.14,-0.08,0.36,1.39,0.85,-1.73,-0.14,0.11,0.53,0.94,0.82,-0.34,-1.58,-1.55,-1.43,-0.97,1.56,-0.54,-0.51,-0.47,-0.18,-0.43,2.97,-0.08,-0.46,-0.09,-0.39,-0.07,-0.63,-0.47,-0.51,1.47,-0.09,3.47,-0.28,-0.3,-0.31,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,-0.4,-0.41,-0.41,-0.42,2.46,-0.76,-0.18,0.81
7,-0.14,-0.08,0.11,0.56,0.86,-0.0,-0.14,0.11,0.53,0.94,0.82,-0.34,-1.58,-0.94,-1.25,1.03,-0.64,-0.54,1.96,-0.47,-0.18,-0.43,-0.34,-0.08,-0.46,-0.09,-0.39,-0.07,-0.63,-0.47,1.94,-0.68,-0.09,3.47,-0.28,-0.3,-0.31,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,-0.4,-0.41,-0.41,-0.42,2.46,1.32,-0.18,-1.23
9,-0.14,-0.08,-0.68,1.38,-0.86,0.32,-0.14,0.11,-0.28,1.28,0.82,0.38,-1.58,0.27,0.41,-0.97,1.56,-0.54,-0.51,-0.47,-0.18,2.31,-0.34,-0.08,-0.46,-0.09,-0.39,-0.07,1.58,-0.47,-0.51,-0.68,-0.09,3.47,-0.28,-0.3,-0.31,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,-0.4,-0.41,-0.41,-0.42,2.46,-0.76,-0.18,0.81
10,-0.14,-0.08,-0.4,1.25,-0.86,0.32,-0.14,0.11,-0.26,1.33,0.77,-0.18,-1.58,0.41,0.59,-0.97,1.56,-0.54,-0.51,-0.47,-0.18,-0.43,2.97,-0.08,-0.46,-0.09,-0.39,-0.07,-0.63,-0.47,-0.51,1.47,-0.09,3.47,-0.28,-0.3,-0.31,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,-0.4,-0.41,-0.41,-0.42,2.46,-0.76,-0.18,0.81


In [305]:
# subset_data.head()

In [306]:
# subset_data.shape

In [307]:
# # own data 

# flight_data1 = pd.read_csv('./test_data/initial_test_data_for_prediction.csv')



# flight_data1.head()
# flight_data1.columns

In [308]:
# # Handle Categorical Variables
# categorical_vars1 = ['DAY_OF_WEEK', 'MKT_UNIQUE_CARRIER',
#                     'OP_UNIQUE_CARRIER', 'ORIGIN',
#                     'ORGIN_WTH_precipprob', 'ORGIN_WTH_severerisk',
#                     'DEST_WTH_precipprob', 'DEST_WTH_severerisk','MONTH']

# # categorical_vars = potential_categorical_vars.keys()

In [309]:
# def preprocess1(flight_data1: pd.DataFrame):

#     # Dealing with date and time
#     flight_data1['SCH_ARR_TIME'] = pd.to_datetime(flight_data1['SCH_ARR_TIME'])
#     flight_data1['SCH_DEP_TIME'] = pd.to_datetime(flight_data1['SCH_DEP_TIME'])

#     flight_data1['MONTH'] = flight_data1['SCH_ARR_TIME'].dt.month
#     flight_data1['DAY'] = flight_data1['SCH_ARR_TIME'].dt.day
#     flight_data1['DEP_MINUTES'] = flight_data1['SCH_DEP_TIME'].dt.hour * 60 + flight_data1['SCH_DEP_TIME'].dt.minute
#     flight_data1['ARR_MINUTES'] = flight_data1['SCH_ARR_TIME'].dt.hour * 60 + flight_data1['SCH_ARR_TIME'].dt.minute

#     flight_data1.drop(columns=['SCH_DEP_TIME', 'SCH_ARR_TIME'], inplace=True)

#     # Dropping unwanted columns
#     cols = [
#         'ORGIN_WTH_temp', 'DEST_WTH_temp',
#         'DEST_WTH_severerisk', 'ORGIN_WTH_severerisk',
#         'DEST_WTH_precipprob', 'ORGIN_WTH_precipprob'
#         ]
#     flight_data1.drop(columns=cols, inplace=True)
    
#     cat_col1 = list(set(flight_data1.columns).intersection(categorical_vars1))
#     flight_data1 = pd.get_dummies(flight_data1, columns = list(cat_col1), drop_first = False)

#     return flight_data1
    

In [310]:
# X1= preprocess1(flight_data1)

In [311]:
# # Get missing columns in the prediction data
# missing_cols = set(X.columns) - set(X1.columns)
# # Add a zero column for missing columns in prediction data
# for c in missing_cols:
#     X1[c] = 0
# # 

In [312]:
# X1.head()
# X1.shape
# X1.columns

In [313]:
# cols2 = [
#     "MKT_UNIQUE_CARRIER_DL",
#     "OP_UNIQUE_CARRIER_B6",
#     "ORGIN_WTH_visibility",
#     "ORGIN_WTH_precip",
#     "DEST_WTH_visibility",
#     "ORIGIN_ORD",
#     "DEP_MINUTES",
#     "ORGIN_WTH_snow",
#     "ORGIN_WTH_cloudcover",
#     "DEST_WTH_snow",
#     "OP_UNIQUE_CARRIER_OO",
#     "DEST_WTH_precip",
#     "ARR_MINUTES",
#     "DEST_WTH_winddir",
#     "OP_UNIQUE_CARRIER_MQ"
# ]

# from sklearn.preprocessing import StandardScaler
# # sc = StandardScaler()
# subset_data1 = pd.DataFrame(sc.transform(X1[cols2]), columns = X1[cols2].columns, index = X1.index)
# subset_data1.shape
# subset_data1.columns
# subset_data1.head()

In [315]:
test_data_former = test_data

In [316]:
# Get missing columns in the prediction data
missing_cols = set(X_former_flights_data.columns) - set(test_data_former.columns)
# Add a zero column for missing columns in prediction data
for c in missing_cols:    
    test_data_former[c] = 0

test_data_former = test_data_former[X_former_flights_data.columns]
test_data_former.columns
test_data_former.shape

test_data_former = pd.DataFrame(sc_former.transform(test_data_former[test_data_former.columns]), columns = test_data_former[test_data_former.columns].columns, index = test_data_former.index)
test_data_former.shape
test_data_former.columns
test_data_former.head()


# Get missing columns in the prediction data
missing_cols = set(X_latter_flight_data.columns) - set(test_data.columns)
# Add a zero column for missing columns in prediction data
for c in missing_cols:    
    test_data[c] = 0

test_data = test_data[X_latter_flight_data.columns]
test_data.columns
test_data.shape

test_data_latter = pd.DataFrame(sc_latter.transform(test_data[test_data.columns]), columns = test_data[test_data.columns].columns, index = test_data.index)
test_data_latter.shape
test_data_latter.columns
test_data_latter.head()

Index(['ORGIN_WTH_precip', 'ORGIN_WTH_snow', 'ORGIN_WTH_windspeed',
       'ORGIN_WTH_winddir', 'ORGIN_WTH_cloudcover', 'ORGIN_WTH_visibility',
       'DEST_WTH_precip', 'DEST_WTH_snow', 'DEST_WTH_windspeed',
       'DEST_WTH_winddir', 'DEST_WTH_cloudcover', 'DEST_WTH_visibility', 'DAY',
       'DEP_MINUTES', 'ARR_MINUTES', 'OP_UNIQUE_CARRIER_9E',
       'OP_UNIQUE_CARRIER_B6', 'OP_UNIQUE_CARRIER_G7', 'OP_UNIQUE_CARRIER_MQ',
       'OP_UNIQUE_CARRIER_OO', 'OP_UNIQUE_CARRIER_PT', 'OP_UNIQUE_CARRIER_UA',
       'OP_UNIQUE_CARRIER_WN', 'OP_UNIQUE_CARRIER_YX', 'OP_UNIQUE_CARRIER_ZW',
       'MONTH_1', 'MONTH_2', 'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6',
       'MONTH_7', 'MONTH_8', 'MONTH_9', 'MONTH_10', 'MONTH_11', 'MONTH_12',
       'MKT_UNIQUE_CARRIER_AA', 'MKT_UNIQUE_CARRIER_B6',
       'MKT_UNIQUE_CARRIER_DL', 'MKT_UNIQUE_CARRIER_UA',
       'MKT_UNIQUE_CARRIER_WN', 'DAY_OF_WEEK_1', 'DAY_OF_WEEK_2',
       'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5', 'DAY_OF_WEEK_6',
       'D

(23, 52)

(23, 52)

Index(['ORGIN_WTH_precip', 'ORGIN_WTH_snow', 'ORGIN_WTH_windspeed',
       'ORGIN_WTH_winddir', 'ORGIN_WTH_cloudcover', 'ORGIN_WTH_visibility',
       'DEST_WTH_precip', 'DEST_WTH_snow', 'DEST_WTH_windspeed',
       'DEST_WTH_winddir', 'DEST_WTH_cloudcover', 'DEST_WTH_visibility', 'DAY',
       'DEP_MINUTES', 'ARR_MINUTES', 'OP_UNIQUE_CARRIER_9E',
       'OP_UNIQUE_CARRIER_B6', 'OP_UNIQUE_CARRIER_G7', 'OP_UNIQUE_CARRIER_MQ',
       'OP_UNIQUE_CARRIER_OO', 'OP_UNIQUE_CARRIER_PT', 'OP_UNIQUE_CARRIER_UA',
       'OP_UNIQUE_CARRIER_WN', 'OP_UNIQUE_CARRIER_YX', 'OP_UNIQUE_CARRIER_ZW',
       'MONTH_1', 'MONTH_2', 'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6',
       'MONTH_7', 'MONTH_8', 'MONTH_9', 'MONTH_10', 'MONTH_11', 'MONTH_12',
       'MKT_UNIQUE_CARRIER_AA', 'MKT_UNIQUE_CARRIER_B6',
       'MKT_UNIQUE_CARRIER_DL', 'MKT_UNIQUE_CARRIER_UA',
       'MKT_UNIQUE_CARRIER_WN', 'DAY_OF_WEEK_1', 'DAY_OF_WEEK_2',
       'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4', 'DAY_OF_WEEK_5', 'DAY_OF_WEEK_6',
       'D

Unnamed: 0,ORGIN_WTH_precip,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,DEST_WTH_precip,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DAY,DEP_MINUTES,ARR_MINUTES,OP_UNIQUE_CARRIER_9E,OP_UNIQUE_CARRIER_B6,OP_UNIQUE_CARRIER_G7,OP_UNIQUE_CARRIER_MQ,OP_UNIQUE_CARRIER_OO,OP_UNIQUE_CARRIER_PT,OP_UNIQUE_CARRIER_UA,OP_UNIQUE_CARRIER_WN,OP_UNIQUE_CARRIER_YX,OP_UNIQUE_CARRIER_ZW,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12,MKT_UNIQUE_CARRIER_AA,MKT_UNIQUE_CARRIER_B6,MKT_UNIQUE_CARRIER_DL,MKT_UNIQUE_CARRIER_UA,MKT_UNIQUE_CARRIER_WN,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,ORIGIN_JFK,ORIGIN_MCO,ORIGIN_ORD
0,-0.14,-0.08,-0.01,-1.07,0.62,0.45,-0.14,-0.08,-0.43,-0.68,0.13,-0.45,-0.66,0.95,1.1,-0.51,-0.55,-0.18,-0.41,-0.33,-0.09,2.69,-0.16,-0.34,-0.16,-0.3,-0.28,-0.31,3.25,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,-0.57,-0.55,-0.52,1.58,-0.16,-0.41,-0.4,2.52,-0.41,-0.41,-0.42,-0.41,-0.75,-0.36,0.94
1,-0.14,-0.08,-0.11,-1.25,0.66,0.45,-0.14,-0.08,-0.43,-0.65,0.18,-0.9,-0.66,1.21,1.35,-0.51,-0.55,-0.18,2.43,-0.33,-0.09,-0.37,-0.16,-0.34,-0.16,-0.3,-0.28,-0.31,3.25,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,1.76,-0.55,-0.52,-0.63,-0.16,-0.41,-0.4,2.52,-0.41,-0.41,-0.42,-0.41,-0.75,-0.36,0.94
2,1.31,-0.08,-0.01,-0.69,0.88,-3.58,-0.14,-0.08,0.22,0.77,-0.21,0.34,-0.66,-0.27,-0.51,-0.51,1.81,-0.18,-0.41,-0.33,-0.09,-0.37,-0.16,-0.34,-0.16,-0.3,-0.28,-0.31,3.25,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,-0.57,1.81,-0.52,-0.63,-0.16,-0.41,-0.4,2.52,-0.41,-0.41,-0.42,-0.41,1.34,-0.36,-1.06
3,-0.14,-0.08,-0.01,-0.79,0.84,-3.83,-0.14,-0.08,0.18,0.99,-0.36,0.15,-0.66,0.04,-0.16,1.95,-0.55,-0.18,-0.41,-0.33,-0.09,-0.37,-0.16,-0.34,-0.16,-0.3,-0.28,-0.31,3.25,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,-0.57,-0.55,1.93,-0.63,-0.16,-0.41,-0.4,2.52,-0.41,-0.41,-0.42,-0.41,1.34,-0.36,-1.06
4,-0.14,-0.08,1.24,-0.32,-0.33,0.45,1.26,-0.08,0.31,0.56,-0.06,0.49,-0.66,-0.84,-0.77,-0.51,-0.55,-0.18,-0.41,-0.33,-0.09,-0.37,6.09,-0.34,-0.16,-0.3,-0.28,-0.31,3.25,-0.32,-0.28,-0.27,-0.27,-0.31,-0.34,-0.32,-0.31,-0.57,-0.55,-0.52,-0.63,6.09,-0.41,-0.4,2.52,-0.41,-0.41,-0.42,-0.41,-0.75,2.81,-1.06


Index(['ORGIN_WTH_precip', 'ORGIN_WTH_snow', 'ORGIN_WTH_windspeed',
       'ORGIN_WTH_winddir', 'ORGIN_WTH_cloudcover', 'ORGIN_WTH_visibility',
       'DEST_WTH_precip', 'DEST_WTH_snow', 'DEST_WTH_windspeed',
       'DEST_WTH_winddir', 'DEST_WTH_cloudcover', 'DEST_WTH_visibility', 'DAY',
       'DEP_MINUTES', 'ARR_MINUTES', 'FORMER_FLIGHT_STATUS_early',
       'FORMER_FLIGHT_STATUS_late', 'FORMER_FLIGHT_STATUS_on-time',
       'OP_UNIQUE_CARRIER_9E', 'OP_UNIQUE_CARRIER_B6', 'OP_UNIQUE_CARRIER_G7',
       'OP_UNIQUE_CARRIER_MQ', 'OP_UNIQUE_CARRIER_OO', 'OP_UNIQUE_CARRIER_PT',
       'OP_UNIQUE_CARRIER_UA', 'OP_UNIQUE_CARRIER_WN', 'OP_UNIQUE_CARRIER_YX',
       'OP_UNIQUE_CARRIER_ZW', 'MKT_UNIQUE_CARRIER_AA',
       'MKT_UNIQUE_CARRIER_B6', 'MKT_UNIQUE_CARRIER_DL',
       'MKT_UNIQUE_CARRIER_UA', 'MKT_UNIQUE_CARRIER_WN', 'MONTH_1', 'MONTH_2',
       'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6', 'MONTH_7', 'MONTH_8',
       'MONTH_9', 'MONTH_10', 'MONTH_11', 'MONTH_12', 'DAY_OF_WEEK_1',
   

(23, 55)

(23, 55)

Index(['ORGIN_WTH_precip', 'ORGIN_WTH_snow', 'ORGIN_WTH_windspeed',
       'ORGIN_WTH_winddir', 'ORGIN_WTH_cloudcover', 'ORGIN_WTH_visibility',
       'DEST_WTH_precip', 'DEST_WTH_snow', 'DEST_WTH_windspeed',
       'DEST_WTH_winddir', 'DEST_WTH_cloudcover', 'DEST_WTH_visibility', 'DAY',
       'DEP_MINUTES', 'ARR_MINUTES', 'FORMER_FLIGHT_STATUS_early',
       'FORMER_FLIGHT_STATUS_late', 'FORMER_FLIGHT_STATUS_on-time',
       'OP_UNIQUE_CARRIER_9E', 'OP_UNIQUE_CARRIER_B6', 'OP_UNIQUE_CARRIER_G7',
       'OP_UNIQUE_CARRIER_MQ', 'OP_UNIQUE_CARRIER_OO', 'OP_UNIQUE_CARRIER_PT',
       'OP_UNIQUE_CARRIER_UA', 'OP_UNIQUE_CARRIER_WN', 'OP_UNIQUE_CARRIER_YX',
       'OP_UNIQUE_CARRIER_ZW', 'MKT_UNIQUE_CARRIER_AA',
       'MKT_UNIQUE_CARRIER_B6', 'MKT_UNIQUE_CARRIER_DL',
       'MKT_UNIQUE_CARRIER_UA', 'MKT_UNIQUE_CARRIER_WN', 'MONTH_1', 'MONTH_2',
       'MONTH_3', 'MONTH_4', 'MONTH_5', 'MONTH_6', 'MONTH_7', 'MONTH_8',
       'MONTH_9', 'MONTH_10', 'MONTH_11', 'MONTH_12', 'DAY_OF_WEEK_1',
   

Unnamed: 0,ORGIN_WTH_precip,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,DEST_WTH_precip,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DAY,DEP_MINUTES,ARR_MINUTES,FORMER_FLIGHT_STATUS_early,FORMER_FLIGHT_STATUS_late,FORMER_FLIGHT_STATUS_on-time,OP_UNIQUE_CARRIER_9E,OP_UNIQUE_CARRIER_B6,OP_UNIQUE_CARRIER_G7,OP_UNIQUE_CARRIER_MQ,OP_UNIQUE_CARRIER_OO,OP_UNIQUE_CARRIER_PT,OP_UNIQUE_CARRIER_UA,OP_UNIQUE_CARRIER_WN,OP_UNIQUE_CARRIER_YX,OP_UNIQUE_CARRIER_ZW,MKT_UNIQUE_CARRIER_AA,MKT_UNIQUE_CARRIER_B6,MKT_UNIQUE_CARRIER_DL,MKT_UNIQUE_CARRIER_UA,MKT_UNIQUE_CARRIER_WN,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,ORIGIN_JFK,ORIGIN_MCO,ORIGIN_ORD
0,-0.14,-0.08,-0.08,-1.08,0.58,0.44,-0.14,-0.07,-0.37,-0.64,0.15,-0.5,-0.66,0.66,0.84,-0.97,-0.64,-0.54,-0.51,-0.47,-0.18,-0.43,-0.34,-0.08,2.19,-0.09,-0.39,-0.07,-0.63,-0.47,-0.51,1.47,-0.09,-0.29,-0.28,-0.3,3.26,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,2.52,-0.41,-0.41,-0.42,-0.41,-0.76,-0.18,0.81
1,-0.14,-0.08,-0.19,-1.27,0.62,0.44,-0.14,-0.07,-0.37,-0.61,0.21,-0.96,-0.66,0.96,1.14,-0.97,-0.64,-0.54,-0.51,-0.47,-0.18,2.31,-0.34,-0.08,-0.46,-0.09,-0.39,-0.07,1.58,-0.47,-0.51,-0.68,-0.09,-0.29,-0.28,-0.3,3.26,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,2.52,-0.41,-0.41,-0.42,-0.41,-0.76,-0.18,0.81
2,1.34,-0.08,-0.08,-0.7,0.84,-3.65,-0.14,-0.07,0.27,0.78,-0.18,0.33,-0.66,-0.77,-1.07,-0.97,-0.64,-0.54,-0.51,2.13,-0.18,-0.43,-0.34,-0.08,-0.46,-0.09,-0.39,-0.07,-0.63,2.13,-0.51,-0.68,-0.09,-0.29,-0.28,-0.3,3.26,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,2.52,-0.41,-0.41,-0.42,-0.41,1.32,-0.18,-1.23
3,-0.14,-0.08,-0.08,-0.79,0.8,-3.91,-0.14,-0.07,0.24,1.0,-0.33,0.13,-0.66,-0.4,-0.65,-0.97,-0.64,-0.54,1.96,-0.47,-0.18,-0.43,-0.34,-0.08,-0.46,-0.09,-0.39,-0.07,-0.63,-0.47,1.94,-0.68,-0.09,-0.29,-0.28,-0.3,3.26,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,2.52,-0.41,-0.41,-0.42,-0.41,1.32,-0.18,-1.23
4,-0.14,-0.08,1.15,-0.32,-0.37,0.44,1.14,-0.07,0.37,0.57,-0.03,0.49,-0.66,-1.43,-1.37,-0.97,-0.64,-0.54,-0.51,-0.47,-0.18,-0.43,-0.34,-0.08,-0.46,11.58,-0.39,-0.07,-0.63,-0.47,-0.51,-0.68,11.58,-0.29,-0.28,-0.3,3.26,-0.32,-0.28,-0.26,-0.27,-0.31,-0.35,-0.33,-0.31,-0.41,-0.4,2.52,-0.41,-0.41,-0.42,-0.41,-0.76,5.42,-1.23


In [317]:
our_pca_former = pd.DataFrame(pca_former.transform(test_data_former), index = test_data_former.index)
our_pca_former.head()
our_pca_latter = pd.DataFrame(pca_latter.transform(test_data_latter), index = test_data_latter.index)
our_pca_latter.head()

Unnamed: 0,0,1,2
0,-2.02,-0.54,-1.32
1,-1.98,-0.54,-1.62
2,1.9,1.01,-0.53
3,2.02,-2.59,1.31
4,0.82,5.25,4.41


Unnamed: 0,0,1,2
0,-1.62,-1.11,-0.94
1,-1.55,-1.06,-0.78
2,2.08,-0.12,1.23
3,2.19,2.75,-1.48
4,0.5,3.6,13.16


In [318]:
# def categorize_delay(delay):
#     if delay < -5:
#         return 0 #early
#     elif delay > 5:
#         return 2 #late
#     else:
#         return 1 #ontime

In [319]:
# y_latter_flight_data = y.apply(categorize_delay)

In [320]:
# latter_flight_model = xgb.XGBClassifier(learning_rate=0.01, max_depth=1, min_child_weight=2, n_estimators=600, reg_lambda=0.007)

# latter_flight_model = latter_flight_model.fit(data_pca, y_latter_flight_data)
# latter_flight_model.score(data_pca, y_latter_flight_data)
# feat_imp_latter = pd.Series(latter_flight_model.feature_importances_, data_pca.columns.values).sort_values(ascending=False)
# feat_imp_latter.head(15)


In [321]:
# data_pca.shape

In [322]:
former_flight_model = xgb.XGBClassifier(learning_rate=0.01, max_depth=1, min_child_weight=2, n_estimators=600, reg_lambda=0.007)

former_flight_model = former_flight_model.fit(data_pca_former, y_former_flights_data)
former_flight_model.score(data_pca_former, y_former_flights_data)
feat_imp_former = pd.Series(former_flight_model.feature_importances_, data_pca_former.columns.values).sort_values(ascending=False)
feat_imp_former.head(15)

latter_flight_model = xgb.XGBClassifier(learning_rate=0.01, max_depth=1, min_child_weight=2, n_estimators=600, reg_lambda=0.007)

latter_flight_model = latter_flight_model.fit(data_pca_latter, y_latter_flight_data)
latter_flight_model.score(data_pca_latter, y_latter_flight_data)
feat_imp_latter = pd.Series(latter_flight_model.feature_importances_, data_pca_latter.columns.values).sort_values(ascending=False)
feat_imp_latter.head(15)


0.4547467887199173

1   0.59
2   0.24
0   0.17
dtype: float32

0.44146238377007607

1   0.52
2   0.25
0   0.22
dtype: float32

In [326]:
status_dic = {0: 'early', 1: 'ontime', 2: 'late'}
for index, sub_row in submission_csv.iterrows():
    test_row_former = our_pca_former.iloc[index].copy()
    test_row_latter = our_pca_latter.iloc[index].copy()

    # Predict Former
    former = status_dic[former_flight_model.predict([test_row_former])[0]]
    if sub_row['ARRIVAL STATUS'] != 'NA':
        sub_row['ARRIVAL STATUS'] = former

    # Predict Latter - Former Early
    # test_row_latter['FORMER_FLIGHT_STATUS_late'] = 0
    # test_row_latter['FORMER_FLIGHT_STATUS_on-time'] = 0
    # test_row_latter['FORMER_FLIGHT_STATUS_early'] = 1
    early = status_dic[latter_flight_model.predict([test_row_latter])[0]]
    if sub_row['ARRIVAL STATUS_Prev_flight_early'] != 'NA':
        sub_row['ARRIVAL STATUS_Prev_flight_early'] = early

    # # Predict Latter - Former ontime
    # test_row_latter['FORMER_FLIGHT_STATUS_late'] = 0
    # test_row_latter['FORMER_FLIGHT_STATUS_on-time'] = 1
    # test_row_latter['FORMER_FLIGHT_STATUS_early'] = 0
    ontime = status_dic[latter_flight_model.predict([test_row_latter])[0]]
    if sub_row['ARRIVAL STATUS_Prev_flight_ontime'] != 'NA':
        sub_row['ARRIVAL STATUS_Prev_flight_ontime'] = ontime
    
    # # Predict Latter - Former late
    # test_row_latter['FORMER_FLIGHT_STATUS_late'] = 1
    # test_row_latter['FORMER_FLIGHT_STATUS_on-time'] = 0
    # test_row_latter['FORMER_FLIGHT_STATUS_early'] = 0
    late = status_dic[latter_flight_model.predict([test_row_latter])[0]]
    if sub_row['ARRIVAL STATUS_Prev_flight_late'] != 'NA':
        sub_row['ARRIVAL STATUS_Prev_flight_late'] = late

submission_csv.head(24)
    

Unnamed: 0,DATE,DAY,FLIGHT NUMBER,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late
0,4/10/24,WEDNESDAY,UA 1400,UA,UA,ORD,6:52 PM,9:47 PM,early,,,
1,4/10/24,WEDNESDAY,AA 3402,AA,MQ,ORD,7:59 PM,10:52 PM,,early,early,early
2,4/10/24,WEDNESDAY,B6 116,B6,B6,JFK,1:33 PM,2:50 PM,late,,,
3,4/10/24,WEDNESDAY,DL 5182,DL,9E,JFK,2:55 PM,4:21 PM,,early,early,early
4,4/10/24,WEDNESDAY,WN 5285,WN,WN,MCO,11:05 AM,1:45 PM,late,,,
5,4/10/24,WEDNESDAY,B6 656,B6,B6,MCO,1:35 PM,4:25 PM,,early,early,early
6,4/11/24,THURSDAY,UA 1400,UA,UA,ORD,6:52 PM,9:47 PM,early,,,
7,4/11/24,THURSDAY,AA 3402,AA,MQ,ORD,7:59 PM,10:52 PM,,early,early,early
8,4/11/24,THURSDAY,B6 116,B6,B6,JFK,1:33 PM,2:50 PM,late,,,
9,4/11/24,THURSDAY,DL 5182,DL,9E,JFK,2:55 PM,4:21 PM,,early,early,early
