In [36]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', None)

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

In [37]:
flight_data = pd.read_csv('./dataset/merged_data/latter_flight_data.csv')
flight_data['SCH_ARR_TIME'] = pd.to_datetime(flight_data['SCH_ARR_TIME'])
flight_data['SCH_DEP_TIME'] = pd.to_datetime(flight_data['SCH_DEP_TIME'])

flight_data['MONTH'] = flight_data['SCH_ARR_TIME'].dt.month
flight_data['DAY'] = flight_data['SCH_ARR_TIME'].dt.day
flight_data['DEP_MINUTES'] = flight_data['SCH_DEP_TIME'].dt.hour * 60 + flight_data['SCH_DEP_TIME'].dt.minute
flight_data['ARR_MINUTES'] = flight_data['SCH_ARR_TIME'].dt.hour * 60 + flight_data['SCH_ARR_TIME'].dt.minute

flight_data.drop(columns=['SCH_DEP_TIME', 'SCH_ARR_TIME'], inplace=True)

flight_data.head()

flight_data.dtypes



Unnamed: 0,DAY_OF_WEEK,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,ARR_DELAY,ORGIN_WTH_temp,ORGIN_WTH_precip,ORGIN_WTH_precipprob,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,ORGIN_WTH_severerisk,DEST_WTH_temp,DEST_WTH_precip,DEST_WTH_precipprob,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DEST_WTH_severerisk,FORMER_FLIGHT_STATUS,MONTH,DAY,DEP_MINUTES,ARR_MINUTES
0,6,WN,WN,MCO,-26.0,74.0,0.0,0,0.0,6.9,200.0,2.9,9.9,3.0,48.0,0.0,0,0.0,3.6,8.0,100.0,9.8,3.0,,1,1,630,800
1,6,UA,OO,ORD,-25.0,36.1,0.0,0,0.02,21.0,20.0,100.0,9.9,3.0,47.9,0.0,0,0.0,0.4,358.0,100.0,9.7,3.0,,1,1,640,812
2,6,B6,B6,MCO,22.0,83.0,0.0,0,0.0,9.9,199.0,4.3,9.9,3.0,47.7,0.0,0,0.0,7.9,311.0,100.0,7.8,3.0,early,1,1,793,956
3,6,B6,B6,JFK,36.0,52.8,0.15,100,0.0,8.1,40.0,100.0,2.2,3.0,37.9,0.02,100,0.0,6.1,303.0,100.0,6.8,3.0,,1,1,1305,1379
4,7,B6,B6,JFK,-12.0,52.1,0.0,0,0.0,0.0,0.0,100.0,5.9,3.0,25.0,0.0,0,0.01,13.8,303.0,100.0,1.2,3.0,,1,2,509,590


DAY_OF_WEEK               int64
MKT_UNIQUE_CARRIER       object
OP_UNIQUE_CARRIER        object
ORIGIN                   object
ARR_DELAY               float64
ORGIN_WTH_temp          float64
ORGIN_WTH_precip        float64
ORGIN_WTH_precipprob      int64
ORGIN_WTH_snow          float64
ORGIN_WTH_windspeed     float64
ORGIN_WTH_winddir       float64
ORGIN_WTH_cloudcover    float64
ORGIN_WTH_visibility    float64
ORGIN_WTH_severerisk    float64
DEST_WTH_temp           float64
DEST_WTH_precip         float64
DEST_WTH_precipprob       int64
DEST_WTH_snow           float64
DEST_WTH_windspeed      float64
DEST_WTH_winddir        float64
DEST_WTH_cloudcover     float64
DEST_WTH_visibility     float64
DEST_WTH_severerisk     float64
FORMER_FLIGHT_STATUS     object
MONTH                     int32
DAY                       int32
DEP_MINUTES               int32
ARR_MINUTES               int32
dtype: object

In [38]:
categorical_vars = {}
# For categorical variables if we are considering variables which were coded to integers, but are actually catagorical.
if True:
    for col in flight_data.columns:
        unique_col_vals = flight_data[col].unique()
        if(len(unique_col_vals) < 15):
            categorical_vars[col] = unique_col_vals
categorical_vars

{'DAY_OF_WEEK': array([6, 7, 1, 2, 3, 4, 5]),
 'MKT_UNIQUE_CARRIER': array(['WN', 'UA', 'B6', 'AA', 'DL'], dtype=object),
 'OP_UNIQUE_CARRIER': array(['WN', 'OO', 'B6', 'MQ', '9E', 'G7', 'PT', 'YX', 'UA', 'ZW'],
       dtype=object),
 'ORIGIN': array(['MCO', 'ORD', 'JFK'], dtype=object),
 'ORGIN_WTH_precipprob': array([  0, 100]),
 'ORGIN_WTH_severerisk': array([  3.,  10.,  30.,  60.,   5.,  75.,  15.,   8., 100.,  50.,  19.,
         25.,  38.]),
 'DEST_WTH_precipprob': array([  0, 100]),
 'DEST_WTH_severerisk': array([  3.,  10.,  30.,  60.,   5.,  75., 100.,   8.,  15.]),
 'FORMER_FLIGHT_STATUS': array([nan, 'early', 'late', 'on-time'], dtype=object),
 'MONTH': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int32)}

In [39]:
flight_data_encoded = pd.get_dummies(flight_data, columns = list(categorical_vars.keys()), drop_first = True)
flight_data_encoded.columns
flight_data_encoded.shape

Index(['ARR_DELAY', 'ORGIN_WTH_temp', 'ORGIN_WTH_precip', 'ORGIN_WTH_snow',
       'ORGIN_WTH_windspeed', 'ORGIN_WTH_winddir', 'ORGIN_WTH_cloudcover',
       'ORGIN_WTH_visibility', 'DEST_WTH_temp', 'DEST_WTH_precip',
       'DEST_WTH_snow', 'DEST_WTH_windspeed', 'DEST_WTH_winddir',
       'DEST_WTH_cloudcover', 'DEST_WTH_visibility', 'DAY', 'DEP_MINUTES',
       'ARR_MINUTES', 'DAY_OF_WEEK_2', 'DAY_OF_WEEK_3', 'DAY_OF_WEEK_4',
       'DAY_OF_WEEK_5', 'DAY_OF_WEEK_6', 'DAY_OF_WEEK_7',
       'MKT_UNIQUE_CARRIER_B6', 'MKT_UNIQUE_CARRIER_DL',
       'MKT_UNIQUE_CARRIER_UA', 'MKT_UNIQUE_CARRIER_WN',
       'OP_UNIQUE_CARRIER_B6', 'OP_UNIQUE_CARRIER_G7', 'OP_UNIQUE_CARRIER_MQ',
       'OP_UNIQUE_CARRIER_OO', 'OP_UNIQUE_CARRIER_PT', 'OP_UNIQUE_CARRIER_UA',
       'OP_UNIQUE_CARRIER_WN', 'OP_UNIQUE_CARRIER_YX', 'OP_UNIQUE_CARRIER_ZW',
       'ORIGIN_MCO', 'ORIGIN_ORD', 'ORGIN_WTH_precipprob_100',
       'ORGIN_WTH_severerisk_5.0', 'ORGIN_WTH_severerisk_8.0',
       'ORGIN_WTH_severerisk_10.0

(6773, 74)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(flight_data_encoded.drop(columns = ['ARR_DELAY']), flight_data_encoded['ARR_DELAY'], test_size=0.20, random_state = 35)

X_train.head()
X_test.head()
y_train.head()
y_test.head()


Unnamed: 0,ORGIN_WTH_temp,ORGIN_WTH_precip,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,DEST_WTH_temp,DEST_WTH_precip,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DAY,DEP_MINUTES,ARR_MINUTES,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,MKT_UNIQUE_CARRIER_B6,MKT_UNIQUE_CARRIER_DL,MKT_UNIQUE_CARRIER_UA,MKT_UNIQUE_CARRIER_WN,OP_UNIQUE_CARRIER_B6,OP_UNIQUE_CARRIER_G7,OP_UNIQUE_CARRIER_MQ,OP_UNIQUE_CARRIER_OO,OP_UNIQUE_CARRIER_PT,OP_UNIQUE_CARRIER_UA,OP_UNIQUE_CARRIER_WN,OP_UNIQUE_CARRIER_YX,OP_UNIQUE_CARRIER_ZW,ORIGIN_MCO,ORIGIN_ORD,ORGIN_WTH_precipprob_100,ORGIN_WTH_severerisk_5.0,ORGIN_WTH_severerisk_8.0,ORGIN_WTH_severerisk_10.0,ORGIN_WTH_severerisk_15.0,ORGIN_WTH_severerisk_19.0,ORGIN_WTH_severerisk_25.0,ORGIN_WTH_severerisk_30.0,ORGIN_WTH_severerisk_38.0,ORGIN_WTH_severerisk_50.0,ORGIN_WTH_severerisk_60.0,ORGIN_WTH_severerisk_75.0,ORGIN_WTH_severerisk_100.0,DEST_WTH_precipprob_100,DEST_WTH_severerisk_5.0,DEST_WTH_severerisk_8.0,DEST_WTH_severerisk_10.0,DEST_WTH_severerisk_15.0,DEST_WTH_severerisk_30.0,DEST_WTH_severerisk_60.0,DEST_WTH_severerisk_75.0,DEST_WTH_severerisk_100.0,FORMER_FLIGHT_STATUS_late,FORMER_FLIGHT_STATUS_on-time,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12
3427,48.1,0.0,0.0,5.3,141.0,100.0,9.5,32.0,0.0,0.0,12.4,71.0,99.8,9.9,11,1104,1277,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
5107,72.0,0.0,0.0,5.9,245.0,24.2,9.9,67.0,0.16,0.0,9.1,260.0,100.0,1.7,10,494,672,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False
5413,71.7,0.0,0.0,8.5,69.0,100.0,9.9,65.7,0.0,0.0,0.1,13.0,26.6,9.9,13,1265,1439,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False
4145,44.1,0.0,0.0,16.8,53.0,100.0,9.9,35.9,0.04,0.0,3.6,100.0,100.0,5.0,25,590,674,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
2160,69.1,0.0,0.0,3.7,66.0,60.4,9.9,56.9,0.0,0.0,4.4,107.0,88.6,9.9,16,1350,1439,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False


Unnamed: 0,ORGIN_WTH_temp,ORGIN_WTH_precip,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,DEST_WTH_temp,DEST_WTH_precip,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DAY,DEP_MINUTES,ARR_MINUTES,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,MKT_UNIQUE_CARRIER_B6,MKT_UNIQUE_CARRIER_DL,MKT_UNIQUE_CARRIER_UA,MKT_UNIQUE_CARRIER_WN,OP_UNIQUE_CARRIER_B6,OP_UNIQUE_CARRIER_G7,OP_UNIQUE_CARRIER_MQ,OP_UNIQUE_CARRIER_OO,OP_UNIQUE_CARRIER_PT,OP_UNIQUE_CARRIER_UA,OP_UNIQUE_CARRIER_WN,OP_UNIQUE_CARRIER_YX,OP_UNIQUE_CARRIER_ZW,ORIGIN_MCO,ORIGIN_ORD,ORGIN_WTH_precipprob_100,ORGIN_WTH_severerisk_5.0,ORGIN_WTH_severerisk_8.0,ORGIN_WTH_severerisk_10.0,ORGIN_WTH_severerisk_15.0,ORGIN_WTH_severerisk_19.0,ORGIN_WTH_severerisk_25.0,ORGIN_WTH_severerisk_30.0,ORGIN_WTH_severerisk_38.0,ORGIN_WTH_severerisk_50.0,ORGIN_WTH_severerisk_60.0,ORGIN_WTH_severerisk_75.0,ORGIN_WTH_severerisk_100.0,DEST_WTH_precipprob_100,DEST_WTH_severerisk_5.0,DEST_WTH_severerisk_8.0,DEST_WTH_severerisk_10.0,DEST_WTH_severerisk_15.0,DEST_WTH_severerisk_30.0,DEST_WTH_severerisk_60.0,DEST_WTH_severerisk_75.0,DEST_WTH_severerisk_100.0,FORMER_FLIGHT_STATUS_late,FORMER_FLIGHT_STATUS_on-time,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12
2834,39.5,0.0,0.0,13.3,315.0,7.2,9.9,35.1,0.0,0.0,6.1,272.0,100.0,6.3,14,550,627,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
4793,79.5,0.0,0.0,5.8,272.0,0.0,9.9,89.9,0.0,0.0,0.4,355.0,26.2,9.9,1,810,882,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
5974,64.9,0.0,0.0,6.7,234.0,50.0,9.9,56.9,0.0,0.0,9.0,260.0,100.0,9.9,11,835,1009,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False
13,29.3,0.0,0.0,11.2,20.0,100.0,9.9,19.1,0.0,0.0,5.0,320.0,47.3,9.9,3,775,852,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3504,41.4,0.16,0.0,5.3,344.0,100.0,3.9,32.2,0.0,0.0,3.5,111.0,89.6,7.0,19,1350,1434,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False


3427   -12.00
5107   -23.00
5413    58.00
4145     2.00
2160   -26.00
Name: ARR_DELAY, dtype: float64

2834     3.00
4793   -10.00
5974   -21.00
13      12.00
3504    66.00
Name: ARR_DELAY, dtype: float64

In [41]:
def sMAPE_metric(actual_values, predicted_values):
    actual_predicted_absolute_sum = np.abs(actual_values) + np.abs(predicted_values)
    actual_predicted_absolute_diff = np.abs(actual_values - predicted_values)
    sMAPE = np.mean(actual_predicted_absolute_diff / actual_predicted_absolute_sum)
    return sMAPE

In [46]:
gb = GradientBoostingRegressor(random_state=50, min_samples_leaf = 2, min_samples_split = 6, max_depth = 5)

gb = gb.fit(X_train, y_train) 
test_output_gb = pd.DataFrame(gb.predict(X_test), index = X_test.index, columns = ['pred_ARR_DELAY'])
test_output_gb = test_output_gb.merge(y_test, left_index = True, right_index = True)
test_output_gb.head()
mean_absolute_error_gb = abs(test_output_gb['pred_ARR_DELAY'] - test_output_gb['ARR_DELAY']).mean()
print(f"GradientBoostingRegressor mean absolute error is: {mean_absolute_error_gb:.2f}")
print(f"GradientBoostingRegressor error ratio: {abs(test_output_gb['pred_ARR_DELAY'] - test_output_gb['ARR_DELAY']).mean()/test_output_gb['ARR_DELAY'].mean():.2f}")
test_output_gb['ARR_DELAY'].mean()

Unnamed: 0,pred_ARR_DELAY,ARR_DELAY
2834,3.56,3.0
4793,3.74,-10.0
5974,-3.14,-21.0
13,16.61,12.0
3504,55.76,66.0


GradientBoostingRegressor mean absolute error is: 27.33
GradientBoostingRegressor error ratio: 2.17


12.575645756457565

In [45]:
print(f"sMAPE for GradientBoostingRegressor model: {sMAPE_metric(test_output_gb['pred_ARR_DELAY'], test_output_gb['ARR_DELAY']) * 100:.2f}")

sMAPE for GradientBoostingRegressor model: 69.88
