In [2]:
import pandas as pd
from category_encoders import MEstimateEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# load all data into dataframes - update path accordingly
df1 = pd.read_csv('/Users/tboy53/Desktop/transfer.pcloud.Lrpdxvg8/final_cleaned_data_2009-2013.csv')
df2 = pd.read_csv('/Users/tboy53/Desktop/transfer.pcloud.Lrpdxvg8/final_cleaned_data_2014-2018.csv')

# drop unwanted features
df1 = df1.drop(['OP_CARRIER_FL_NUM','DEP_TIME','TAXI_OUT','WHEELS_OFF','WHEELS_ON','TAXI_IN','ARR_TIME','AIR_TIME','ACTUAL_ELAPSED_TIME','DIVERTED','DEP_DELAY','CANCELLATION_CODE','CARRIER_DELAY','WEATHER_DELAY','NAS_DELAY','SECURITY_DELAY','LATE_AIRCRAFT_DELAY'], axis=1)
df2 = df2.drop(['OP_CARRIER_FL_NUM','DEP_TIME','TAXI_OUT','WHEELS_OFF','WHEELS_ON','TAXI_IN','ARR_TIME','AIR_TIME','ACTUAL_ELAPSED_TIME','DIVERTED','DEP_DELAY','CANCELLATION_CODE','CARRIER_DELAY','WEATHER_DELAY','NAS_DELAY','SECURITY_DELAY','LATE_AIRCRAFT_DELAY'], axis=1)

# combine dataframes
combined_df = pd.concat([df1,df2])

In [3]:
# cast feature types
combined_df['FL_DATE'] = pd.to_datetime(combined_df['FL_DATE'], format = '%Y-%m-%d', errors = 'coerce')
combined_df['CRS_DEP_TIME'] = pd.to_datetime(combined_df['CRS_DEP_TIME'], format = '%H%M', errors = 'coerce')
combined_df['CRS_ARR_TIME'] = pd.to_datetime(combined_df['CRS_ARR_TIME'], format = '%H%M', errors = 'coerce')
combined_df['DISTANCE'] = combined_df['DISTANCE'].astype('float')
combined_df['CRS_ELAPSED_TIME'] = combined_df['CRS_ELAPSED_TIME'].astype('float')
combined_df['ARR_DELAY'] = combined_df['ARR_DELAY'].astype('float')
combined_df['CANCELLED'] = combined_df['CANCELLED'].astype('int')

# split up date and time features into separate columns
combined_df['FL_YR'] = combined_df['FL_DATE'].dt.year
combined_df['FL_MONTH'] = combined_df['FL_DATE'].dt.month
combined_df['FL_DAY'] = combined_df['FL_DATE'].dt.day
combined_df['FL_DOW'] = combined_df['FL_DATE'].dt.dayofweek # monday=0, sunday=6

combined_df['CRS_DEP_HR'] = combined_df['CRS_DEP_TIME'].dt.hour
combined_df['CRS_DEP_MIN'] = combined_df['CRS_DEP_TIME'].dt.minute
combined_df['CRS_ARR_HR'] = combined_df['CRS_ARR_TIME'].dt.hour
combined_df['CRS_ARR_MIN'] = combined_df['CRS_ARR_TIME'].dt.minute
combined_df = combined_df.drop(['FL_DATE','CRS_DEP_TIME','CRS_ARR_TIME'], axis=1)

print(' INITIAL COMBINED DF ')
print(combined_df.head(10))

# create train and test random samples
x_train_i = combined_df.sample(n=800000)
x_test_i = combined_df.sample(n=200000)

# one hot encode OP_CARRIER
encoded_train = pd.get_dummies(x_train_i, columns = ['OP_CARRIER'], dtype=int)
encoded_test = pd.get_dummies(x_test_i, columns = ['OP_CARRIER'], dtype=int)

# helper functions for N groupings
def n0_delay(arr_delay):
    if arr_delay >= 15.0 and arr_delay < 30.0:
        return 1
    else:
        return 0
    
def n1_delay(arr_delay):
    if arr_delay >= 30.0 and arr_delay < 60.0:
        return 1
    else:
        return 0
    
def n2_delay(arr_delay):
    if arr_delay >= 60.0 and arr_delay < 120.0:
        return 1
    else:
        return 0
    
def n3_delay(arr_delay):
    if arr_delay >= 120.0:
        return 1
    else:
        return 0
    
encoded_train['n0'] = encoded_train['ARR_DELAY'].apply(n0_delay)
encoded_train['n1'] = encoded_train['ARR_DELAY'].apply(n1_delay)
encoded_train['n2'] = encoded_train['ARR_DELAY'].apply(n2_delay)
encoded_train['n3'] = encoded_train['ARR_DELAY'].apply(n3_delay)

encoded_test['n0'] = encoded_test['ARR_DELAY'].apply(n0_delay)
encoded_test['n1'] = encoded_test['ARR_DELAY'].apply(n1_delay)
encoded_test['n2'] = encoded_test['ARR_DELAY'].apply(n2_delay)
encoded_test['n3'] = encoded_test['ARR_DELAY'].apply(n3_delay)

# target encoding with smoothing (m-estimate) for ORIGIN and DEST features
x1 = encoded_train.copy()

x_encode = x1.sample(frac=0.25)
y_encode = x_encode.pop('ARR_DELAY')
x_pretrain = x1.drop(x_encode.index)
y_train = x_pretrain.pop('ARR_DELAY')

encoder = MEstimateEncoder(cols=['ORIGIN','DEST'], m=100.0) # can change m to see how results change
encoder.fit(x_encode, y_encode)
encoded_df_train = encoder.transform(x_pretrain)
print(' ')
print(' TRAIN ENCODED DF ')
print(encoded_df_train.head(10))

x2 = encoded_test.copy()

x_encode2 = x2.sample(frac=0.25)
y_encode2 = x_encode2.pop('ARR_DELAY')
x_pretrain2 = x2.drop(x_encode2.index)
y_train2 = x_pretrain2.pop('ARR_DELAY')

encoder2 = MEstimateEncoder(cols=['ORIGIN','DEST'], m=100.0)
encoder2.fit(x_encode2, y_encode2)
encoded_df_test = encoder2.transform(x_pretrain2)
print(' ')
print(' TEST ENCODED DF ')
print(encoded_df_test.head(10))

# used below code to try and visualize m-encoding to make sure something was indeed happening
'''
plt.figure(dpi=100)
ax = sns.histplot(y_train)
ax = sns.kdeplot(encoded_df['DEST'], color='r', ax=ax)
ax.set_xlabel("ARR_DELAY")
ax.legend(labels=['DEST','ARR_DELAY'])
ax.set_xlim(0.0,60.0)
ax.set_ylim(0,2)
'''

 INITIAL COMBINED DF 
  OP_CARRIER ORIGIN DEST  ARR_DELAY  CANCELLED  CRS_ELAPSED_TIME  DISTANCE  \
0         XE    DCA  EWR        4.0          0              62.0     199.0   
1         XE    EWR  IAD       -8.0          0              82.0     213.0   
2         XE    EWR  DCA       -9.0          0              70.0     199.0   
3         XE    DCA  EWR      -12.0          0              77.0     199.0   
4         XE    IAD  EWR      -38.0          0             105.0     213.0   
5         XE    ATL  EWR      -19.0          0             147.0     745.0   
6         XE    CLE  ATL      -17.0          0             117.0     554.0   
7         XE    DCA  EWR       -8.0          0              80.0     199.0   
8         XE    EWR  DCA      -15.0          0              83.0     199.0   
9         XE    EWR  DCA      -12.0          0              68.0     199.0   

   FL_YR  FL_MONTH  FL_DAY  FL_DOW  CRS_DEP_HR  CRS_DEP_MIN  CRS_ARR_HR  \
0   2009         1       1       3        11

'\nplt.figure(dpi=100)\nax = sns.histplot(y_train)\nax = sns.kdeplot(encoded_df[\'DEST\'], color=\'r\', ax=ax)\nax.set_xlabel("ARR_DELAY")\nax.legend(labels=[\'DEST\',\'ARR_DELAY\'])\nax.set_xlim(0.0,60.0)\nax.set_ylim(0,2)\n'

In [4]:
# create final train and test random samples
x_train = encoded_df_train.copy()
x_test = encoded_df_test.copy()

y_train = x_train[['n0','n1','n2','n3','CANCELLED']].copy()
x_train = x_train.drop(['n0','n1','n2','n3','CANCELLED'], axis=1)

y_test = x_test[['n0','n1','n2','n3','CANCELLED']].copy()
x_test = x_test.drop(['n0','n1','n2','n3','CANCELLED'], axis=1)

print(x_train.head(10))
print(y_train.head(10))
print(x_test.head(10))
print(y_test.head(10))

            ORIGIN      DEST  CRS_ELAPSED_TIME  DISTANCE  FL_YR  FL_MONTH  \
17732948  6.686143  5.232612              65.0     239.0   2011        10   
11948858  5.963217  2.725797              83.0     313.0   2016         1   
26194204  3.234900  5.316444             390.0    2475.0   2018         6   
15526406  4.199736  4.910956              67.0     208.0   2011         6   
30992230  4.428758  2.828300             100.0     507.0   2013        12   
22671782  4.041358  3.844107             275.0    1751.0   2017        12   
3275162   6.278816  7.320609             141.0     719.0   2014         7   
17749012  1.990668  3.126702             135.0     846.0   2011        10   
16565091  8.375030  7.320609             137.0     802.0   2016        11   
1437373   3.650618  2.003526              80.0     338.0   2014         4   

          FL_DAY  FL_DOW  CRS_DEP_HR  CRS_DEP_MIN  ...  OP_CARRIER_NW  \
17732948      17       0        20.0          0.0  ...              0   
119488