In [51]:
# Importing all necessary libraries
# munging imports
import pandas as pd
import numpy as np

# visualization imports
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline

# modeling imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, precision_recall_curve,f1_score, fbeta_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Model Exploration

*In this notebook, I will use a smaller subset of the data to explore and compare the models where I need to dummify variables before choosing a final model and training on all the available data points.*

Pull in a sample of the data.

In [53]:
#pull in a 10,000 datapoint sample from our dataframe, with unecessary columns dropped.
sample_df=pd.read_csv('/Users/mehikapatel/Flights_Project/Data/FinalFlightsData.csv').drop(columns=['Unnamed: 0',
                                                                                                    'DEP_TIME',
                                                                                                    'DEP_DELAY',
                                                                                                    'ARR_DELAY',
                                                                                                    'CANCELLED',
                                                                                                    'DATE',
                                                                                                    'FLIGHT_NUM',
                                                                                                    'origin_lat',
                                                                                                    'origin_lon',
                                                                                                    'dest_lat',
                                                                                                    'dest_lon',
                                                                                                    'CRS_DEP_TIME',
                                                                                                    'CRS_ARR_TIME',
                                                                                                   'municipality1',
                                                                                                   'municipality2']).sample(n=40000,
                                                                                                                        random_state=6)

Dummify all features to make them viable for model creation.

In [54]:
#Airline
sample_df = sample_df.join(pd.get_dummies(sample_df['AIRLINE'],drop_first=True))

#drop original
sample_df.drop('AIRLINE',axis=1,inplace=True)

In [55]:
#Origin
sample_df = sample_df.join(pd.get_dummies(sample_df['ORIGIN'],drop_first=True))

#drop original
sample_df.drop('ORIGIN',axis=1,inplace=True)

Because we have matching values in the origin and dest columns and have already made dummy variables for all our origin airports, we must change those in the destination column slightly to be able to make dummy variables pertaining to the destination. So, we coerce the destination column by adding "2" to each data point before making dummies to indicate destination airport code.

In [56]:
sample_df['DEST'] = sample_df['DEST'].astype(str) + '2'

In [57]:
#Dest
sample_df = sample_df.join(pd.get_dummies(sample_df['DEST'],drop_first=True))

#drop original
sample_df.drop('DEST',axis=1,inplace=True)

In [58]:
#Month
    #two features cos and sin:
sample_df['sin_month']=np.sin(2*np.pi*sample_df.MONTH/12)
sample_df['cos_month']=np.cos(2*np.pi*sample_df.MONTH/12)

#drop original
sample_df.drop('MONTH',axis=1,inplace=True)

In [59]:
#Day of Week
    #two features cos and sin:
sample_df['sin_DOW']=np.sin(2*np.pi*sample_df.DAYOFWEEK/7)
sample_df['cos_DOW']=np.cos(2*np.pi*sample_df.DAYOFWEEK/7)

#drop original
sample_df.drop('DAYOFWEEK',axis=1,inplace=True)

In [60]:
#origin Type
sample_df = sample_df.join(pd.get_dummies(sample_df['origin_type'],drop_first=True))

#drop original
sample_df.drop('origin_type',axis=1,inplace=True)

Similar to above with our airport codes, we have to slightly change the destination type by adding "2" .

In [61]:
sample_df['dest_type'] = sample_df['dest_type'].astype(str) + '2'

In [62]:
# dest type
sample_df = sample_df.join(pd.get_dummies(sample_df['dest_type'],drop_first=True))

#drop original
sample_df.drop('dest_type',axis=1,inplace=True)

In [63]:
#Dep hour
sample_df['sin_DEP_HOUR']=np.sin(2*np.pi*sample_df.DEP_HOUR/24)
sample_df['cos_DEP_HOUR']=np.cos(2*np.pi*sample_df.DEP_HOUR/24)

#drop original
sample_df.drop('DEP_HOUR',axis=1,inplace=True)

In [64]:
# Arr hour
sample_df['sin_ARR_HOUR']=np.sin(2*np.pi*sample_df.ARR_HOUR/24)
sample_df['cos_ARR_HOUR']=np.cos(2*np.pi*sample_df.ARR_HOUR/24)

#drop original
sample_df.drop('ARR_HOUR',axis=1,inplace=True)

In [65]:
# origin weather
sample_df = sample_df.join(pd.get_dummies(sample_df['origin_weather'],drop_first=True))

#drop original
sample_df.drop('origin_weather',axis=1,inplace=True)

Again, we will do the same for destination weather.

In [68]:
sample_df['dest_weather'] = sample_df['dest_weather'].astype(str) + '2'

In [69]:
# dest weather
sample_df = sample_df.join(pd.get_dummies(sample_df['dest_weather'],drop_first=True))

#drop original
sample_df.drop('dest_weather',axis=1,inplace=True)

We will be converting the weather severity columns into ordinal data.

Specifically, there are 6 values:
1. Light
2. Moderate
3. Severe
4. UNK (unknown)
5. Heavy
6. Other

For the light,moderate, heavy, and severe, we will make light = 1, moderate = 2, heavy = 3, severe =4. 
The other categories will be removed from the dataset since we do not definetively know, and this could skew our results. 

In [70]:
#Drop unknown and others fully from dataset considering they have weather data but no severity level.
sample_df = sample_df[sample_df.origin_severity != 'UNK']
sample_df = sample_df[sample_df.origin_severity != 'Other']

In [71]:
# origin severity

# lambda x: x*10 if x<2 else (x**2 if x<4 else x+10)

sample_df['origin_sev'] =sample_df['origin_severity'].apply(lambda x: 1 if x == "Light" else(2 if x == "Moderate" else(3 if x == "Heavy" else (4 if x== "Severe" else 0 ))))
sample_df.drop(columns=['origin_severity'],inplace=True)


In [72]:
# dest severity
sample_df['dest_sev'] =sample_df['dest_severity'].apply(lambda x: 1 if x == "Light" else(2 if x == "Moderate" else(3 if x == "Heavy" else (4 if x== "Severe" else 0 ))))

#drop original columns for origin and dest severity
sample_df.drop(columns=['dest_severity'],inplace=True)

## Metric Decision: F1

The metric that will be used to compare models will be the balanced F1 metric to find an optimal balance between precision and recall. We imagine it is equally harmful/beneficial to increase our recall rate as it is to increase our precision rate when classifiying a future flight as "delayed" or not. 

## Dealing with Class Imbalance:

In [73]:
sample_df.target.value_counts()

False    32494
True      6943
Name: target, dtype: int64

In [74]:
Fal = 8148
Tru = 1715
Tot = Fal + Tru

print(f'The percentage of NOT Delayed is {round((Fal/Tot)*100)}%')
print(f'The percentage of Delayed is {round((Tru/Tot)*100)}%')

The percentage of NOT Delayed is 83%
The percentage of Delayed is 17%


Because our target classes are strongly imbalanced (83% vs 17%), we will use class_weights when modeling to make sure our models catch our smaller class.

## Data Split

In [220]:
#split training and test sets:
X = sample_df.drop(columns=['target'])
y = sample_df.target

#split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42,stratify=y)

## Create our first Tree models with the sample data with all the features dummified.

Create RF Models.

In [76]:
from sklearn.ensemble import RandomForestClassifier

In [208]:
rf_model = RandomForestClassifier(class_weight={0:1,1:5.2},n_estimators=11)
rf_model.fit(X_train,y_train)

RandomForestClassifier(class_weight={0: 1, 1: 5.2}, n_estimators=11)

In [209]:
y_pred = rf_model.predict(X_test)

In [210]:
#1:5.2 weights 
# 11 estimators
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8133029073698445
Precision Score: 0.325
Recall Score: 0.05616898703792607
F1 Score: 0.09578387228817029


In [187]:
#1:5.2 weights 
# 11 estimators
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8133874239350912
Precision Score: 0.32492997198879553
Recall Score: 0.0556889102256361
F1 Score: 0.09508196721311475


In [149]:
#1:5 weights 
# 11 estimators
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8134719405003381
Precision Score: 0.3037974683544304
Recall Score: 0.046087373979836775
F1 Score: 0.08003334722801167


In [139]:
#1:5 weights 
# 15 estimators
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8176132521974308
Precision Score: 0.33624454148471616
Recall Score: 0.03696591454632741
F1 Score: 0.06660899653979238


In [136]:
#1:5 weights 
# 20 estimators
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8201487491548344
Precision Score: 0.3404255319148936
Recall Score: 0.023043686989918388
F1 Score: 0.04316546762589928


In [121]:
#1:5 weights 
# 75 estimators
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8223461798512508
Precision Score: 0.38271604938271603
Recall Score: 0.014882381180988958
F1 Score: 0.028650646950092423


In [127]:
#1:5 weights 
# 50 estimators
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8223461798512508
Precision Score: 0.3855421686746988
Recall Score: 0.015362457993278924
F1 Score: 0.029547553093259463


In [130]:
#1:5 weights 
# 30 estimators
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8199797160243407
Precision Score: 0.2920353982300885
Recall Score: 0.01584253480556889
F1 Score: 0.030054644808743168


In [79]:
# 500 estimators
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8225997295469912
Precision Score: 0.3888888888888889
Recall Score: 0.01344215074411906
F1 Score: 0.025986078886310902


In [82]:
#200 estimators
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8217545638945233
Precision Score: 0.30303030303030304
Recall Score: 0.009601536245799328
F1 Score: 0.018613308515588643


In [85]:
#1000 estimators
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8225997295469912
Precision Score: 0.36666666666666664
Recall Score: 0.01056168987037926
F1 Score: 0.02053196453569762


In [109]:
#500 with weights
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8225997295469912
Precision Score: 0.3709677419354839
Recall Score: 0.011041766682669226
F1 Score: 0.021445221445221443


In [90]:
#600 estimators
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8225997295469912
Precision Score: 0.3709677419354839
Recall Score: 0.011041766682669226
F1 Score: 0.021445221445221443


In [93]:
#100 estimators:
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8218390804597702
Precision Score: 0.323943661971831
Recall Score: 0.011041766682669226
F1 Score: 0.021355617455896005


In [103]:
#450 estimators
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8220081135902637
Precision Score: 0.35064935064935066
Recall Score: 0.012962073931829092
F1 Score: 0.025


In [112]:
#100 estimators with weight
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8230223123732252
Precision Score: 0.4225352112676056
Recall Score: 0.014402304368698993
F1 Score: 0.02785515320334262


In [115]:
#50 estimators with weight
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8216700473292765
Precision Score: 0.34831460674157305
Recall Score: 0.014882381180988958
F1 Score: 0.0285451197053407


In [118]:
#75 estimators with weight
print(f'Accuracy Score: {metrics.accuracy_score(y_test, y_pred)}')
print(f'Precision Score: {metrics.precision_score(y_test, y_pred)}')
print(f'Recall Score: {metrics.recall_score(y_test, y_pred)}')
print(f'F1 Score: {metrics.f1_score(y_test, y_pred)}')

Accuracy Score: 0.8225152129817445
Precision Score: 0.38666666666666666
Recall Score: 0.013922227556409025
F1 Score: 0.026876737720111215


## XG BOOST

In [215]:
import xgboost as xgb

In [221]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression


#Split data into 3: 60% train, 20% validation, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2018)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=2019)

#Evaluate models with Root Mean Squared Error
def rmse(actuals, preds):
    return np.sqrt(((actuals - preds) ** 2).mean())

In [222]:
gbm = xgb.XGBRegressor( 
                       n_estimators=30000, #arbitrary large number
                       max_depth=3,
                       objective="reg:squarederror",  # Other options: https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters
                       learning_rate=.1, 
                       subsample=1,
                       min_child_weight=1,
                       colsample_bytree=.8
                      )

eval_set=[(X_train,y_train),(X_val,y_val)] #tracking train/validation error as we go
fit_model = gbm.fit( 
                    X_train, y_train, 
                    eval_set=eval_set,
                    eval_metric='rmse',
                    early_stopping_rounds=20,
                    verbose=True #gives output log as below
                   )

[0]	validation_0-rmse:0.47934	validation_1-rmse:0.47883
[1]	validation_0-rmse:0.46196	validation_1-rmse:0.46097
[2]	validation_0-rmse:0.44730	validation_1-rmse:0.44583
[3]	validation_0-rmse:0.43505	validation_1-rmse:0.43305
[4]	validation_0-rmse:0.42484	validation_1-rmse:0.42240
[5]	validation_0-rmse:0.41638	validation_1-rmse:0.41346
[6]	validation_0-rmse:0.40932	validation_1-rmse:0.40603
[7]	validation_0-rmse:0.40348	validation_1-rmse:0.39989
[8]	validation_0-rmse:0.39868	validation_1-rmse:0.39480
[9]	validation_0-rmse:0.39468	validation_1-rmse:0.39045
[10]	validation_0-rmse:0.39138	validation_1-rmse:0.38688
[11]	validation_0-rmse:0.38867	validation_1-rmse:0.38399
[12]	validation_0-rmse:0.38643	validation_1-rmse:0.38156
[13]	validation_0-rmse:0.38457	validation_1-rmse:0.37957
[14]	validation_0-rmse:0.38302	validation_1-rmse:0.37789
[15]	validation_0-rmse:0.38173	validation_1-rmse:0.37647
[16]	validation_0-rmse:0.38067	validation_1-rmse:0.37534
[17]	validation_0-rmse:0.37980	validation

[144]	validation_0-rmse:0.36843	validation_1-rmse:0.36783
[145]	validation_0-rmse:0.36837	validation_1-rmse:0.36782
[146]	validation_0-rmse:0.36835	validation_1-rmse:0.36783
[147]	validation_0-rmse:0.36831	validation_1-rmse:0.36782
[148]	validation_0-rmse:0.36824	validation_1-rmse:0.36782
[149]	validation_0-rmse:0.36822	validation_1-rmse:0.36782
[150]	validation_0-rmse:0.36819	validation_1-rmse:0.36783
[151]	validation_0-rmse:0.36816	validation_1-rmse:0.36784
[152]	validation_0-rmse:0.36812	validation_1-rmse:0.36783


In [231]:
# predict using n_estimators with lowest validation error
y_pred = gbm.predict(X_test, ntree_limit=gbm.best_ntree_limit)

In [226]:
rmse(gbm.predict(X_val, ntree_limit=gbm.best_ntree_limit),y_val) 

0.36781357086855454