In [1]:
# imports
import pandas as pd 
import numpy as np
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [2]:
# read in data
model_data = pd.read_csv('data/model_data.csv', index_col = 0)
y = model_data[['ARRIVAL_DELAY']]
x = model_data.drop('ARRIVAL_DELAY', axis = 1)
print x.shape
print y.shape
print list(x.columns)
x.head()

(88500, 18)
(88500, 1)
['DAY', 'MONTH', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'TAIL_NUMBER', 'TEMPERATURE', 'ICON', 'HUMIDITY', 'SUMMARY', 'WIND_SPEED', 'VISIBILITY', 'PRECIP_INTENSITY', 'PRECIP_PROB', 'MODEL', 'YEAR']


Unnamed: 0,DAY,MONTH,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,TAIL_NUMBER,TEMPERATURE,ICON,HUMIDITY,SUMMARY,WIND_SPEED,VISIBILITY,PRECIP_INTENSITY,PRECIP_PROB,MODEL,YEAR
135485,28,7,2,AA,ORD,CLT,7,N939UW,3,clear-day,3,Clear,6,5,1,1,757-2B7,1994
64198,10,7,5,EV,ORD,MEM,12,N12552,2,partly-cloudy-day,2,Partly Cloudy,3,5,1,1,EMB-145LR,2002
138728,25,9,5,UA,ORD,RSW,7,N26232,2,clear-day,4,Clear,6,4,1,1,737-824,1999
21287,25,5,1,OO,ORD,BOI,12,N124SY,0,wind,3,Breezy and Overcast,8,4,1,4,ERJ 170-200 LR,2014
127549,16,3,1,UA,ORD,SMF,19,N36447,3,clear-night,2,Clear,7,5,1,1,737-924ER,2012


In [10]:
# select features for SVM
svm_features = x[['AIRLINE','MONTH', 'DAY_OF_WEEK', 'DESTINATION_AIRPORT','SCHEDULED_DEPARTURE', 'MODEL', 'YEAR', 'TEMPERATURE', 'WIND_SPEED', 'VISIBILITY', 'PRECIP_PROB', 'PRECIP_INTENSITY', 'HUMIDITY']]
print svm_features.shape
print svm_features.columns
svm_features.head()

# encode non-numeric features
svm_features = pd.concat([svm_features.drop('AIRLINE', axis = 1), pd.get_dummies(svm_features[['AIRLINE']])], axis = 1)
svm_features = pd.concat([svm_features, pd.get_dummies(svm_features['MODEL'])], axis = 1)
svm_features = svm_features.drop('MODEL', axis = 1)
svm_features = pd.concat([svm_features, pd.get_dummies(svm_features['DESTINATION_AIRPORT'])], axis = 1)
svm_features = svm_features.drop('DESTINATION_AIRPORT', axis = 1)
svm_features = pd.concat([svm_features, pd.get_dummies(svm_features['YEAR'])], axis = 1)
svm_features = svm_features.drop('YEAR', axis = 1)
svm_features = svm_features.fillna(0)
print svm_features.shape

(88500, 13)
Index([u'AIRLINE', u'MONTH', u'DAY_OF_WEEK', u'DESTINATION_AIRPORT',
       u'SCHEDULED_DEPARTURE', u'MODEL', u'YEAR', u'TEMPERATURE',
       u'WIND_SPEED', u'VISIBILITY', u'PRECIP_PROB', u'PRECIP_INTENSITY',
       u'HUMIDITY'],
      dtype='object')
(88500, 251)


In [4]:
svm_features.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,SCHEDULED_DEPARTURE,TEMPERATURE,WIND_SPEED,VISIBILITY,PRECIP_PROB,PRECIP_INTENSITY,HUMIDITY,AIRLINE_AA,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
135485,7,2,7,3,6,5,1,1,3,1,...,0,0,0,0,0,0,0,0,0,0
64198,7,5,12,2,3,5,1,1,2,0,...,0,0,0,0,0,0,0,0,0,0
138728,9,5,7,2,6,4,1,1,4,0,...,0,0,0,0,0,0,0,0,0,0
21287,5,1,12,0,8,4,4,1,3,0,...,0,0,0,0,0,0,0,1,0,0
127549,3,1,19,3,7,5,1,1,2,0,...,0,0,0,0,0,1,0,0,0,0


svm_features.head()

In [43]:
train_x, test_x, train_y, test_y = train_test_split(svm_features, y, train_size=0.8)
print train_y.shape
print train_x.shape

(70800, 1)
(70800, 251)


In [5]:
#svm with sampled data
svm_classifier = SVC()
svm_classifier.fit(train_x, np.ravel(train_y, order = 'C'))
svm_train_acc = metrics.accuracy_score(train_y, svm_classifier.predict(train_x))
svm_test_acc = metrics.accuracy_score(test_y, svm_classifier.predict(test_x))

# results
print ('svm train acc: ' + str(svm_train_acc))
print ('svm test acc: ' + str(svm_test_acc))
print ('svm precision: ' + str(metrics.precision_score(test_y, svm_classifier.predict(test_x), average = 'weighted')))
print ('svm recall: ' + str(metrics.recall_score(test_y, svm_classifier.predict(test_x), average = 'weighted')))

svm train acc: 0.454548022599
svm test acc: 0.447570621469
svm precision: 0.446571701621
svm recall: 0.447570621469


In [30]:
# decision tree with sampled data
dtc = tree.DecisionTreeClassifier()
dtc.fit(train_x, train_y)
dtc_train_acc = metrics.accuracy_score(train_y, dtc.predict(train_x))
dtc_test_acc = metrics.accuracy_score(test_y, dtc.predict(test_x))

# results
print ('decision tree train acc: ' + str(dtc_train_acc))
print ('decision tree test acc: ' + str(dtc_test_acc))
print ('decision tree precision: ' + str(metrics.precision_score(test_y, dtc.predict(test_x), average = 'weighted')))
print ('decision tree recall: ' + str(metrics.recall_score(test_y, dtc.predict(test_x), average = 'weighted')))

decision tree train acc: 0.999223163842
decision tree test acc: 0.427740112994
decision tree precision: 0.427370468576
decision tree recall: 0.427740112994


In [44]:
# random forest w/ sampled data
rfc = RandomForestClassifier()
rfc.fit(train_x, train_y)
rfc_train_acc = metrics.accuracy_score(train_y, rfc.predict(train_x))
rfc_test_acc = metrics.accuracy_score(test_y, rfc.predict(test_x))
pred_y = rfc.predict(test_x)

# results
print ('random forest train acc: ' + str(rfc_train_acc))
print ('random forest test acc: ' + str(rfc_test_acc))
print ('random forest precision: ' + str(metrics.precision_score(test_y, rfc.predict(test_x), average = 'weighted')))
print ('random forest recall: ' + str(metrics.recall_score(test_y, rfc.predict(test_x), average = 'weighted')))

  This is separate from the ipykernel package so we can avoid doing imports until


random forest train acc: 0.983629943503
random forest test acc: 0.427005649718
random forest precision: 0.428613243343
random forest recall: 0.427005649718


In [23]:
cm = confusion_matrix(test_y, pred_y)
sns.set(font_scale=1.5)
sns.set_style(style='white')
cmap = sns.cubehelix_palette(as_cmap=True)
ax = sns.heatmap(cm, cmap=cmap)
ax.set(xlabel='Predicted Delay', ylabel='Actual Delay')
plt.xticks(rotation=25)
plt.tight_layout()
plt.savefig('CM.png')

In [45]:
# models with unsampled data
unsampled_data = pd.read_csv("data/weather_features_buckets.csv", index_col = 0)
print unsampled_data.shape
unsampled_data = unsampled_data[unsampled_data.apply(lambda x: len(x['MODEL']) <= 25 and x['YEAR'] != 'None', axis=1)]
print unsampled_data.shape
unsampled_data = unsampled_data.dropna()
print unsampled_data.shape
unsampled_data = unsampled_data.sample(100000)
print unsampled_data.shape

(139570, 19)
(135428, 19)
(135428, 19)
(100000, 19)


In [46]:
y = unsampled_data[['ARRIVAL_DELAY']]
x = unsampled_data.drop('ARRIVAL_DELAY', axis = 1)

In [47]:
# select features for SVM
features = x[['AIRLINE','MONTH', 'DAY_OF_WEEK', 'DESTINATION_AIRPORT','SCHEDULED_DEPARTURE', 'MODEL', 'YEAR', 'TEMPERATURE', 'WIND_SPEED', 'VISIBILITY', 'PRECIP_PROB', 'PRECIP_INTENSITY', 'HUMIDITY']]
print features.shape

# encode non-numeric features
features = pd.concat([features.drop('AIRLINE', axis = 1), pd.get_dummies(features[['AIRLINE']])], axis = 1)
features = pd.concat([features, pd.get_dummies(features['MODEL'])], axis = 1)
features = features.drop('MODEL', axis = 1)
features = pd.concat([features, pd.get_dummies(features['DESTINATION_AIRPORT'])], axis = 1)
features = features.drop('DESTINATION_AIRPORT', axis = 1)
features = pd.concat([features, pd.get_dummies(features['YEAR'])], axis = 1)
features = features.drop('YEAR', axis = 1)
features = features.fillna(0)
print features.shape

(100000, 13)
(100000, 251)


In [57]:
train_x, test_x, train_y, test_y = train_test_split(features, y, train_size=0.8)
print train_y.shape
print train_x.shape

(80000, 1)
(80000, 251)


In [13]:
#svm with unsampled data
svm_classifier = SVC()
svm_classifier.fit(train_x, np.ravel(train_y, order = 'C'))
svm_train_acc = metrics.accuracy_score(train_y, svm_classifier.predict(train_x))
svm_test_acc = metrics.accuracy_score(test_y, svm_classifier.predict(test_x))

# results
print ('svm train acc: ' + str(svm_train_acc))
print ('svm test acc: ' + str(svm_test_acc))
print ('svm precision: ' + str(metrics.precision_score(test_y, svm_classifier.predict(test_x), average = 'weighted')))
print ('svm recall: ' + str(metrics.recall_score(test_y, svm_classifier.predict(test_x), average = 'weighted')))

svm train acc: 0.560725
svm test acc: 0.55835


  'precision', 'predicted', average, warn_for)


svm precision: 0.442413817088
svm recall: 0.55835


In [51]:
# decision tree with unsampled data
dtc = tree.DecisionTreeClassifier()
dtc.fit(train_x, train_y)
dtc_train_acc = metrics.accuracy_score(train_y, dtc.predict(train_x))
dtc_test_acc = metrics.accuracy_score(test_y, dtc.predict(test_x))

# results
print ('decision tree train acc: ' + str(dtc_train_acc))
print ('decision tree test acc: ' + str(dtc_test_acc))
print ('decision tree precision: ' + str(metrics.precision_score(test_y, dtc.predict(test_x), average = 'weighted')))
print ('decision tree recall: ' + str(metrics.recall_score(test_y, dtc.predict(test_x), average = 'weighted')))

decision tree train acc: 0.9992875
decision tree test acc: 0.49445
decision tree precision: 0.489647037923
decision tree recall: 0.49445


In [58]:
# random forest w/ unsampled data
rfc = RandomForestClassifier()
rfc.fit(train_x, train_y)
rfc_train_acc = metrics.accuracy_score(train_y, rfc.predict(train_x))
rfc_test_acc = metrics.accuracy_score(test_y, rfc.predict(test_x))

# results
print ('random forest train acc: ' + str(rfc_train_acc))
print ('random forest test acc: ' + str(rfc_test_acc))
print ('random forest precision: ' + str(metrics.precision_score(test_y, rfc.predict(test_x), average = 'weighted')))
print ('random forest recall: ' + str(metrics.recall_score(test_y, rfc.predict(test_x), average = 'weighted')))

  This is separate from the ipykernel package so we can avoid doing imports until


random forest train acc: 0.9774
random forest test acc: 0.5502
random forest precision: 0.503864695405
random forest recall: 0.5502
