## Import Chicago Departures

In [1]:
% matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from datetime import datetime
pd.options.display.max_columns = 50

In [2]:
df = pd.read_csv('../Assets/Datasets/BTS_flight_data/Processed/chicago_departures.csv')
df.shape

(1997113, 50)

In [6]:
# Get time in Date/Hour to match hourly weather data
cols = [['CRS_DEP_DATETIME', 'CRS_DEP_DATEHOUR'],
        ['CRS_ARR_DATETIME', 'CRS_ARR_DATEHOUR'],
        ['PRIOR_CRS_DEP_DATETIME', 'PRIOR_CRS_DEP_DATEHOUR']]

for col in cols:
    df[col[0]] = pd.to_datetime( df[col[0]], infer_datetime_format=True)
    df[col[1]] = df[col[0]].apply(lambda x: datetime(x.year, x.month, x.day, x.hour))

## Join Origin Weather Data

In [7]:
# Import and format weather data
orig_weather = pd.read_csv('../Assets/Datasets/Weather/Clean_Dep_Weather_Hourly.csv')
orig_weather = orig_weather[['Airport', 'DateTime', 'Daypart', 'Temp', 'Visibility', 'Wind Speed',
                   'Precip', 'Conditions']]
orig_weather = orig_weather.rename(columns = {'DateTime' : 'DateHour',
                                              'Temp': 'Orig_Temp',
                                              'Visibility' : 'Orig_Visibility',
                                              'Wind Speed' : 'Orig_Wind Speed',
                                              'Precip' : 'Orig_Precip',
                                              'Conditions' : 'Orig_Conditions'})
orig_weather['DateHour'] = pd.to_datetime(orig_weather['DateHour'], infer_datetime_format=True)

# Remove leading 'K' from airport to match df airport codes 
orig_weather['Airport'] = orig_weather['Airport'].apply(lambda x: x[1:])

In [8]:
# Create origin airport weather columns for 3 hours before and 2 hours after
orig_weather = orig_weather.sort_values(['Airport', 'DateHour'])
offsets = [['B1_', 1], ['B2_', 2], ['B3_', 3], ['A1_', -1], ['A2_', -2]]
cols = ['Orig_Temp', 'Orig_Visibility', 'Orig_Wind Speed', 'Orig_Precip', 'Orig_Conditions']

for offset in offsets:
    for col in cols:
        orig_weather[offset[0]+col] = orig_weather[col].shift(offset[1])
        
orig_weather.dropna(inplace=True)

In [9]:
# format BTS data to match weather columns for join
code_match = pd.read_csv('../Assets/Datasets/BTS_flight_data/Match_Tables/AIRPORT_ID_CODE.csv')
df = pd.merge(df, code_match[['US_DOT_AIRPORT_ID', 'US_DOT_AIRPORT_CODE']], 
              left_on='ORIGIN_AIRPORT_ID', right_on='US_DOT_AIRPORT_ID',
              how= 'left')
df.drop('US_DOT_AIRPORT_ID', axis=1, inplace=True)
df = df.rename(columns = {'US_DOT_AIRPORT_CODE':'ORIGIN_AIRPORT_CODE'})


In [10]:
# join weather and BTS on dep airport and departure time
df = pd.merge(df, orig_weather, how='left',
              left_on=['ORIGIN_AIRPORT_CODE', 'CRS_DEP_DATEHOUR'],
              right_on=['Airport', 'DateHour'])

In [11]:
# Remove rows that have no weather info 
df.dropna(subset=['Airport'], inplace=True)
df.drop('Airport', axis= 1, inplace=True)
df.shape

(1991998, 86)

## Join Destination Weather Data

In [None]:
# Import and format weather data
dest_weather = pd.read_csv('../Assets/Datasets/Weather/Clean_Arr_Weather_Hourly.csv')
dest_weather = dest_weather[['Airport', 'DateTime', 'Daypart', 'Temp', 'Visibility', 'Wind Speed',
                   'Precip', 'Conditions']]
dest_weather = dest_weather.rename(columns = {'DateTime' : 'DateHour',
                                              'Temp': 'Dest_Temp',
                                              'Visibility' : 'Dest_Visibility',
                                              'Wind Speed' : 'Dest_Wind Speed',
                                              'Precip' : 'Dest_Precip',
                                              'Conditions' : 'Dest_Conditions'})
dest_weather['DateHour'] = pd.to_datetime(dest_weather['DateHour'], infer_datetime_format=True)

# Remove leading 'K' from airport to match df airport codes 
dest_weather['Airport'] = dest_weather['Airport'].apply(lambda x: x[1:])

In [None]:
# Create destination airport weather columns for 3 hours before and 2 hours after
dest_weather = dest_weather.sort_values(['Airport', 'DateHour'])

offsets = [['B1_', 1], ['B2_', 2], ['B3_', 3], ['A1_', -1], ['A2_', -2]]
cols = ['Dest_Temp', 'Dest_Visibility', 'Dest_Wind Speed', 'Dest_Precip', 'Dest_Conditions']

for offset in offsets:
    for col in cols:
        dest_weather[offset[0]+col] = dest_weather[col].shift(offset[1])
        
dest_weather.dropna(inplace=True)

In [None]:
# format BTS data to match weather columns for join
df = pd.merge(df, code_match[['US_DOT_AIRPORT_ID', 'US_DOT_AIRPORT_CODE']], 
              left_on='DEST_AIRPORT_ID', right_on='US_DOT_AIRPORT_ID',
              how= 'left')
df.drop('US_DOT_AIRPORT_ID', axis=1, inplace=True)
df = df.rename(columns = {'US_DOT_AIRPORT_CODE':'DEST_AIRPORT_CODE'})

In [None]:
# join weather and BTS on dest airport and Arrival Time
df = pd.merge(df, dest_weather, how='left',
              left_on=['DEST_AIRPORT_CODE', 'CRS_ARR_DATEHOUR'],
              right_on=['Airport', 'DateHour'])

In [None]:
# Remove rows that have no weather (will drop any dest with no weather)
df.dropna(subset=['Airport'], inplace=True)
df.shape

## Join Prior Weather

In [12]:
# Import and format weather data
prior_weather = pd.read_csv('../Assets/Datasets/Weather/Clean_Arr_Weather_Hourly.csv')
prior_weather = prior_weather[['Airport', 'DateTime', 'Daypart', 'Temp', 'Visibility', 'Wind Speed',
                   'Precip', 'Conditions']]
prior_weather = prior_weather.rename(columns = {'DateTime' : 'DateHour',
                                              'Temp': 'Prior_Temp',
                                              'Visibility' : 'Prior_Visibility',
                                              'Wind Speed' : 'Prior_Wind Speed',
                                              'Precip' : 'Prior_Precip',
                                              'Conditions' : 'Prior_Conditions'})
prior_weather['DateHour'] = pd.to_datetime(prior_weather['DateHour'], infer_datetime_format=True)

# Remove leading 'K' from airport to match df airport codes 
prior_weather['Airport'] = prior_weather['Airport'].apply(lambda x: x[1:])

In [13]:
# Create prior airport weather columns for 3 hours before and 2 hours after
prior_weather = prior_weather.sort_values(['Airport', 'DateHour'])

offsets = [['B1_', 1], ['B2_', 2], ['B3_', 3], ['A1_', -1], ['A2_', -2]]
cols = ['Prior_Temp', 'Prior_Visibility', 'Prior_Wind Speed', 'Prior_Precip', 'Prior_Conditions']

for offset in offsets:
    for col in cols:
        prior_weather[offset[0]+col] = prior_weather[col].shift(offset[1])
        
prior_weather.dropna(inplace=True)

In [15]:
# format BTS data to match weather columns for join
df = pd.merge(df, code_match[['US_DOT_AIRPORT_ID', 'US_DOT_AIRPORT_CODE']], 
              left_on='PRIOR_AIRPORT', right_on='US_DOT_AIRPORT_ID',
              how= 'left')
df.drop('US_DOT_AIRPORT_ID', axis=1, inplace=True)
df = df.rename(columns = {'US_DOT_AIRPORT_CODE':'PRIOR_AIRPORT_CODE'})

In [17]:
# join weather and BTS on prior airport and prior dep time
df = pd.merge(df, prior_weather, how='left',
              left_on=['PRIOR_AIRPORT_CODE', 'PRIOR_CRS_DEP_DATEHOUR'],
              right_on=['Airport', 'DateHour'])

In [18]:
# Remove rows that have no weather (will drop any dest with no weather)
df.dropna(subset=['Airport'], inplace=True)
df.shape

(46914, 120)

## Join Airplane Data

In [19]:
# import airplane info
tail_match = pd.read_csv('../Assets/Datasets/FlightTracker_Airplane/FA_Airplane_Info_Clean.csv')
tail_match.drop('Unnamed: 0', axis=1, inplace=True)

In [20]:
# left join data with tail match
df = pd.merge(df, tail_match, left_on='TAIL_NUM', right_on='N-NUMBER', how='left')

In [21]:
# Drop data that has no airplane match
df.dropna(subset=['mfr'], inplace=True)
df.shape

(46620, 125)

## Prepare Data

In [22]:
# Change Prior Airport from ID to Code
# format BTS data to match weather columns for join
df = pd.merge(df, code_match[['US_DOT_AIRPORT_ID', 'US_DOT_AIRPORT_CODE']], 
              left_on='PRIOR_AIRPORT', right_on='US_DOT_AIRPORT_ID',
              how= 'left')
df.drop('US_DOT_AIRPORT_ID', axis=1, inplace=True)
df = df.rename(columns = {'US_DOT_AIRPORT_CODE':'PRIOR_AIRPORT_CODE'})

In [38]:
# # Find cutoff for .5% longest delays (210 minutes)
# delays = pd.DataFrame(df['ARR_DELAY_NEW'].value_counts()).reset_index().sort_values('index')
# cumsum = delays.cumsum()
# cumsum.columns = ['delay_time', 'Sum']
# delays = delays.join(cumsum['Sum'])
# delays.columns = ['Delay_Time', 'Count', 'Sum']
# delays['pct'] = delays['Sum'] / df.shape[0]
# delays[delays['Delay_Time'] == 0]

# Cap Delays at 210 minutes (top .5% of delays)
df['ARR_DELAY_NEW'] = df['ARR_DELAY_NEW'].apply(lambda x: 210 if x > 210 else x)

Unnamed: 0,Delay_Time,Count,Sum,pct
0,0.0,26156,26156.0,0.561047


In [24]:
# Cap Delta_Time_min at 360 minutes. should cover almost all delta times 
df['DELTA_TIME_MIN'] = df['DELTA_TIME_MIN'].apply(lambda x: 360 if x > 360 else x)

In [25]:
# Create categorical delay
df['DEP_DELAY_C'] = df['DEP_DELAY_NEW'].apply(lambda x: 0 if x < 1 else 1)

In [27]:
# Select Columns necessary for model
data = df[[
           'ORIGIN_AIRPORT_CODE','DAILY_DEPARTURES', 'HOURLY_DEPARTURES', 'UNIQUE_CARRIER', 'DEST_AIRPORT_ID', 
           'DELTA_TIME_MIN','CRS_DEP_YEAR','CRS_DEP_WEEK','CRS_DEP_DAY_OF_WEEK','CRS_DEP_TIME_hours',
           'Orig_Temp','Orig_Visibility','Orig_Wind Speed','Orig_Precip',
           'B1_Orig_Temp','B1_Orig_Visibility','B1_Orig_Wind Speed','B1_Orig_Precip',
           'B2_Orig_Temp','B2_Orig_Visibility','B2_Orig_Wind Speed','B2_Orig_Precip',
           'B3_Orig_Temp','B3_Orig_Visibility','B3_Orig_Wind Speed','B3_Orig_Precip',
           'A1_Orig_Temp','A1_Orig_Visibility','A1_Orig_Wind Speed','A1_Orig_Precip',
           'A2_Orig_Temp','A2_Orig_Visibility','A2_Orig_Wind Speed','A2_Orig_Precip',
#            'Dest_Temp','Dest_Visibility','Dest_Wind Speed','Dest_Precip',
#            'B1_Dest_Temp','B1_Dest_Visibility','B1_Dest_Wind Speed','B1_Dest_Precip',
#            'B2_Dest_Temp','B2_Dest_Visibility','B2_Dest_Wind Speed','B2_Dest_Precip',
#            'B3_Dest_Temp','B3_Dest_Visibility','B3_Dest_Wind Speed','B3_Dest_Precip',
#            'A1_Dest_Temp','A1_Dest_Visibility','A1_Dest_Wind Speed','A1_Dest_Precip',
#            'A2_Dest_Temp','A2_Dest_Visibility','A2_Dest_Wind Speed','A2_Dest_Precip',
           'Prior_Temp','Prior_Visibility','Prior_Wind Speed','Prior_Precip',
           'B1_Prior_Temp','B1_Prior_Visibility','B1_Prior_Wind Speed','B1_Prior_Precip',
           'B2_Prior_Temp','B2_Prior_Visibility','B2_Prior_Wind Speed','B2_Prior_Precip',
           'B3_Prior_Temp','B3_Prior_Visibility','B3_Prior_Wind Speed','B3_Prior_Precip',
           'A1_Prior_Temp','A1_Prior_Visibility','A1_Prior_Wind Speed','A1_Prior_Precip',
           'A2_Prior_Temp','A2_Prior_Visibility','A2_Prior_Wind Speed','A2_Prior_Precip',
           'Model', 'mfr_year',
            # 'PRIOR_AIRPORT_CODE', 'DEST_AIRPORT_CODE
           'DEP_DELAY_C'
          ]]

In [28]:
def clean_names(lst):
    new_lst = []
    for item in lst:
        item = item.lower()
        item = item.replace(' ', '_')
        new_lst.append(item)
    return new_lst

data.columns = clean_names(list(data.columns))

In [29]:
# Random Forest Create Dummies
#to_str = ['crs_dep_day_of_week', 'crs_dep_time_hours', 'crs_dep_week', 'crs_dep_year']
# for item in to_str:
#     data.loc[:, item] = data[item].astype(str)

data = pd.get_dummies(data, 
               columns=[
                        'unique_carrier', 'dest_airport_id', 'model'
                       ], 
               drop_first=True)


In [30]:
# Split data by airport
mdw = data[data['origin_airport_code'] == 'MDW']
ohr = data[data['origin_airport_code'] == 'ORD']

In [31]:
# Create train & test split
from sklearn.model_selection import train_test_split
mdw_x = mdw.drop(['origin_airport_code', 'dep_delay_c'], axis=1)
mdw_y = mdw['dep_delay_c']

mdw_x_train, mdw_x_test, mdw_y_train, mdw_y_test = train_test_split(mdw_x, mdw_y, train_size=.7)
print 'mdw: {}'.format(mdw_x_train.shape)

ohr_x = ohr.drop(['origin_airport_code', 'dep_delay_c'], axis=1)
ohr_y = ohr['dep_delay_c']

ohr_x_train, ohr_x_test, ohr_y_train, ohr_y_test = train_test_split(ohr_x, ohr_y, train_size=.7)
print 'ohr {}'.format(ohr_x_train.shape)

mdw: (13908, 187)
ohr (18725, 187)


In [None]:
# # Low features model
# data = df[['crs_dep_hour', 'delta_time_min', 'origin_airport_id', 'unique_carrier', 'week', 'temp', 'dep_delay_c']]

# data = pd.get_dummies(data, 
#                columns=['crs_dep_hour', 'origin_airport_id', 'unique_carrier', 'week'])

In [None]:
# # KNN/Neural Create dummy variables 
# to_str = ['day_of_week', 'prior_airport', 'origin_airport_id', 'dest_airport_id']
# for item in to_str:
#     data.loc[:, item] = data[item].astype(str)

# data = pd.get_dummies(data, 
#                columns=[
        
#                         'week', 'day_of_week',  
#                         'unique_carrier', 'prior_airport', 'origin_airport_id', 'dest_airport_id', 
#                         'model',  
#                        ], 
#                drop_first=True)


In [None]:
# # Create x and y and train test splits


# # Sample data
# sample = data.sample(frac=.2)

# x = sample.drop('dep_delay_c', axis=1)
# y = sample['dep_delay_c']

# x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.7)
# x_train.shape

In [None]:
# # Standardize for KNN or Neural
# # standardize
# from sklearn.preprocessing import StandardScaler
# ss = StandardScaler()

# ss.fit(x_train)
# x_train = ss.transform(x_train)
# x_test = ss.trasform(x_test)

## Feature Reduction

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca.fit(x_train)
print 'Total variance explained:\n{}\n'.format(sum(pca.explained_variance_ratio_))
print 'Variance breakdown:\n{}'.format(pca.explained_variance_ratio_)

In [None]:
x_train = pca.transform(x_train)
x_test = pca.transform(x_test)

### .66 is baseline of flights that are delayed <= 5 Min

## Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
mlp = MLPClassifier()
mlp.fit(x_train, y_train)

In [None]:
mlp.score(x_train, y_train)

## Random Forest & KNN

In [32]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [33]:
#Random Forest Classifier MDW
clf = RandomForestClassifier(n_estimators = 100, min_samples_leaf=40,  n_jobs=-1)
clf.fit(mdw_x_train, mdw_y_train)
print 'Train Score: {}'.format(clf.score(mdw_x_train, mdw_y_train))
print 'Test Score: {}'.format(clf.score(mdw_x_test, mdw_y_test))

Train Score: 0.727279263733
Test Score: 0.704076497232


In [34]:
# Random Forest Feature importances MDW
f_imp = pd.DataFrame(zip(mdw_x_train.columns,clf.feature_importances_), columns=['Feature', 'Importance'])
f_imp.head(15).sort_values(['Importance'], ascending = False)

Unnamed: 0,Feature,Importance
6,crs_dep_time_hours,0.22576
2,delta_time_min,0.156953
3,crs_dep_year,0.028438
1,hourly_departures,0.024777
11,b1_orig_temp,0.016893
4,crs_dep_week,0.012458
7,orig_temp,0.01171
0,daily_departures,0.009839
13,b1_orig_wind_speed,0.008683
9,orig_wind_speed,0.007871


In [35]:
# Confusion Matrix MDW
predictions = clf.predict(mdw_x_train)
cm = confusion_matrix(mdw_y_train, predictions)
cm = pd.DataFrame(cm, columns=['no_delay', 'delay'], index= ['predict_no_delay', 'predict_delay'])
cm

Unnamed: 0,no_delay,delay
predict_no_delay,4971,2005
predict_delay,1788,5144


In [36]:
#Random Forest Classifier ORD
clf = RandomForestClassifier(n_estimators = 100, min_samples_leaf=50,  n_jobs=-1)
clf.fit(ohr_x_train, ohr_y_train)
print 'Train Score: {}'.format(clf.score(ohr_x_train, ohr_y_train))
print 'Test Score: {}'.format(clf.score(ohr_x_test, ohr_y_test))

Train Score: 0.693511348465
Test Score: 0.653501121356


In [37]:
# Random Forest Feature importances ORD
f_imp = pd.DataFrame(zip(ohr_x_train.columns,clf.feature_importances_), columns=['Feature', 'Importance'])
f_imp.head(15).sort_values(['Importance'], ascending = False)

Unnamed: 0,Feature,Importance
6,crs_dep_time_hours,0.159649
2,delta_time_min,0.106123
3,crs_dep_year,0.033632
1,hourly_departures,0.029207
7,orig_temp,0.026252
4,crs_dep_week,0.026245
11,b1_orig_temp,0.023741
0,daily_departures,0.022847
13,b1_orig_wind_speed,0.015853
9,orig_wind_speed,0.011138


In [39]:
# Confusion Matrix ORD
predictions = clf.predict(ohr_x_train)
cm = confusion_matrix(ohr_y_train, predictions)
cm = pd.DataFrame(cm, columns=['no_delay', 'delay'], index= ['predict_no_delay', 'predict_delay'])
cm

Unnamed: 0,no_delay,delay
predict_no_delay,5282,3598
predict_delay,2141,7704


In [None]:
# KNN
clf = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
clf.fit(x_train, y_train)
print 'Train Score: {}'.format(clf.score(x_train, y_train))
print 'Test Score: {}'.format(clf.score(x_test, y_test))

In [None]:
# Extra Trees Classifier
clf = ExtraTreesClassifier(n_estimators = 100, min_samples_leaf=50,  n_jobs=-1)
clf.fit(x_train, y_train)
print 'Train Score: {}'.format(clf.score(x_train, y_train))
print 'Test Score: {}'.format(clf.score(x_test, y_test))

In [None]:
# Classification Report
cr = classification_report(y_train, predictions)
print cr

In [None]:
## WIP -- to model likely prior airport
prior_group = df[df['YEAR'] == 2016].groupby(['YEAR', 'ORIGIN_AIRPORT_ID', 'UNIQUE_CARRIER', 'FL_NUM']).agg(
                                                {'PRIOR_AIRPORT': pd.Series.nunique,
                                                'DEST_AIRPORT_ID' : pd.Series.nunique}).reset_index()

prior_group['PRIOR_AIRPORT'].value_counts()
# prior_group['FL_NUM'] = prior_group['FL_NUM'].astype(str)
# prior_group['Route'] = prior_group['UNIQUE_CARRIER'] + prior_group['FL_NUM']

# sns.barplot(x='Route', y='PRIOR_AIRPORT', data=prior_group)