In [62]:
% matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.cross_validation import train_test_split
pd.options.display.max_columns = 50

In [63]:
# Load the processed BTS departures CSV into a DataFrame.
df = pd.read_csv('../Assets/Datasets/BTS_flight_data/Processed/chicago_departures.csv')

# Parenthesised so the cell also runs under Python 3; with a single argument
# Python 2 prints the same (rows, columns) tuple as the bare print statement.
print(df.shape)

(2038320, 42)


In [64]:
# Show the column labels of df.
df.columns

Index([u'YEAR', u'MONTH', u'DAY_OF_MONTH', u'DAY_OF_WEEK', u'FL_DATE',
       u'UNIQUE_CARRIER', u'FL_NUM', u'ORIGIN_AIRPORT_ID',
       u'ORIGIN_CITY_MARKET_ID', u'DEST_AIRPORT_ID', u'DEST_CITY_MARKET_ID',
       u'CRS_DEP_TIME', u'DEP_TIME', u'DEP_DELAY_NEW', u'TAXI_OUT',
       u'WHEELS_OFF', u'WHEELS_ON', u'TAXI_IN', u'CRS_ARR_TIME', u'ARR_TIME',
       u'ARR_DELAY_NEW', u'CANCELLED', u'CANCELLATION_CODE', u'DIVERTED',
       u'CRS_ELAPSED_TIME', u'ACTUAL_ELAPSED_TIME', u'AIR_TIME', u'FLIGHTS',
       u'DISTANCE', u'CARRIER_DELAY', u'WEATHER_DELAY', u'NAS_DELAY',
       u'SECURITY_DELAY', u'LATE_AIRCRAFT_DELAY', u'CRS_FL_TIME',
       u'CRS_FL_DATETIME', u'FL_TIME', u'FL_DATETIME', u'TAIL_NUM', u'WEEK',
       u'PRIOR_AIRPORT', u'DELTA_TIME_MIN'],
      dtype='object')

## Join Weather Data

In [50]:
# Import and format weather data
weather = pd.read_csv('../Assets/Datasets/Weather/Clean_Weather_Hourly.csv')
# Keep only the columns used downstream. 'Unnamed: 0' is not in this selection,
# so it is discarded here without a separate in-place drop. .copy() makes the
# result an independent frame, so the column assignments below do not trigger
# SettingWithCopyWarning.
weather = weather[['Airport', 'Date', 'Hour', 'Daypart', 'Temp', 'Visibility', 'Wind Speed',
                   'Precip', 'Conditions']].copy()
# Split 'Y-M-D' once and zero-pad month/day so every date reads 'YYYY-MM-DD'.
date_parts = weather['Date'].str.split('-')
weather['Year'] = date_parts.str[0]
weather['Month'] = date_parts.str[1].str.zfill(2)
weather['Day'] = date_parts.str[2].str.zfill(2)
weather['Date'] = weather['Year'] + '-' + weather['Month'] + '-' + weather['Day']

In [29]:
# Derive the two BTS-side join keys: scheduled departure hour and airport code.
# CRS_DEP_TIME divided by 100 and truncated to int gives the hour part.
dep_hour = (df['CRS_DEP_TIME'] / 100).astype(int)
df['CRS_DEP_HOUR'] = dep_hour
# Origin id 13930 is labelled 'KORD'; every other origin is labelled 'KMDW'.
from_13930 = df['ORIGIN_AIRPORT_ID'] == 13930
df['AIRPORT_CODE'] = np.where(from_13930, 'KORD', 'KMDW')


In [None]:
# Attach hourly weather to each departure. how='left' keeps every row of df;
# the weather columns are filled only where airport / date / hour all match.
df = df.merge(
    weather,
    how='left',
    left_on=['AIRPORT_CODE', 'FL_DATE', 'CRS_DEP_HOUR'],
    right_on=['Airport', 'Date', 'Hour'],
)

## ISSUE: some dates appear not to have been pulled from Wunderground

In [51]:
# Remove columns from join if it fails
# df.drop(['Airport', 'Date', 'Hour', 'Daypart', 'Temp', 'Visibility',
#       'Wind Speed', 'Precip', 'Conditions'], inplace=True, axis=1)

In [70]:
# Narrow df to the columns the model needs.
model_columns = [
    'YEAR', 'WEEK', 'DAY_OF_WEEK',
    'UNIQUE_CARRIER', 'TAIL_NUM', 'FL_NUM', 'PRIOR_AIRPORT',
    'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID',
    'CRS_DEP_TIME', 'DEP_TIME',
    'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY_NEW',
]
data = df[model_columns]

## Join Airplane Data

In [65]:
# Load the cleaned airplane-info table. drop() returns a new frame without the
# 'Unnamed: 0' column, so the result is bound in a single expression.
tail_match = pd.read_csv(
    '../Assets/Datasets/FlightTracker_Airplane/FA_Airplane_Info_Clean.csv'
).drop('Unnamed: 0', axis=1)

Unnamed: 0,N-NUMBER,mfr,Model,mfr_year,seats


In [69]:
# Preview the first rows of the airplane-info table.
tail_match.head()

Unnamed: 0,N-NUMBER,mfr,Model,mfr_year,seats
0,N10156,EMBRAER,EMB-145,2004.0,55.0
1,N102UW,AIRBUS,A320,1998.0,182.0
2,N103SY,EMBRAER,S A ERJ 170-200 LR,2014.0,88.0
3,N103US,AIRBUS,A320,1999.0,182.0
4,N104UA,BOEING,747,1998.0,495.0


In [71]:
# Left join: keep every row of data and add airplane attributes wherever
# TAIL_NUM equals an N-NUMBER in tail_match.
data = data.merge(
    tail_match,
    how='left',
    left_on='TAIL_NUM',
    right_on='N-NUMBER',
)

In [74]:
# 3.6% of departures have no tail match
# a lot of the matches are because there isn't a year
# NOTE(review): both counts are hard-coded. 2038320 equals the row count
# printed when df was loaded; 74707 is presumably the number of rows left
# without an N-NUMBER after the tail_match join -- confirm, or derive both
# values from the data instead.
74707/2038320.0

0.03665126182346246

In [75]:
# Rows whose N-NUMBER is null, i.e. the left join found no partner in tail_match.
missing_n_number = data['N-NUMBER'].isnull()
no_n = data[missing_n_number]

In [77]:
# Count the unmatched rows per tail number, most frequent first.
no_n['TAIL_NUM'].value_counts()

N504AE    2048
N528EG    2014
NEGMQ     1981
N620AE    1894
N15574    1818
N512AE    1751
N621AE    1713
N14558    1523
N15555    1468
N18557    1444
N679SA    1337
N851AE    1321
N915SW    1218
N918SW    1168
N917SW    1132
N8303R     978
N466UA     926
N928WN     867
N299WN     855
N298WN     851
N927WN     848
N9405T     802
N767SW     799
N926WN     795
N986CA     788
N271LV     786
N8606C     756
N417SW     756
N3LCAA     732
N3LKAA     729
          ... 
N8APAA       7
N8ANAA       7
N850DN       6
N879AA       6
N835AW       6
N852AS       5
N889AA       5
N826AW       5
N834AW       4
N3NRAA       3
N846AS       3
N907EV       3
N8ASAA       3
N8ATAA       3
N935EV       2
N713AE       2
N896AA       2
N867AA       2
N7LNAA       2
N887AA       2
N883AA       2
N871AA       2
N872AA       1
N7LMAA       1
N880AA       1
N3NPAA       1
N934EV       1
N251AK       1
N906EV       1
N897AA       1
Name: TAIL_NUM, dtype: int64

In [78]:
# Number of distinct tail numbers among the unmatched rows.
no_n['TAIL_NUM'].nunique()

255

## Prepare Data

In [None]:
# train_test_split returns one (train, test) pair for EACH array passed in, so
# a single frame yields only two objects and cannot be unpacked into four
# names. Features and target are therefore passed separately; the results come
# back ordered as x_train, x_test, y_train, y_test.
# NOTE(review): ARR_DELAY_NEW is assumed to be the prediction target -- confirm.
target = df['ARR_DELAY_NEW']
features = df.drop('ARR_DELAY_NEW', axis=1)
x_train, x_test, y_train, y_test = train_test_split(
    features, target,
    test_size=.35,
    random_state=42,  # fixed seed so the split is repeatable across runs
)

In [None]:
# IPython help lookup: displays the docstring of train_test_split (development aid).
train_test_split?

In [None]:
# Inspect the dtype of every column in df.
df.dtypes

In [89]:
from datetime import date, datetime, timedelta


def date_range(start, end):
    """Return every day from ``start`` to ``end``, both ends inclusive.

    Parameters
    ----------
    start, end : datetime.date
        First and last day of the range.

    Returns
    -------
    list
        Consecutive dates one day apart, in ascending order; empty when
        ``start`` is later than ``end``.
    """
    span_days = (end - start).days
    return [start + timedelta(days=offset) for offset in range(span_days + 1)]


# Un-padded 'YYYY-M-D' strings, one per day in the range.
# The loop variable is called `day`, not `date`: reusing `date` would rebind the
# imported datetime.date class to a date instance and break later date(...) calls.
lst = ['{}-{}-{}'.format(day.year, day.month, day.day)
       for day in date_range(date(2011, 1, 1), date(2015, 1, 26))]

In [100]:
# `lst` is created as a plain Python list, and a list has no `.columns`
# attribute, so assigning to it raises AttributeError on a fresh run. Build the
# one-column frame that the date-formatting code expects instead.
lst = pd.DataFrame(lst, columns=['Date'])

In [102]:

# Zero-pad month and day so every date reads 'YYYY-MM-DD'. The date string is
# split once and the pieces are taken with vectorised .str methods rather than
# three separate splits and per-row lambdas.
date_parts = lst['Date'].str.split('-')
lst['Year'] = date_parts.str[0]
lst['Month'] = date_parts.str[1].str.zfill(2)
lst['Day'] = date_parts.str[2].str.zfill(2)
lst['Date'] = lst['Year'] + '-' + lst['Month'] + '-' + lst['Day']

In [103]:
# Preview the first ten rows instead of rendering the whole frame.
lst.head(10)

Unnamed: 0,Date,Year,Month,Day
0,2011-01-01,2011,01,01
1,2011-01-02,2011,01,02
2,2011-01-03,2011,01,03
3,2011-01-04,2011,01,04
4,2011-01-05,2011,01,05
5,2011-01-06,2011,01,06
6,2011-01-07,2011,01,07
7,2011-01-08,2011,01,08
8,2011-01-09,2011,01,09
9,2011-01-10,2011,01,10


In [115]:
# Header names applied, in this order, to the raw hourly weather exports.
columns = [
    'Time (CST)',
    'Temp',
    'Dew Point',
    'Humidity',
    'Pressure',
    'Visibility',
    'Wind Dir',
    'Wind Speed',
    'Gust Speed',
    'Precip',
    'Events',
    'Conditions',
    'Airport',
    'Date',
]

In [138]:
# Load the first raw hourly weather export. This rebinds `weather`, replacing
# the cleaned hourly frame read from Clean_Weather_Hourly.csv earlier.
# NOTE(review): read_csv consumes the first line of the file as the header. If
# this export has no header row, that first observation is lost once the column
# names are overwritten -- confirm, or pass header=None / names=columns.
weather = pd.read_csv('../Assets/Datasets/Weather/Weather_Hourly_20161251127.csv')


In [139]:
# Replace the header read from the file with the shared `columns` list.
weather.columns = columns

In [140]:
# Load the second raw hourly weather export.
# NOTE(review): as with the first export, the file's first line is read as the
# header -- confirm the file really has one.
weather2 = pd.read_csv('../Assets/Datasets/Weather/Weather_Hourly_20161251227.csv')


In [141]:
# Give the second export the same `columns` header so both frames line up.
weather2.columns = columns

In [142]:
# Stack the two exports row-wise; each frame keeps its own row labels.
hourly_frames = [weather, weather2]
weather_combined = pd.concat(hourly_frames)

In [143]:
# Write the combined rows to disk with neither a header line nor the index.
# NOTE(review): the target name has no '.csv' extension, unlike the two input
# files -- confirm this is intended.
weather_combined.to_csv('../Assets/Datasets/Weather/Weather_Hourly_2016125129', header=False, index=False)