In [1]:
% matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.cross_validation import train_test_split
# Raise pandas' display limit to 50 columns so wide frames are shown with
# fewer columns elided when a cell displays them.
pd.options.display.max_columns = 50



In [2]:
# Load the processed Chicago departures table and report its (rows, columns) shape.
df = pd.read_csv('../Assets/Datasets/BTS_flight_data/Processed/chicago_departures.csv')

# print(...) with a single argument prints the same text on Python 2 and Python 3,
# whereas the bare `print x` statement is a syntax error on Python 3.
print(df.shape)

(2038320, 42)


## Join Weather Data

In [4]:
# Import and format weather data.
weather = pd.read_csv('../Assets/Datasets/Weather/Clean_Weather_Hourly.csv')

# Keep only the columns used later. The explicit .copy() makes `weather` an
# independent frame, so the column assignments below cannot trigger pandas'
# SettingWithCopyWarning.
weather = weather[['Airport', 'Date', 'Hour', 'Daypart', 'Temp', 'Visibility', 'Wind Speed',
                   'Precip', 'Conditions']].copy()

# Split 'Date' on '-' once and reuse the pieces instead of re-splitting per field.
date_parts = weather['Date'].str.split('-')
weather['Year'] = date_parts.str[0]
# Left-pad month and day with '0' to two characters (e.g. '3' -> '03').
weather['Month'] = date_parts.str[1].str.zfill(2)
weather['Day'] = date_parts.str[2].str.zfill(2)

# Rebuild 'Date' from the padded parts; presumably this matches the format of
# FL_DATE, which is the key it is joined against -- TODO confirm.
weather['Date'] = weather['Year'] + '-' + weather['Month'] + '-' + weather['Day']

In [5]:
# Build the flight-side join keys so they line up with the weather columns.

# Scheduled departure hour: CRS_DEP_TIME divided by 100, then cast to int.
df['CRS_DEP_HOUR'] = (df['CRS_DEP_TIME'] / 100).astype(int)

# Airport code: origin id 13930 is labelled 'KORD'; every other origin id is
# labelled 'KMDW'.
is_ord = df['ORIGIN_AIRPORT_ID'] == 13930
df['AIRPORT_CODE'] = np.where(is_ord, 'KORD', 'KMDW')


In [6]:
# Left-join the hourly weather onto the flights by airport, date and scheduled
# departure hour. Flights with no matching weather row are kept, with NaN in the
# weather-side columns.
flight_keys = ['AIRPORT_CODE', 'FL_DATE', 'CRS_DEP_HOUR']
weather_keys = ['Airport', 'Date', 'Hour']
df = df.merge(weather, how='left', left_on=flight_keys, right_on=weather_keys)

In [7]:
# Remove rows that have no weather info (5,207): a flight left unmatched by the
# left join has NaN in the weather-side 'Airport' column.
# Rebinding the result (rather than inplace=True) avoids mutating the existing
# frame in place, which pandas discourages.
df = df.dropna(subset=['Airport'])

## Join Airplane Data

In [8]:
# Load the airplane lookup table and discard its 'Unnamed: 0' column
# (presumably an index written out when the CSV was saved -- TODO confirm).
tail_match = pd.read_csv(
    '../Assets/Datasets/FlightTracker_Airplane/FA_Airplane_Info_Clean.csv'
).drop('Unnamed: 0', axis=1)

In [9]:
# Attach airplane info to each flight: left join of the flights' TAIL_NUM against
# the lookup table's N-NUMBER, so flights without a match are kept with NaN in
# the airplane columns.
df = df.merge(tail_match, how='left', left_on='TAIL_NUM', right_on='N-NUMBER')

In [10]:
# Drop data that has no airplane match (1.2% of data): rows where 'mfr' is NaN.
# Rebinding the result (rather than inplace=True) avoids mutating the existing
# frame in place, which pandas discourages.
df = df.dropna(subset=['mfr'])

## Prepare Data

In [13]:
# Select the columns necessary for the model, in the order they should appear.
model_columns = [
    'YEAR', 'WEEK', 'DAY_OF_WEEK', 'Daypart', 'CRS_DEP_HOUR',
    'UNIQUE_CARRIER', 'TAIL_NUM', 'FL_NUM',
    'PRIOR_AIRPORT', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID',
    'CRS_DEP_TIME', 'DEP_TIME', 'CRS_FL_DATETIME',
    'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY_NEW', 'DELTA_TIME_MIN',
    # airplane attributes
    'mfr', 'Model', 'mfr_year', 'seats',
    # weather readings
    'Temp', 'Visibility', 'Wind Speed', 'Precip', 'Conditions',
]
data = df[model_columns]

In [44]:
# Display the column labels currently present on df.
df.columns

Index([u'YEAR', u'MONTH', u'DAY_OF_MONTH', u'DAY_OF_WEEK', u'FL_DATE',
       u'UNIQUE_CARRIER', u'FL_NUM', u'ORIGIN_AIRPORT_ID',
       u'ORIGIN_CITY_MARKET_ID', u'DEST_AIRPORT_ID', u'DEST_CITY_MARKET_ID',
       u'CRS_DEP_TIME', u'DEP_TIME', u'DEP_DELAY_NEW', u'TAXI_OUT',
       u'WHEELS_OFF', u'WHEELS_ON', u'TAXI_IN', u'CRS_ARR_TIME', u'ARR_TIME',
       u'ARR_DELAY_NEW', u'CANCELLED', u'CANCELLATION_CODE', u'DIVERTED',
       u'CRS_ELAPSED_TIME', u'ACTUAL_ELAPSED_TIME', u'AIR_TIME', u'FLIGHTS',
       u'DISTANCE', u'CARRIER_DELAY', u'WEATHER_DELAY', u'NAS_DELAY',
       u'SECURITY_DELAY', u'LATE_AIRCRAFT_DELAY', u'CRS_FL_TIME',
       u'CRS_FL_DATETIME', u'FL_TIME', u'FL_DATETIME', u'TAIL_NUM', u'WEEK',
       u'PRIOR_AIRPORT', u'DELTA_TIME_MIN', u'CRS_DEP_HOUR', u'AIRPORT_CODE',
       u'Airport', u'Date', u'Hour', u'Daypart', u'Temp', u'Visibility',
       u'Wind Speed', u'Precip', u'Conditions', u'Year', u'Month', u'Day',
       u'N-NUMBER', u'mfr', u'Model', u'mfr_year', u'seat

In [53]:
# For every (year, origin airport, carrier, flight number) combination, count the
# distinct prior airports and the distinct destination airports.
# The string 'nunique' uses pandas' built-in grouped implementation instead of
# invoking the Python-level pd.Series.nunique callable; the counts are the same.
# NOTE(review): on older pandas the callable could return float counts, whereas
# the built-in returns integers -- confirm nothing downstream relies on floats.
flight_key = ['YEAR', 'ORIGIN_AIRPORT_ID', 'UNIQUE_CARRIER', 'FL_NUM']
df.groupby(flight_key).agg(
    {'PRIOR_AIRPORT': 'nunique',
     'DEST_AIRPORT_ID': 'nunique'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,PRIOR_AIRPORT,DEST_AIRPORT_ID
YEAR,ORIGIN_AIRPORT_ID,UNIQUE_CARRIER,FL_NUM,Unnamed: 4_level_1,Unnamed: 5_level_1
2011,13232,DL,393,1.0,1
2011,13232,DL,431,1.0,1
2011,13232,DL,432,1.0,1
2011,13232,DL,651,1.0,1
2011,13232,DL,742,1.0,1
2011,13232,DL,762,1.0,1
2011,13232,DL,799,1.0,1
2011,13232,DL,843,1.0,1
2011,13232,DL,895,1.0,1
2011,13232,DL,1029,1.0,1
