In [30]:
import os
import pandas as pd
import numpy as np

In [31]:
# locate all csv files in directory
directory = '../Assets/Datasets/BTS_flight_data/Raw'

file_names = []
for filename in os.listdir(directory):
    if filename.endswith(".csv"): 
        file_names.append(os.path.join(directory, filename))


In [32]:
# combine all csvs in pandas
dataframes = []
ct = 1
files = len(file_names)
for csv in file_names:
    dataframes.append(pd.read_csv(csv))
    ct += 1

df = pd.concat(dataframes)

In [33]:
# remove cancelled flights and diverted flights
df = df[(df['CANCELLED'] == 0) & (df['DIVERTED'] == 0)]

In [5]:
# 2400 isn't a real time, replace with 2359. Remove decimal points
columns = ['DEP_TIME', 'ARR_TIME', 'CRS_DEP_TIME', 'CRS_ARR_TIME']
for column in columns:
    df[column] = df[column].astype(str).str.replace('2400', '2359')
    df[column] = df[column].str.split('.').apply(lambda x: x[0])

In [6]:
times = [['CRS_FL_TIME', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_FL_DATETIME'],
         ['FL_TIME', 'DEP_TIME', 'ARR_TIME', 'FL_DATETIME']]

for time in times:
    # scheduled and actual Departure/Arrival time for ohare and midway
    df.loc[:, time[0]] =  np.where(((df['ORIGIN_AIRPORT_ID'] == 13930) | (df['ORIGIN_AIRPORT_ID'] == 13232)),
                                        df[time[1]], df[time[2]])

    # adjust time to string in proper format
    df[time[0]] = df[time[0]].astype(int).astype(str)
    df[time[0]] = df[time[0]].apply(lambda x: x[0:-2] + ':' + x[-2:] 
                                                if len(x) > 2
                                                else '0:' + x) 

    # concatenate date & time and change to datetime type
    df[time[3]] = df['FL_DATE'] + ' ' + df[time[0]]
    df[time[3]] = pd.to_datetime(df[time[3]], infer_datetime_format = True)

In [7]:
# Remove null TAIL_NUM rows
df.dropna(subset=['TAIL_NUM'], inplace=True)
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)

In [8]:
# remove departures with a non-n tail number
df = df[df['TAIL_NUM'].str[0] == 'N']

In [9]:
# Adjust TAIL_NUM to fit FAA specs
## An N-number can be in any of these formats
## One to five digits (N12345)
## One to four digits followed by one letter (N1234Z)
## One to three digits followed by two letters (N123AZ)
## N-numbers do not have
## A zero (0) as the first number
## The letters "I" or "O"

def remove_leading_0(n_num):
    if n_num[1] != '0':
        return 'N'+ n_num[1:]     
    else:
        return remove_leading_0(n_num[1:])

df['TAIL_NUM'] = df.loc[:, 'TAIL_NUM'].apply(remove_leading_0)

In [10]:
# get list of all tail numbers
tails = df['TAIL_NUM'].unique()
tails = pd.DataFrame(tails)
tails.columns = ['TailNum']

In [11]:
# Create fixed tail num match table (some tail numbers in BTS data are incorrect)
match = pd.read_csv('../Assets/Datasets/n_num_match/n_num_match.csv')
match = match[['TailNum', 'TailNum_fixed']]

tails = pd.merge(tails, match, on='TailNum', how='left')
tails.fillna('Missing', inplace=True)
tails['TailNum_fixed'] = np.where(tails['TailNum_fixed'] == 'Missing', tails['TailNum'], tails['TailNum_fixed'])
tails.drop_duplicates(inplace=True)

In [12]:
# Join Fixed tails to df
df = pd.merge(df, tails, how='left', left_on='TAIL_NUM', right_on='TailNum')
df.drop(['TailNum', 'TAIL_NUM'], axis=1, inplace=True)
df.columns = [u'YEAR', u'MONTH', u'DAY_OF_MONTH', u'DAY_OF_WEEK', u'FL_DATE',
       u'UNIQUE_CARRIER', u'FL_NUM', u'ORIGIN_AIRPORT_ID',
       u'ORIGIN_CITY_MARKET_ID', u'DEST_AIRPORT_ID', u'DEST_CITY_MARKET_ID',
       u'CRS_DEP_TIME', u'DEP_TIME', u'DEP_DELAY_NEW', u'TAXI_OUT',
       u'WHEELS_OFF', u'WHEELS_ON', u'TAXI_IN', u'CRS_ARR_TIME', u'ARR_TIME',
       u'ARR_DELAY_NEW', u'CANCELLED', u'CANCELLATION_CODE', u'DIVERTED',
       u'CRS_ELAPSED_TIME', u'ACTUAL_ELAPSED_TIME', u'AIR_TIME', u'FLIGHTS',
       u'DISTANCE', u'CARRIER_DELAY', u'WEATHER_DELAY', u'NAS_DELAY',
       u'SECURITY_DELAY', u'LATE_AIRCRAFT_DELAY', u'Unnamed: 35',
       u'CRS_FL_TIME', u'CRS_FL_DATETIME', u'FL_TIME', u'FL_DATETIME',
       u'TAIL_NUM']


In [16]:
# Replace CANCELLATION CODE nulls with 'N'
df['CANCELLATION_CODE'].fillna('N', inplace=True)

In [17]:
# Create week column
df['WEEK'] = df['FL_DATETIME'].apply(lambda x: x.week)

In [18]:
# Find prior airport and scheduled time dif beteween landing in chicago and departing
df = df.sort_values(['TAIL_NUM', 'FL_DATETIME'])
df_2 = df[['TAIL_NUM', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'FL_DATETIME', 'CRS_FL_DATETIME']]

In [19]:
# Assigns prior column by shifting one row back
df_2.loc[:, 'PRIOR_DEST_ID'] = df_2['DEST_AIRPORT_ID'].shift()
df_2.loc[:, 'PRIOR_ORIG_ID'] = df_2['ORIGIN_AIRPORT_ID'].shift()
df_2.loc[:, 'PRIOR_TAIL_NUM'] = df_2['TAIL_NUM'].shift()
df_2.loc[:, 'PRIOR_FL_DATETIME'] = df_2['FL_DATETIME'].shift()
df_2.loc[:, 'PRIOR_CRS_FL_DATETIME'] = df_2['CRS_FL_DATETIME'].shift()
df_2.loc[:, 'PRIOR_AIRPORT'] = 0

# remove first row, has null priors
df_2 = df_2.iloc[1:, :]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [20]:
# Find difference between scheduled landing and taking off by tailnum
df_2['DELTA_TIME'] = df_2['CRS_FL_DATETIME'] - df_2['PRIOR_CRS_FL_DATETIME']
df_2['DELTA_TIME_MIN'] = (df_2['DELTA_TIME'].apply(lambda x: x.days *24*60) + 
                          df_2['DELTA_TIME'].apply(lambda x: x.seconds / 60))

In [21]:
# Add PRIOR_AIRPORT column
# takes a few min

df_2['PRIOR_AIRPORT'] = np.where(((df_2['ORIGIN_AIRPORT_ID'] == 13930) | (df_2['ORIGIN_AIRPORT_ID'] == 13232)) &
                                 (df_2['PRIOR_DEST_ID'] == df_2['ORIGIN_AIRPORT_ID']) &
                                 (df_2['TAIL_NUM'] == df_2['PRIOR_TAIL_NUM']), 
                                 df_2['PRIOR_ORIG_ID'], 0)

In [22]:
df_pa = pd.DataFrame(df_2[['PRIOR_AIRPORT', 'DELTA_TIME_MIN']])
df = df.join(df_pa, how='left')

In [24]:
# remove departures with no PRIOR_AIRPORT (83515 rows)
df = df[df['PRIOR_AIRPORT'] != 0]
df.shape

(2038321, 43)

In [25]:
df.drop('Unnamed: 35', axis=1, inplace=True)

In [28]:
# Departures from MDW & ORD only
df = df[df['ORIGIN_AIRPORT_ID'].isin([13930, 13232])]
df.shape

(2038320, 42)

In [29]:
# Create MDW & ORD departures
df.to_csv('../Assets/Datasets/BTS_flight_data/Processed/chicago_departures.csv', index=False)