In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
df = pd.read_csv('../Assets/Datasets/BTS_flight_data/Processed/raw_combined.csv')

In [3]:
df = df[(df['ORIGIN_AIRPORT_ID'] == 13930) | (df['ORIGIN_AIRPORT_ID'] == 13232)]

In [4]:
departures = df[['FL_DATE', 'DAY_OF_WEEK', 'UNIQUE_CARRIER', 'TAIL_NUM', 'ORIGIN_AIRPORT_ID', 
                 'DEST_AIRPORT_ID', 'CRS_DEP_TIME', 'DEP_DELAY_NEW', 'CANCELLED', 'DIVERTED']]
departures.shape

(2164076, 10)

In [5]:
departures['Status'] = np.where(departures['CANCELLED'] == 1, 'Cancelled', 
                                np.where(departures['DIVERTED'] == 1, 'Diverted',
                                         np.where(departures['DEP_DELAY_NEW'] > 0, 'Delayed', 'On-Time')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [6]:
departures['Status'].value_counts()

On-Time      1102583
Delayed       996703
Cancelled      59450
Diverted        5340
Name: Status, dtype: int64

In [7]:
code_match = pd.read_csv('../Assets/Datasets/BTS_flight_data/Match_Tables/AIRPORT_ID_CODE.csv')
departures = pd.merge(departures, code_match[['US_DOT_AIRPORT_ID', 'US_DOT_AIRPORT_CODE']], 
              left_on='ORIGIN_AIRPORT_ID', right_on='US_DOT_AIRPORT_ID',
              how= 'left')
departures.drop('US_DOT_AIRPORT_ID', axis=1, inplace=True)
departures = departures.rename(columns = {'US_DOT_AIRPORT_CODE':'ORIGIN_AIRPORT_CODE'})

departures = pd.merge(departures, code_match[['US_DOT_AIRPORT_ID', 'US_DOT_AIRPORT_CODE']], 
              left_on='DEST_AIRPORT_ID', right_on='US_DOT_AIRPORT_ID',
              how= 'left')
departures.drop('US_DOT_AIRPORT_ID', axis=1, inplace=True)
departures = departures.rename(columns = {'US_DOT_AIRPORT_CODE':'DEST_AIRPORT_CODE'})

In [None]:
departures.isnull().sum()

In [8]:
carrier_match = pd.read_csv('../Assets/Datasets/BTS_flight_data/Match_Tables/L_UNIQUE_CARRIERS.csv')

departures = pd.merge(departures, carrier_match, 
                      left_on='UNIQUE_CARRIER', right_on='Code',
                      how='left')

departures.drop(['UNIQUE_CARRIER', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID', 'Code'], axis=1, inplace=True)
departures = departures.rename(columns = {'Description' : 'Airline'})

In [9]:
# Replace Tail Num of cancelled flights without TN
departures['TAIL_NUM'] = np.where((departures['TAIL_NUM'].isnull()) &  (departures['CANCELLED'] == 1),
                                  'NOTAIL', departures['TAIL_NUM'])

# remove departures with a non-n tail number (<100 flights)
departures = departures[departures['TAIL_NUM'].str[0] == 'N']

# Adjust TAIL_NUM to fit FAA specs
## An N-number can be in any of these formats
## One to five digits (N12345)
## One to four digits followed by one letter (N1234Z)
## One to three digits followed by two letters (N123AZ)
## N-numbers do not have
## A zero (0) as the first number
## The letters "I" or "O"

def remove_leading_0(n_num):
    if n_num[1] != '0':
        return 'N'+ n_num[1:]     
    else:
        return remove_leading_0(n_num[1:])

departures['TAIL_NUM'] = departures.loc[:, 'TAIL_NUM'].apply(remove_leading_0)

# get list of all tail numbers
tails = departures['TAIL_NUM'].unique()
tails = pd.DataFrame(tails)
tails.columns = ['TailNum']

# Create fixed tail num match table (some tail numbers in BTS data are incorrect)
match = pd.read_csv('../Assets/Datasets/n_num_match/n_num_match.csv')
match = match[['TailNum', 'TailNum_fixed']]

tails = pd.merge(tails, match, on='TailNum', how='left')
tails.fillna('Missing', inplace=True)
tails['TailNum_fixed'] = np.where(tails['TailNum_fixed'] == 'Missing', tails['TailNum'], tails['TailNum_fixed'])
tails.drop_duplicates(inplace=True)

# Join Fixed tails to departures
departures = pd.merge(departures, tails, how='left', left_on='TAIL_NUM', right_on='TailNum')
departures.drop(['TailNum', 'TAIL_NUM'], axis=1, inplace=True)
departures = departures.rename(columns = {'TailNum_fixed' : 'TAIL_NUM'})

# import airplane info
tail_match = pd.read_csv('../Assets/Datasets/FlightTracker_Airplane/FA_Airplane_Info_Clean.csv')
tail_match.drop('Unnamed: 0', axis=1, inplace=True)

# left join data with tail match
departures = pd.merge(departures, tail_match, left_on='TAIL_NUM', right_on='N-NUMBER', how='left')

departures.drop('TAIL_NUM', axis=1, inplace=True)
departures = departures.rename(columns = {'N-NUMBER' : 'TAIL_NUM'})

In [10]:
departures['Route'] = departures['ORIGIN_AIRPORT_CODE'] + ' - ' + departures['DEST_AIRPORT_CODE']

In [11]:
# Add datetime and hour
departures['FL_DATE'] = pd.to_datetime(departures['FL_DATE'], infer_datetime_format=True)
new_df = pd.DataFrame()
new_df['Year'] = departures['FL_DATE'].apply(lambda x: x.year)
new_df['Month'] = departures['FL_DATE'].apply(lambda x: x.month)
new_df['Day'] = departures['FL_DATE'].apply(lambda x: x.day)
new_df['Hour'] = departures['CRS_DEP_TIME']/100
new_df['Hour'] = new_df['Hour'].astype(int)
new_df['Minute'] = departures['CRS_DEP_TIME']%100
new_df['CRS_DEP_DATETIME'] = new_df.apply(lambda s: datetime(*s),axis = 1)
departures = departures.join(new_df[['CRS_DEP_DATETIME', 'Hour', 'Year']])


In [12]:
actives = departures.groupby(['Year', 'ORIGIN_AIRPORT_CODE', 'Airline'])['CRS_DEP_DATETIME'].count().reset_index()
actives_2016 = actives[actives['Year'] == 2016][['ORIGIN_AIRPORT_CODE', 'Airline']]
actives_2016['Active'] = 'Active'

In [13]:
departures = pd.merge(departures, actives_2016, on=['ORIGIN_AIRPORT_CODE', 'Airline'], how='left')
departures = departures.fillna(value=({'Active' : 'Inactive'}))

In [14]:
# Add Airport Location Data and Name
columns = ['Airport ID', 'Name', 'City', 'Country', 'IATA/FAA', 'ICAO', 'Latitude', 'Longitude', 'Altitude',
           'Timezone', 'DST', 'Tz']
airport_match = pd.read_csv('../Assets/Datasets/Open_Flights/airports_match.csv', names=columns)

departures = pd.merge(departures, airport_match[['IATA/FAA', 'Name', 'Latitude', 'Longitude']],
                     left_on='ORIGIN_AIRPORT_CODE', right_on='IATA/FAA', how='left')
departures = departures.rename(columns = {'Name' : 'Orig_Name'})
departures = pd.merge(departures, airport_match[['IATA/FAA', 'Name', 'Latitude', 'Longitude']],
                     left_on='DEST_AIRPORT_CODE', right_on='IATA/FAA', how='left')
departures.drop(['IATA/FAA_x', 'IATA/FAA_y'], axis=1, inplace=True)
departures = departures.rename(columns = {'Name' : 'Dest_Name',
                                          'Latitude_x' : 'Latitude_O',
                                          'Longitude_x' : 'Longitude_O',
                                          'Latitude_y' : 'Latitude_D',
                                          'Longitude_y' : 'Longitude_D'})

In [15]:
departures.drop('FL_DATE', axis=1, inplace=True)

In [16]:
departures.to_csv('../Assets/Datasets/Outputs/chi_dep_viz.csv')

In [None]:
departures.columns

In [None]:
departures.groupby([''])