In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_pickle('FinalData.pkl')

Next step is for us to remove the last 4 characters on the city columns. Those 4 characters refer to the state they belong to, but that information is also stored in the adjacent columns, so we would like to get the clean name of the city in order to join the table with another table, where we will obtain the city population.

In [3]:
for i in tqdm(range(len(df))):
    df['FCity'].iloc[i] = df['FCity'].iloc[i][:-4]
    df['TCity'].iloc[i] = df['TCity'].iloc[i][:-4]

100%|██████████| 118319/118319 [05:35<00:00, 352.17it/s]


In [4]:
df

Unnamed: 0,Route,Month,Carrier,From,FCity,FST,FShare,FGDPpc,To,TCity,TST,TShare,TGDPpc,Delay,Flights,Dist
0,ABE-ATL,1,9E,ABE,Allentown/Bethlehem/Easton,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,7.073171,41,692
1,ABE-ATL,1,DL,ABE,Allentown/Bethlehem/Easton,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,-0.923077,26,692
2,ABE-ATL,2,9E,ABE,Allentown/Bethlehem/Easton,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,20.382353,34,692
3,ABE-ATL,2,DL,ABE,Allentown/Bethlehem/Easton,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,1.964286,28,692
4,ABE-ATL,3,9E,ABE,Allentown/Bethlehem/Easton,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,4.929825,57,692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118314,YUM-PHX,10,YV,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,AZ,1.71,48.15,5.543860,57,160
118315,YUM-PHX,11,OO,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,AZ,1.71,48.15,4.470588,102,160
118316,YUM-PHX,11,YV,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,AZ,1.71,48.15,5.375000,40,160
118317,YUM-PHX,12,OO,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,AZ,1.71,48.15,-1.022727,88,160


Finally, we are going to replace all the city fields that contain more than one city name. There are some airports that belong to two different cities, and this will be an issue when we try to join the tables. For that sole reason, we require city fields to contain the name of only one city.

In [5]:
df.FCity.replace('New York', 'New York City', inplace = True)
df.FCity.replace('Newark', 'New York City', inplace = True)
df.FCity.replace('Dallas/Fort Worth', 'Dallas', inplace = True)
df.FCity.replace('Montrose/Delta', 'Montrose', inplace = True)
df.FCity.replace('Newburgh/Poughkeepsie', 'New York City', inplace = True)
df.FCity.replace('Gulfport/Biloxi', 'Gulfport', inplace = True)
df.FCity.replace('Allentown/Bethlehem/Easton', 'Allentown', inplace = True)
df.FCity.replace('Raleigh/Durham', 'Raleigh', inplace = True)
df.FCity.replace('Jacksonville/Camp Lejeune', 'Jacksonville', inplace = True)
df.TCity.replace('New York', 'New York City', inplace = True)
df.TCity.replace('Newark', 'New York City', inplace = True)
df.TCity.replace('Dallas/Fort Worth', 'Dallas', inplace = True)
df.TCity.replace('Montrose/Delta', 'Montrose', inplace = True)
df.TCity.replace('Newburgh/Poughkeepsie', 'New York City', inplace = True)
df.TCity.replace('Gulfport/Biloxi', 'Gulfport', inplace = True)
df.TCity.replace('Allentown/Bethlehem/Easton', 'Allentown', inplace = True)
df.TCity.replace('Raleigh/Durham', 'Raleigh', inplace = True)
df.TCity.replace('Jacksonville/Camp Lejeune', 'Jacksonville', inplace = True)

In [6]:
df

Unnamed: 0,Route,Month,Carrier,From,FCity,FST,FShare,FGDPpc,To,TCity,TST,TShare,TGDPpc,Delay,Flights,Dist
0,ABE-ATL,1,9E,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,7.073171,41,692
1,ABE-ATL,1,DL,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,-0.923077,26,692
2,ABE-ATL,2,9E,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,20.382353,34,692
3,ABE-ATL,2,DL,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,1.964286,28,692
4,ABE-ATL,3,9E,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,4.929825,57,692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118314,YUM-PHX,10,YV,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,AZ,1.71,48.15,5.543860,57,160
118315,YUM-PHX,11,OO,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,AZ,1.71,48.15,4.470588,102,160
118316,YUM-PHX,11,YV,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,AZ,1.71,48.15,5.375000,40,160
118317,YUM-PHX,12,OO,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,AZ,1.71,48.15,-1.022727,88,160


Finally, we are going to merge the city population into the main dataframe. 

We have information on the top 200 cities in the US, which have a population of 140.000 or more. For all those cities not included in this list of 200, we are going to assign a population of 100.000 as a default. This will not be too important as we will see later how is population taken into account when we perform the model.

Moreover, we are going to work with the population in millions. For this reason, we also perform the calculation, and it makes working with this numbers easier because of their smaller scale.

In [7]:
citydf = pd.read_excel('Data/PopData.xlsx')
citydf['Pop'] = round(citydf['Pop']/1000000, 2)
citydf

Unnamed: 0,City,Pop
0,New York City,8.32
1,Los Angeles,4.02
2,Chicago,2.69
3,Houston,2.34
4,Phoenix,1.70
...,...,...
195,Miramar,0.14
196,Bridgeport,0.14
197,Olathe,0.14
198,Denton,0.14


In [8]:
df = df.merge(citydf, left_on = 'FCity', right_on = 'City', how = 'left')
df = df.merge(citydf, left_on = 'TCity', right_on = 'City', how = 'left')

In [9]:
df = df.drop(['City_x', 'City_y'], axis = 1)

In [10]:
df = df.fillna(0.1)

In [11]:
df

Unnamed: 0,Route,Month,Carrier,From,FCity,FST,FShare,FGDPpc,To,TCity,TST,TShare,TGDPpc,Delay,Flights,Dist,Pop_x,Pop_y
0,ABE-ATL,1,9E,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,7.073171,41,692,0.1,0.52
1,ABE-ATL,1,DL,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,-0.923077,26,692,0.1,0.52
2,ABE-ATL,2,9E,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,20.382353,34,692,0.1,0.52
3,ABE-ATL,2,DL,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,1.964286,28,692,0.1,0.52
4,ABE-ATL,3,9E,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,GA,2.90,56.03,4.929825,57,692,0.1,0.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122568,YUM-PHX,10,YV,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,AZ,1.71,48.15,5.543860,57,160,0.1,1.70
122569,YUM-PHX,11,OO,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,AZ,1.71,48.15,4.470588,102,160,0.1,1.70
122570,YUM-PHX,11,YV,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,AZ,1.71,48.15,5.375000,40,160,0.1,1.70
122571,YUM-PHX,12,OO,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,AZ,1.71,48.15,-1.022727,88,160,0.1,1.70


### Defining dummy variables.
We are now going to define several dummy variables. Dummies are binary variables that take either the value 0 or the value 1 depending on whether the observation does or does not satisfy a condition. These dummies are, by definition, creating two categories that are assumed to have differences due to the fact of belonging to such categories.

The first important feature we want to store as a dummy variable is whether the route is departing from or arriving to one of the US hubs. Hubs are main airports where the airlines set their operational bases. It is impossible for any airline to perform all possible routes within any pair of airports in the US. However, companies usually make all their destinations connect to their hubs, where then passengers can transit into another flight of the same company that completes the desired journey. 

Due to its nature, hubs are airports with a huge load of incoming and departing flights, and so it seems safe to assume that the amount of flights in a route will be heavily determined by whether one of the two connected airports is a hub or not.

On the other hand, we also want to take a look at flights going to or departing from Hawaii and Alaska. Those two states are the ones which are not part of mainland US, and the only way to connect them to the rest of states is by an air connection. We can also assume that, given there are no alternatives to travel to or from those states, there is going to be a larger number of flights for such routes, and then, it will be a factor we should take into account. 

By the same logic, we can also create a feature that will respond to whether the route is connecting two cities within the same state. There are some routes that connect a pair of cities of the same state, and in this case, there may be other means of transportation that could be used other than planes. For this reason, we also believe this could be a factor that influences the amount of flights in a route, so we will also create such a dummy.

In [12]:
hubs = ['ATL', 'LAX', 'DFW', 'EWR', 'JFK', 'SFO', 'DEN', 'MIA', 'ORD', 'BOS', 'DAL', 'SEA']
away = ['HI', 'AK']

In [13]:
df['Hub'] = np.where(df['From'].isin(hubs)+df['To'].isin(hubs)>0, 1, 0)

In [14]:
df['Hub'].value_counts()

0    64271
1    58302
Name: Hub, dtype: int64

In [15]:
df['Away'] = np.where(df['FST'].isin(away)+df['TST'].isin(away)>0, 1, 0)

In [16]:
df['Away'].value_counts()

0    119019
1      3554
Name: Away, dtype: int64

In [17]:
df['Within'] = np.where(df['FST'] == df['TST'], 1, 0)

In [18]:
df

Unnamed: 0,Route,Month,Carrier,From,FCity,FST,FShare,FGDPpc,To,TCity,...,TShare,TGDPpc,Delay,Flights,Dist,Pop_x,Pop_y,Hub,Away,Within
0,ABE-ATL,1,9E,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,...,2.90,56.03,7.073171,41,692,0.1,0.52,1,0,0
1,ABE-ATL,1,DL,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,...,2.90,56.03,-0.923077,26,692,0.1,0.52,1,0,0
2,ABE-ATL,2,9E,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,...,2.90,56.03,20.382353,34,692,0.1,0.52,1,0,0
3,ABE-ATL,2,DL,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,...,2.90,56.03,1.964286,28,692,0.1,0.52,1,0,0
4,ABE-ATL,3,9E,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,...,2.90,56.03,4.929825,57,692,0.1,0.52,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122568,YUM-PHX,10,YV,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,...,1.71,48.15,5.543860,57,160,0.1,1.70,0,0,1
122569,YUM-PHX,11,OO,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,...,1.71,48.15,4.470588,102,160,0.1,1.70,0,0,1
122570,YUM-PHX,11,YV,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,...,1.71,48.15,5.375000,40,160,0.1,1.70,0,0,1
122571,YUM-PHX,12,OO,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,...,1.71,48.15,-1.022727,88,160,0.1,1.70,0,0,1


In [19]:
df['Delay'] = round(df['Delay'], 0)

In [20]:
df

Unnamed: 0,Route,Month,Carrier,From,FCity,FST,FShare,FGDPpc,To,TCity,...,TShare,TGDPpc,Delay,Flights,Dist,Pop_x,Pop_y,Hub,Away,Within
0,ABE-ATL,1,9E,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,...,2.90,56.03,7.0,41,692,0.1,0.52,1,0,0
1,ABE-ATL,1,DL,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,...,2.90,56.03,-1.0,26,692,0.1,0.52,1,0,0
2,ABE-ATL,2,9E,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,...,2.90,56.03,20.0,34,692,0.1,0.52,1,0,0
3,ABE-ATL,2,DL,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,...,2.90,56.03,2.0,28,692,0.1,0.52,1,0,0
4,ABE-ATL,3,9E,ABE,Allentown,PA,3.87,62.66,ATL,Atlanta,...,2.90,56.03,5.0,57,692,0.1,0.52,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122568,YUM-PHX,10,YV,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,...,1.71,48.15,6.0,57,160,0.1,1.70,0,0,1
122569,YUM-PHX,11,OO,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,...,1.71,48.15,4.0,102,160,0.1,1.70,0,0,1
122570,YUM-PHX,11,YV,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,...,1.71,48.15,5.0,40,160,0.1,1.70,0,0,1
122571,YUM-PHX,12,OO,YUM,Yuma,AZ,1.71,48.15,PHX,Phoenix,...,1.71,48.15,-1.0,88,160,0.1,1.70,0,0,1


In [21]:
df.to_csv('Flights19.csv')

In [22]:
df.to_pickle('Flights19.pkl')