### Baseline Model

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
!ls

2007.csv             baseline_model.ipynb df.csv
airports.csv         carriers.csv         planedata.csv


In [3]:
df = pd.read_csv('df.csv').drop('Unnamed: 0', axis=1)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df.shape

(6808770, 53)

Removing Data Leakage Attributes

In [5]:
df.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'Carrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'OriginAirport', 'OriginCity', 'OriginState', 'OriginCountry',
       'OriginLat', 'OriginLong', 'DestAirport', 'DestCity', 'DestState',
       'DestCountry', 'DestLat', 'DestLong', 'TailNum.1', 'PlaneOwnership',
       'Manufacturer', 'PlaneIssueData', 'PlaneModel', 'PlaneStatus',
       'AircraftType', 'Engine', 'PlaneYear', 'OriginCityIata', 'CarrierName',
       'FlightTimeStamp'],
      dtype='object')

In [6]:
df.drop('TailNum.1',axis=1,inplace=True)

In [7]:
df = df.rename(columns={'PlaneIssueData':'PlaneIssueDate'})

In [30]:
df = df[df['Cancelled']!=1]

In [8]:
df.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,Carrier,FlightNum,...,Manufacturer,PlaneIssueDate,PlaneModel,PlaneStatus,AircraftType,Engine,PlaneYear,OriginCityIata,CarrierName,FlightTimeStamp
0,2007,1,2,2,1736.0,1720,1844.0,1835,WN,248,...,BOEING,05/16/1996,737-3H4,Valid,Fixed Wing Multi-Engine,Turbo-Fan,1996,Sacramento-SMF,SouthWest,2007-1-2
1,2007,1,2,2,2046.0,2045,2151.0,2200,WN,1311,...,BOEING,05/16/1996,737-3H4,Valid,Fixed Wing Multi-Engine,Turbo-Fan,1996,Sacramento-SMF,SouthWest,2007-1-2
2,2007,1,2,2,1228.0,1225,1341.0,1340,WN,2891,...,BOEING,05/16/1996,737-3H4,Valid,Fixed Wing Multi-Engine,Turbo-Fan,1996,Sacramento-SMF,SouthWest,2007-1-2
3,2007,1,22,1,2155.0,2155,2252.0,2310,WN,2740,...,BOEING,05/16/1996,737-3H4,Valid,Fixed Wing Multi-Engine,Turbo-Fan,1996,Sacramento-SMF,SouthWest,2007-1-22
4,2007,4,9,1,2143.0,2140,2252.0,2255,WN,541,...,BOEING,05/16/1996,737-3H4,Valid,Fixed Wing Multi-Engine,Turbo-Fan,1996,Sacramento-SMF,SouthWest,2007-4-9


The following columns will have to be removed in order to avoid Data Leakage:
- ArrTime, DepTime, ActualElapsedTime, AirTime, ArrDelay, DepDelay, TaxiIn, TaxiOut
- 'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
- 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'

The following columns are duplicates of other columns:
- OriginCityIata

The following columns should be removed as they carry no info for a linear model:
- 'OriginLat', 'OriginLong', 'DestLat','DestLong', 'CRSDepTime', 'CRSArrTime'

In [9]:
#df = df.drop(columns=['ArrTime', 'DepTime', 'ActualElapsedTime', 'AirTime',
#                 'DepDelay', 'TaxiIn', 'TaxiOut','Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
#                 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'OriginCityIata', 
#                  'OriginLat', 'OriginLong', 'DestLat','DestLong', 'CRSDepTime', 'CRSArrTime'])

Feature Engineering

In [10]:
df.PlaneIssueDate.head()

0    05/16/1996
1    05/16/1996
2    05/16/1996
3    05/16/1996
4    05/16/1996
Name: PlaneIssueDate, dtype: object

In [11]:
df['PlaneIssueDay'] = df['PlaneIssueDate'].str[:2]
df['PlaneIssueMonth'] = df['PlaneIssueDate'].str[3:5]
df['PlaneIssueYear'] = df['PlaneIssueDate'].str[6:10]

In [12]:
df = df.drop('PlaneIssueDate', axis=1)

In [13]:
# y = 'ArrDelay' 

In [14]:
#Removing DestCountry because all flights are within the US
len(df['DestCountry'].unique())

1

In [15]:
df = df.drop('DestCountry', axis=1)

We have dropped all columns that should be dropped at this point. Next: Nulls

In [16]:
#Checking for nulls in each column:
df.isnull().sum()

Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime                    0
CRSDepTime                 0
ArrTime                    0
CRSArrTime                 0
Carrier                    0
FlightNum                  0
TailNum                    0
ActualElapsedTime          0
CRSElapsedTime             0
AirTime                    0
ArrDelay                   0
DepDelay                   0
Origin                     0
Dest                       0
Distance                   0
TaxiIn                     0
TaxiOut                    0
Cancelled                  0
CancellationCode     6808769
Diverted                   0
CarrierDelay               0
WeatherDelay               0
NASDelay                   0
SecurityDelay              0
LateAircraftDelay          0
OriginAirport              0
OriginCity              4891
OriginState             4891
OriginCountry              0
OriginLat                  0
OriginLong    

In [17]:
df[df.OriginCity.isnull()]['Origin'].unique()
#df[df.DestCity.isnull()]['Dest'].unique()

array(['CLD', 'MQT', 'SCE', 'HHH'], dtype=object)

In [18]:
df.DestCity.loc[df.Dest == 'CLD'] = 'Carlsbad'
df.DestCity.loc[df.Dest == 'MQT'] = 'Sawyer'
df.DestCity.loc[df.Dest == 'SCE'] = 'University Park'
df.DestCity.loc[df.Dest == 'HHH'] = 'Hilton Head'
df.DestState.loc[df.Dest == 'CLD'] = 'CA'
df.DestState.loc[df.Dest == 'MQT'] = 'MI'
df.DestState.loc[df.Dest == 'SCE'] = 'PA'
df.DestState.loc[df.Dest == 'HHH'] = 'SC'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [19]:
df.OriginCity.loc[df.Origin == 'CLD'] = 'Carlsbad'
df.OriginCity.loc[df.Origin == 'MQT'] = 'Sawyer'
df.OriginCity.loc[df.Origin == 'SCE'] = 'University Park'
df.OriginCity.loc[df.Origin == 'HHH'] = 'Hilton Head'
df.OriginState.loc[df.Origin == 'CLD'] = 'CA'
df.OriginState.loc[df.Origin == 'MQT'] = 'MI'
df.OriginState.loc[df.Origin == 'SCE'] = 'PA'
df.OriginState.loc[df.Origin == 'HHH'] = 'SC'

In [20]:
df.isnull().sum()

Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime                    0
CRSDepTime                 0
ArrTime                    0
CRSArrTime                 0
Carrier                    0
FlightNum                  0
TailNum                    0
ActualElapsedTime          0
CRSElapsedTime             0
AirTime                    0
ArrDelay                   0
DepDelay                   0
Origin                     0
Dest                       0
Distance                   0
TaxiIn                     0
TaxiOut                    0
Cancelled                  0
CancellationCode     6808769
Diverted                   0
CarrierDelay               0
WeatherDelay               0
NASDelay                   0
SecurityDelay              0
LateAircraftDelay          0
OriginAirport              0
OriginCity                 0
OriginState                0
OriginCountry              0
OriginLat                  0
OriginLong    

In [21]:
#Replacing nulls in unrecognized planes with unknowns 
df[['PlaneOwnership', 'Manufacturer', 'PlaneModel', 'PlaneStatus', 'AircraftType', 'Engine']] = df[['PlaneOwnership', 'Manufacturer', 'PlaneModel', 'PlaneStatus', 'AircraftType', 'Engine']].fillna(value = 'Unknown')
#Replacing nulls with median
df[['PlaneYear','PlaneIssueDay','PlaneIssueMonth','PlaneIssueYear']] = df[['PlaneYear','PlaneIssueDay','PlaneIssueMonth','PlaneIssueYear']].fillna(value=2000)

Working with Data Types

In [22]:
df.dtypes

Year                   int64
Month                  int64
DayofMonth             int64
DayOfWeek              int64
DepTime              float64
CRSDepTime             int64
ArrTime              float64
CRSArrTime             int64
Carrier               object
FlightNum              int64
TailNum               object
ActualElapsedTime    float64
CRSElapsedTime       float64
AirTime              float64
ArrDelay             float64
DepDelay             float64
Origin                object
Dest                  object
Distance               int64
TaxiIn                 int64
TaxiOut                int64
Cancelled              int64
CancellationCode      object
Diverted               int64
CarrierDelay           int64
WeatherDelay           int64
NASDelay               int64
SecurityDelay          int64
LateAircraftDelay      int64
OriginAirport         object
OriginCity            object
OriginState           object
OriginCountry         object
OriginLat            float64
OriginLong    

In [23]:
#Even though Plane Issue Day, Month and Year are numeric attributes only the Year can be said to be on a numeric scale

In [24]:
#mean = df[df['PlaneIssueYear']!='']['PlaneIssueYear'].mean()

In [25]:
df['PlaneIssueYear'] = df['PlaneIssueYear'].replace('',2000)

In [26]:
df.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'Carrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'OriginAirport', 'OriginCity', 'OriginState', 'OriginCountry',
       'OriginLat', 'OriginLong', 'DestAirport', 'DestCity', 'DestState',
       'DestLat', 'DestLong', 'PlaneOwnership', 'Manufacturer', 'PlaneModel',
       'PlaneStatus', 'AircraftType', 'Engine', 'PlaneYear', 'OriginCityIata',
       'CarrierName', 'FlightTimeStamp', 'PlaneIssueDay', 'PlaneIssueMonth',
       'PlaneIssueYear'],
      dtype='object')

In [27]:
#We would like to one hot encode categorical variables but doing so directly would introduce too much sparsity

In [28]:
#top_dest = df[['DestAirport','ArrDelay']].groupby('ArrDelay').sum()['ArrDelay'].nlargest(10).reset_index()

In [29]:
#Looking for Columns to one hot encode
print('Candidates for One Hot Encoding:')
print('Unique Carriers: ', len(df.Carrier.unique()))
print('Unique Origin Airports: ', len(df.Origin.unique()))
print('Unique Destination Airports: ', len(df.Dest.unique()))
print('Plane Manufacturer: ', len(df.Manufacturer.unique()))
print('Unique Plane Models: ', len(df.PlaneModel.unique()))
print('Unique Plane Status: ', len(df.PlaneStatus.unique()))
print('Unique Aircraft Type: ', len(df.AircraftType.unique()))

Candidates for One Hot Encoding:
Unique Carriers:  19
Unique Origin Airports:  294
Unique Destination Airports:  294
Plane Manufacturer:  36
Unique Plane Models:  162
Unique Plane Status:  3
Unique Aircraft Type:  5


In [None]:
df.shape

In [None]:
#Finally, one hot encoding categorical variables:
#pd.get_dummies(df, columns=['Carrier', ''])

In [None]:
#Plotting distribution of delays
ax = sns.distplot(df['ArrDelay']).set_title('Delays Distribution')
plt.show()