### Baseline Model

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# List the files in the working directory (shell escape) to see which CSVs are available.
!ls

2007.csv             baseline_model.ipynb df.csv
airports.csv         carriers.csv         planedata.csv


In [None]:
# Load the prepared flights table. 'Unnamed: 0' is discarded right away
# (presumably a row index that was saved along with the CSV — confirm).
df = (
    pd.read_csv('df.csv')
    .drop(columns=['Unnamed: 0'])
)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

Removing Data Leakage Attributes

In [None]:
# Inspect the column names currently present in the frame.
df.columns

In [None]:
# Remove the 'TailNum.1' column (presumably a duplicate of TailNum — confirm).
# `df` is rebound instead of mutated with inplace=True, matching how the other
# cells in this notebook drop columns.
df = df.drop(columns=['TailNum.1'])

In [None]:
# Correct the misspelled header: '...Data' -> '...Date'.
column_renames = {'PlaneIssueData': 'PlaneIssueDate'}
df = df.rename(columns=column_renames)

In [None]:
# Preview the first rows of the frame after the rename.
df.head()

The following columns are only known once a flight has taken place, so they must be removed from the features in order to avoid data leakage:
- ArrTime, DepTime, ActualElapsedTime, AirTime, ArrDelay (kept only as the target `y`), DepDelay, TaxiIn, TaxiOut
- Cancelled, CancellationCode, Diverted, CarrierDelay
- WeatherDelay, NASDelay, SecurityDelay, LateAircraftDelay

The following columns are duplicates of other columns:
- OriginCityIata

The following columns should be removed as, in their raw form, they carry little usable information for a linear model:
- OriginLat, OriginLong, DestLat, DestLong, CRSDepTime, CRSArrTime

In [None]:
# NOTE(review): this drop is commented out, so none of the columns listed
# here are actually removed from `df` by this cell. 'ArrDelay' is absent from
# the list, presumably because it is the prediction target — confirm before
# re-enabling.
#df = df.drop(columns=['ArrTime', 'DepTime', 'ActualElapsedTime', 'AirTime',
#                 'DepDelay', 'TaxiIn', 'TaxiOut','Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
#                 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'OriginCityIata', 
#                  'OriginLat', 'OriginLong', 'DestLat','DestLong', 'CRSDepTime', 'CRSArrTime'])

Feature Engineering

In [None]:
# Look at a few raw issue-date strings to see their layout before slicing them.
df['PlaneIssueDate'].head()

In [None]:
# Split the issue-date string into three fixed-width character spans.
# NOTE(review): the positions assume a DD?MM?YYYY layout; if the source uses
# MM/DD/YYYY the day and month columns are swapped — confirm against the data.
issue_date_spans = [
    ('PlaneIssueDay', 0, 2),
    ('PlaneIssueMonth', 3, 5),
    ('PlaneIssueYear', 6, 10),
]
for part_name, start, stop in issue_date_spans:
    df[part_name] = df['PlaneIssueDate'].str.slice(start, stop)

In [None]:
# The raw date string is no longer needed once its parts have been extracted.
df = df.drop(columns=['PlaneIssueDate'])

In [None]:
# y = 'ArrDelay' 

In [None]:
# DestCountry is about to be removed because all flights are within the US;
# show how many distinct values it holds (NaN counted as a value).
df['DestCountry'].nunique(dropna=False)

In [None]:
# Remove the destination-country column.
df = df.drop(columns=['DestCountry'])

We have dropped all columns that should be dropped at this point. Next: Nulls

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Origin airport codes of the rows that have no origin city filled in.
origin_city_missing = df['OriginCity'].isna()
df.loc[origin_city_missing, 'Origin'].unique()
# Destination counterpart, kept for reference:
#df.loc[df['DestCity'].isna(), 'Dest'].unique()

In [None]:
# Hard-coded city/state for four destination airport codes:
# code -> (city, state).
dest_airport_locations = {
    'CLD': ('Carlsbad', 'CA'),
    'MQT': ('Sawyer', 'MI'),
    'SCE': ('University Park', 'PA'),
    'HHH': ('Hilton Head', 'SC'),
}

# Assign with a single df.loc[rows, column] call. Chained indexing
# (df.DestCity.loc[...] = ...) writes through an intermediate Series, which
# triggers chained-assignment warnings and leaves `df` untouched when pandas
# Copy-on-Write is active.
for airport_code, (city, state) in dest_airport_locations.items():
    is_airport = df['Dest'] == airport_code
    df.loc[is_airport, 'DestCity'] = city
    df.loc[is_airport, 'DestState'] = state

In [None]:
# Hard-coded city/state for four origin airport codes:
# code -> (city, state).
origin_airport_locations = {
    'CLD': ('Carlsbad', 'CA'),
    'MQT': ('Sawyer', 'MI'),
    'SCE': ('University Park', 'PA'),
    'HHH': ('Hilton Head', 'SC'),
}

# Assign with a single df.loc[rows, column] call. Chained indexing
# (df.OriginCity.loc[...] = ...) writes through an intermediate Series, which
# triggers chained-assignment warnings and leaves `df` untouched when pandas
# Copy-on-Write is active.
for airport_code, (city, state) in origin_airport_locations.items():
    is_airport = df['Origin'] == airport_code
    df.loc[is_airport, 'OriginCity'] = city
    df.loc[is_airport, 'OriginState'] = state

In [None]:
# Missing values per column, re-checked after the city/state fills.
df.isna().sum()

In [None]:
# Plane attributes with no value are labelled 'Unknown'.
plane_label_cols = ['PlaneOwnership', 'Manufacturer', 'PlaneModel', 'PlaneStatus', 'AircraftType', 'Engine']
df[plane_label_cols] = df[plane_label_cols].fillna('Unknown')

# Missing plane year / issue-date parts are filled with the constant 2000
# (a fixed placeholder, not a computed median).
# NOTE(review): 2000 is also written into PlaneIssueDay and PlaneIssueMonth,
# which looks unintended for day/month fields — confirm.
plane_date_cols = ['PlaneYear', 'PlaneIssueDay', 'PlaneIssueMonth', 'PlaneIssueYear']
df[plane_date_cols] = df[plane_date_cols].fillna(2000)

Working with Data Types

In [None]:
# Data type of every column.
df.dtypes

In [None]:
# Even though Plane Issue Day, Month and Year are numeric attributes, only the Year can be said to be on a numeric scale.

In [None]:
#mean = df[df['PlaneIssueYear']!='']['PlaneIssueYear'].mean()

In [None]:
# Turn PlaneIssueYear into a true integer column.
# The column holds sliced strings (possibly empty) mixed with the integer
# placeholder written by the null fill, so a plain replace of '' would leave
# it as a mixed str/int object column that cannot be used numerically.
# Empty or unparseable entries fall back to the same default year (2000).
DEFAULT_PLANE_ISSUE_YEAR = 2000
df['PlaneIssueYear'] = (
    pd.to_numeric(df['PlaneIssueYear'], errors='coerce')
    .fillna(DEFAULT_PLANE_ISSUE_YEAR)
    .astype(int)
)

In [None]:
# Column names at this stage of the notebook.
df.columns

In [None]:
# We would like to one-hot encode the categorical variables, but doing so directly would introduce too much sparsity.

In [None]:
#top_dest = df[['DestAirport','ArrDelay']].groupby('ArrDelay').sum()['ArrDelay'].nlargest(10).reset_index()

In [None]:
# Report how many distinct values each categorical column has, to judge
# which ones are reasonable to one hot encode.
print('Candidates for One Hot Encoding:')
encoding_candidates = [
    ('Unique Carriers: ', 'Carrier'),
    ('Unique Origin Airports: ', 'Origin'),
    ('Unique Destination Airports: ', 'Dest'),
    ('Plane Manufacturer: ', 'Manufacturer'),
    ('Unique Plane Models: ', 'PlaneModel'),
    ('Unique Plane Status: ', 'PlaneStatus'),
    ('Unique Aircraft Type: ', 'AircraftType'),
]
for label, column_name in encoding_candidates:
    # dropna=False counts NaN as a value, exactly like len(series.unique()).
    print(label, df[column_name].nunique(dropna=False))

In [None]:
# (rows, columns) of the frame at this point.
df.shape

In [None]:
#Finally, one hot encoding categorical variables:
#pd.get_dummies(df, columns=['Carrier', ''])

In [None]:
# Plot the distribution of arrival delays.
# `ax` is the Axes the plot is drawn on (not the Text object returned by
# set_title), so it can be used for further labelling.
# NOTE: sns.distplot is deprecated since seaborn 0.11; once the environment's
# seaborn version is confirmed, sns.histplot(..., kde=True, stat='density')
# is the documented replacement.
fig, ax = plt.subplots()
sns.distplot(df['ArrDelay'], ax=ax)
ax.set_title('Delays Distribution')
ax.set_ylabel('Density')
plt.show()