In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random as rnd

from sklearn.model_selection import cross_val_score

In [2]:
#raw = pd.read_csv('data/2007flightrecords.csv')
raw = pd.read_csv('http://stat-computing.org/dataexpo/2009/2008.csv.bz2')

#Drop all cancelled flights
raw = raw[raw['Cancelled'] == 0]
raw.drop(columns = ['Cancelled'], inplace = True)

raw.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,Distance,TaxiIn,TaxiOut,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,...,810,4.0,8.0,,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,...,810,5.0,10.0,,0,,,,,
2,2008,1,3,4,628.0,620,804.0,750,WN,448,...,515,3.0,17.0,,0,,,,,
3,2008,1,3,4,926.0,930,1054.0,1100,WN,1746,...,515,3.0,7.0,,0,,,,,
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,...,515,3.0,10.0,,0,2.0,0.0,0.0,0.0,32.0


In [3]:
raw_samp = raw.sample(n = 1000)#raw.iloc[:1000] #Take a small sample to test row manipulation

#Year column is about as useful as a bathing suit in a nunery, but I will select my features later.

In [4]:
print(raw_samp.columns)
print(raw_samp['UniqueCarrier'].unique())

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'CancellationCode', 'Diverted', 'CarrierDelay', 'WeatherDelay',
       'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')
['NW' 'AS' 'US' 'OO' 'WN' 'AA' 'XE' 'DL' 'UA' 'EV' 'MQ' 'YV' '9E' 'OH'
 'CO' 'AQ' 'B6' 'F9' 'FL' 'HA']


In [5]:
#Set up outcome. Is it more than 30 minutes late?
#raw_samp[['DepTime', 'CRSDepTime','ArrTime', 'CRSArrTime', 'ArrDelay']]

#Luckily, we have the ArrDelay already
#All we have to do is:
raw_samp['Late'] = np.where(raw_samp['ArrDelay'] >= 30, 1, 0)
#print(raw_samp['Late'])

#Create binary feature for the 'UniqueCarrier' columns
for carrier in raw_samp['UniqueCarrier'].unique():
    raw_samp[str(carrier)] = np.where(raw_samp['UniqueCarrier'] == carrier, 1, 0)
    
print(raw_samp.columns)

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'CancellationCode', 'Diverted', 'CarrierDelay', 'WeatherDelay',
       'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Late', 'NW', 'AS',
       'US', 'OO', 'WN', 'AA', 'XE', 'DL', 'UA', 'EV', 'MQ', 'YV', '9E', 'OH',
       'CO', 'AQ', 'B6', 'F9', 'FL', 'HA'],
      dtype='object')


In [6]:
#Set up features and outcomes for the sample
Y_samp = raw_samp['Late']

carriers = np.asarray(raw_samp['UniqueCarrier'].unique())
print(type(carriers))
X_samp = raw_samp[['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'AS', 'EV',
       'AA', 'OH', 'MQ', 'XE', 'US', 'YV', 'WN', 'OO', 'NW', 'CO', 'UA', 'F9',
       'FL', 'DL', '9E', 'HA', 'B6', 'AQ']]

X_samp.head()

<class 'numpy.ndarray'>


Unnamed: 0,Month,DayofMonth,DayOfWeek,Distance,AS,EV,AA,OH,MQ,XE,...,NW,CO,UA,F9,FL,DL,9E,HA,B6,AQ
1632291,3,27,4,762,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
534495,1,3,4,1449,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4504496,8,8,5,683,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5565149,10,20,1,802,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5981906,11,15,6,344,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
#Lets try KNN
from sklearn.neighbors import KNeighborsClassifier

neighbors = KNeighborsClassifier(n_neighbors = 10)

cross_val_score(neighbors, X_samp, Y_samp, cv = 10)

array([0.89108911, 0.89108911, 0.89108911, 0.89      , 0.89      ,
       0.89      , 0.89      , 0.8989899 , 0.8989899 , 0.8989899 ])

# Training on a random sample of the raw data (n = 1000)

__KNN Scores, n_neighbors = 1__

Even with no optimization and only looking at each point's nearest neighbor, we get halfway decent scores.

[0.8019802 , 0.75247525, 0.81188119, 0.75, 0.77 ,0.81, 0.75, 0.84848485, 0.75757576, 0.83838384]

But we can do a lot better.

__KNN Scores, n_neighbors = 10__

Things get better with a magnitude increase in neighbors. We get better values and higher scores.

[0.87128713, 0.87128713, 0.87128713, 0.87, 0.87, 0.87, 0.87, 0.87878788, 0.87878788, 0.87878788]



In [8]:
#Time to validate the entire dataset.

#Create Carrier cols
for carrier in raw['UniqueCarrier'].unique():
    raw[str(carrier)] = np.where(raw['UniqueCarrier'] == carrier, 1, 0)

#Create Late col
raw['Late'] = np.where(raw['ArrDelay'] >= 30, 1, 0)

In [9]:
print(raw.columns)

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'CancellationCode', 'Diverted', 'CarrierDelay', 'WeatherDelay',
       'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'WN', 'XE', 'YV',
       'OH', 'OO', 'UA', 'US', 'DL', 'EV', 'F9', 'FL', 'HA', 'MQ', 'NW', '9E',
       'AA', 'AQ', 'AS', 'B6', 'CO', 'Late'],
      dtype='object')


In [10]:
X = raw[['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'WN', 'XE', 'YV',
       'OH', 'OO', 'UA', 'US', 'DL', 'EV', 'F9', 'FL', 'HA', 'MQ', 'NW', '9E',
       'AA', 'AQ', 'AS', 'B6', 'CO']]
Y = raw['Late']

In [11]:
#Test, train

i_have_time_to_waste = False

if i_have_time_to_waste == True:
    neighbors_complete = KNeighborsClassifier(n_neighbors = 10)
    cross_val_score(neighbors_complete, X, Y, cv = 10)

array([0.77062119, 0.7342098 , 0.71577638, 0.72477482, 0.54647352,
       0.45114074, 0.50691254, 0.85067568, 0.86127332, 0.7124506 ])

__Entire Data score__
[0.77062119, 0.7342098 , 0.71577638, 0.72477482, 0.54647352, 0.45114074, 0.50691254, 0.85067568, 0.86127332, 0.7124506 ]


Clearly, overfitting is a major issue. Luckily, training on small samples of the data seems to generate a pretty good model.