In [27]:
import pandas as pd

# Load the bz2-compressed CSV for 2008 into a DataFrame.
raw_data = pd.read_csv("data/2008.csv.bz2")


In [28]:
# Work on `df` from here on; `raw_data` keeps pointing at the loaded frame.
df = raw_data
display(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
df.info()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,...,4.0,8.0,0,,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,...,5.0,10.0,0,,0,,,,,
2,2008,1,3,4,628.0,620,804.0,750,WN,448,...,3.0,17.0,0,,0,,,,,
3,2008,1,3,4,926.0,930,1054.0,1100,WN,1746,...,3.0,7.0,0,,0,,,,,
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,...,3.0,10.0,0,,0,2.0,0.0,0.0,0.0,32.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7009728 entries, 0 to 7009727
Data columns (total 29 columns):
Year                 int64
Month                int64
DayofMonth           int64
DayOfWeek            int64
DepTime              float64
CRSDepTime           int64
ArrTime              float64
CRSArrTime           int64
UniqueCarrier        object
FlightNum            int64
TailNum              object
ActualElapsedTime    float64
CRSElapsedTime       float64
AirTime              float64
ArrDelay             float64
DepDelay             float64
Origin               object
Dest                 object
Distance             int64
TaxiIn               float64
TaxiOut              float64
Cancelled            int64
CancellationCode     object
Diverted             int64
CarrierDelay         float64
WeatherDelay         float64
NASDelay             float64
SecurityDelay        float64
LateAircraftDelay    float64
dtypes: float64(14), int64(10), object(5)
memory usage: 1.5+ GB


None

# Clean the data

In [29]:
# How many flights were cancelled / diverted? (These get filtered out below.)
cancelled_counts = df['Cancelled'].value_counts()
diverted_counts = df['Diverted'].value_counts()

display(cancelled_counts)
print('-' * 100)
display(diverted_counts)


0    6872294
1     137434
Name: Cancelled, dtype: int64

----------------------------------------------------------------------------------------------------


0    6992463
1      17265
Name: Diverted, dtype: int64

In [30]:
# List the column labels to pick the ones to drop in the next step.
df.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')

In [31]:
# Keep only flights that were neither cancelled nor diverted, then remove the
# three columns describing cancellation/diversion.
# Built from `raw_data` (the frame `df` was bound to) so the cell can be re-run
# without a KeyError on the already-dropped columns.
# `columns=` replaces the positional axis argument, which newer pandas rejects.
completed = (raw_data.Cancelled == 0) & (raw_data.Diverted == 0)
df = raw_data.loc[completed].drop(columns=['Cancelled', 'CancellationCode', 'Diverted'])

# Look at null values

In [32]:
# Per column: does it contain at least one null value?
df.isna().any()

Year                 False
Month                False
DayofMonth           False
DayOfWeek            False
DepTime              False
CRSDepTime           False
ArrTime              False
CRSArrTime           False
UniqueCarrier        False
FlightNum            False
TailNum               True
ActualElapsedTime    False
CRSElapsedTime       False
AirTime              False
ArrDelay             False
DepDelay             False
Origin               False
Dest                 False
Distance             False
TaxiIn               False
TaxiOut              False
CarrierDelay          True
WeatherDelay          True
NASDelay              True
SecurityDelay         True
LateAircraftDelay     True
dtype: bool

In [45]:
# Select every column except the five per-cause delay columns.
# `isin` is a method, so it has to be called (not subscripted), and it must be
# applied to the column labels to produce a column mask for `.loc[:, mask]`.
# NOTE(review): the output shown under this cell comes from a different
# expression (row counts); confirm this column selection is what was intended.
nulls = df.loc[:, ~df.columns.isin(['CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'])]

Year                 5330294
Month                5330294
DayofMonth           5330294
DayOfWeek            5330294
DepTime              5330294
CRSDepTime           5330294
ArrTime              5330294
CRSArrTime           5330294
UniqueCarrier        5330294
FlightNum            5330294
TailNum              5330291
ActualElapsedTime    5330294
CRSElapsedTime       5330294
AirTime              5330294
ArrDelay             5330294
DepDelay             5330294
Origin               5330294
Dest                 5330294
Distance             5330294
TaxiIn               5330294
TaxiOut              5330294
CarrierDelay               0
WeatherDelay               0
NASDelay                   0
SecurityDelay              0
LateAircraftDelay          0
dtype: int64

# Drop null columns

In [42]:
# Preview only: summarise the frame with every null-bearing column removed.
# The result is not assigned, so `df` itself is left unchanged.
df.dropna(axis=1).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6855029 entries, 0 to 7009727
Data columns (total 20 columns):
Year                 int64
Month                int64
DayofMonth           int64
DayOfWeek            int64
DepTime              float64
CRSDepTime           int64
ArrTime              float64
CRSArrTime           int64
UniqueCarrier        object
FlightNum            int64
ActualElapsedTime    float64
CRSElapsedTime       float64
AirTime              float64
ArrDelay             float64
DepDelay             float64
Origin               object
Dest                 object
Distance             int64
TaxiIn               float64
TaxiOut              float64
dtypes: float64(9), int64(8), object(3)
memory usage: 1.4+ GB


# Only use data where arrival plus departure delay is > 30 minutes
    try getting data 

In [None]:
# Try getting 

In [None]:
# Flag flights whose departure + arrival delay adds up to more than 30 minutes
# (1 = delayed, 0 = not delayed).
# Casting the boolean mask directly always yields exactly one integer column;
# get_dummies(..., drop_first=True) returns a DataFrame instead, which is empty
# when only one class is present and whose dtype differs between pandas versions.
df['delayed'] = (df[['DepDelay', 'ArrDelay']].sum(axis=1) > 30).astype(int)

# You can almost see that arrival delays occur a bit more frequently

# Try KNN to predict if a plane will be more than 30 minutes late

In [None]:
from sklearn.model_selection import train_test_split

# Only columns without any null values are kept.
data = df.dropna(axis=1, how='any')

# Features: everything except the target and the object-dtype columns.
# `columns=` replaces the positional axis argument, which newer pandas rejects.
# NOTE(review): DepDelay and ArrDelay remain in X although `delayed` is derived
# from their sum -- this looks like target leakage; confirm whether intended.
X = data.drop(columns=['delayed', 'UniqueCarrier', 'Origin', 'Dest'])
y = data['delayed']

# Hold out a third of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# 3-nearest-neighbours classifier trained on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
neigh = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Mean accuracy on the data the model was fitted on.
train_accuracy = neigh.score(X_train, y_train)
print('train:', train_accuracy)

# Mean accuracy on the held-out split.
test_accuracy = neigh.score(X_test, y_test)
print('test:', test_accuracy)

# Cross-validation here is run on the held-out split, not the training split.
cv_scores = cross_val_score(neigh, X_test, y_test)
print('cross-val:', cv_scores)

# Try KNN regression to predict how many minutes a plane will be delayed, given that it is more than 30 minutes late.

In [None]:
# Rows flagged as delayed, without the object-dtype columns, the per-cause
# delay columns and the flag itself.
# errors='ignore' is needed because Cancelled / CancellationCode / Diverted were
# already removed from `df` by the cleaning step, so a strict drop raises
# KeyError. `columns=` replaces the positional axis argument, which newer
# pandas rejects.
data = df.loc[df.delayed == 1].drop(
    columns=['UniqueCarrier',
             'Origin', 'Dest', 'TailNum', 'CancellationCode',
             "CarrierDelay", "WeatherDelay", "NASDelay",
             "SecurityDelay", "LateAircraftDelay", "Cancelled",
             "Diverted", "delayed"],
    errors='ignore',
)

data.head()

# Convert DepTime and ArrTime to YYYY-MM-DD HH:mm

In [None]:
def _hhmm_to_timestamp(frame, hhmm_col):
    """Build timestamps from Year/Month/DayofMonth plus an hhmm clock column.

    Parameters
    ----------
    frame : DataFrame with `Year`, `Month`, `DayofMonth` and `hhmm_col` columns.
    hhmm_col : name of the column holding the clock time encoded as hhmm
        (e.g. 2003.0 meaning 20:03).

    Returns
    -------
    Series of datetimes aligned with `frame`'s index.
    """
    hhmm = frame[hhmm_col]
    return pd.to_datetime(
        pd.DataFrame(
            {
                'year': frame.Year,
                'month': frame.Month,
                'day': frame.DayofMonth,
                # Split hhmm into its parts; feeding the raw value in as
                # 'minute' would turn 2003 into 2003 minutes past midnight.
                'hour': hhmm // 100,
                'minute': hhmm % 100,
            }
        )
    )


# NOTE(review): DepTime/ArrTime look hhmm-encoded (2003.0, 754.0, ... in the
# head() output) -- confirm against the dataset description.
data['Dep'] = _hhmm_to_timestamp(data, 'DepTime')

# NOTE(review): the arrival time is combined with the same calendar date as the
# departure; flights landing after midnight would need a +1 day correction.
data['Arr'] = _hhmm_to_timestamp(data, 'ArrTime')

data.head()

In [None]:
# Preview of `data` without the raw date/time component columns. The result is
# not assigned, so `data` keeps these columns.
# `columns=` replaces the positional axis argument, which newer pandas rejects.
data.drop(columns=['Year', 'Month', "DayofMonth", "DayOfWeek", "DepTime", "CRSDepTime", "ArrTime", "CRSArrTime"])

In [None]:
# pd.to_datetime() on a DataFrame assembles timestamps from columns named
# year / month / day (plus optional hour, minute, ...); passing the raw
# ['CRSDepTime', 'Year'] columns raises because those keys are missing.
# CRSDepTime looks hhmm-encoded (e.g. 1955, 735 in the head() output), so it is
# split into hour and minute here. NOTE(review): confirm the hhmm encoding.
pd.to_datetime(
    pd.DataFrame(
        {
            'year': data.Year,
            'month': data.Month,
            'day': data.DayofMonth,
            'hour': data.CRSDepTime // 100,
            'minute': data.CRSDepTime % 100,
        }
    )
)

In [None]:
# Print the column dtypes and non-null summary of the regression subset.
data.info()

In [None]:
from sklearn import neighbors
from sklearn.model_selection import cross_val_score, train_test_split

# Features: everything except the two delay columns that make up the target.
# 'Dep' / 'Arr' are datetime columns (when the conversion cell has been run) and
# cannot be fed to a distance-based model, so they are removed as well.
# errors='ignore' keeps the cell working whether or not those columns exist;
# the previously listed 'Date' column is never created in this notebook and
# made a strict drop raise KeyError.
X = data.drop(columns=['ArrDelay', 'DepDelay', 'Date', 'Dep', 'Arr'], errors='ignore')
# Target: total delay in minutes (arrival + departure).
y = data['ArrDelay'] + data['DepDelay']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Build our model.
knn = neighbors.KNeighborsRegressor(n_neighbors=15)

knn.fit(X_train, y_train)

# cross_val_score uses the estimator's default scorer, which for a regressor is
# R^2 rather than accuracy, so the label says R^2.
# Note that the cross-validation is run on the held-out split.
score = cross_val_score(knn, X_test, y_test, cv=30)
print("Unweighted R^2: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))



In [None]:
# Imported here so the cell does not depend on a later cell having been run.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsRegressor

# Flights whose arrival + departure delay exceeds 30 minutes.
data = df[df['ArrDelay'] + df['DepDelay'] > 30]

# Features: drop the target components and the object-dtype columns, then any
# column that still contains nulls (the per-cause delay columns do), because
# the nearest-neighbour model cannot be fitted on missing values.
# `columns=` replaces the positional axis argument, which newer pandas rejects.
X = data.drop(
    columns=['ArrDelay', 'DepDelay', 'UniqueCarrier', 'Origin', 'Dest', 'TailNum']
).dropna(axis=1)
# Target: total delay in minutes.
y = data['ArrDelay'] + data['DepDelay']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

knnreg = KNeighborsRegressor(n_neighbors=2)
knnreg.fit(X_train, y_train)

# Score the regressor fitted above; the earlier classifier `neigh` was trained
# on a different target and must not be evaluated here.
print('train:', knnreg.score(X_train, y_train))
print('test:', knnreg.score(X_test, y_test))
print('cross-val:', cross_val_score(knnreg, X_test, y_test))

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Features: all columns of `df` except the two delay columns.
# `columns=` replaces the positional axis argument, which newer pandas rejects.
X = df.drop(columns=['DepDelay', 'ArrDelay'])
# Target: total delay in minutes (departure + arrival).
y = df[['DepDelay', 'ArrDelay']].sum(axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)


