In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

# Overview

Our dataset for this project is a collection of all domestic flights in the United States from the year 2008. It contains a little over 7 million flights with 29 columns describing the date, time, flight time, carrier, locations, and delay time. What we want to do here is predict the total delay time based on the other columns in our dataset, and determine if a flight will be over 30 minutes late.

In [2]:
# Grab and process the raw data.
# The path uses a forward slash so it resolves on Windows as well as on
# Linux/macOS; a backslash-separated literal is only treated as a directory
# separator on Windows.
flights = pd.read_csv('data/flights_2008.csv')

# Preview the first ten rows to sanity-check the load.
flights.head(10)

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2008,1,3,4,2003.0,1955,2211.0,2225,WN,335,...,4.0,8.0,0,,0,,,,,
1,2008,1,3,4,754.0,735,1002.0,1000,WN,3231,...,5.0,10.0,0,,0,,,,,
2,2008,1,3,4,628.0,620,804.0,750,WN,448,...,3.0,17.0,0,,0,,,,,
3,2008,1,3,4,926.0,930,1054.0,1100,WN,1746,...,3.0,7.0,0,,0,,,,,
4,2008,1,3,4,1829.0,1755,1959.0,1925,WN,3920,...,3.0,10.0,0,,0,2.0,0.0,0.0,0.0,32.0
5,2008,1,3,4,1940.0,1915,2121.0,2110,WN,378,...,4.0,10.0,0,,0,,,,,
6,2008,1,3,4,1937.0,1830,2037.0,1940,WN,509,...,3.0,7.0,0,,0,10.0,0.0,0.0,0.0,47.0
7,2008,1,3,4,1039.0,1040,1132.0,1150,WN,535,...,7.0,7.0,0,,0,,,,,
8,2008,1,3,4,617.0,615,652.0,650,WN,11,...,6.0,19.0,0,,0,,,,,
9,2008,1,3,4,1620.0,1620,1639.0,1655,WN,810,...,3.0,6.0,0,,0,,,,,


In [3]:
# Report the dimensions of the dataset as a (rows, columns) tuple.
row_count, column_count = flights.shape
(row_count, column_count)

(7009728, 29)

# Data Cleaning

Our dataset has a few important issues that will need to be addressed before we construct our model. 
1. All of our observations are from the same year, so we don't need the year column. It would only be useful if our objective were to study airline arrivals across multiple years.
2. Our cancellation and delay columns contain many NaN values. Since NaN most likely indicates that the value does not apply to that flight, we will replace these with 0s.
3. On the topic of delays, we are only interested in the total amount of time delayed, so we will be combining all five of our delay columns into one, and placing it in a series of its own as our target variable.
4. Our categorical variables (unique carrier, tail number, origin, and destination) could be converted to dummy variables, but because of the number of unique values this would introduce a very large number of features to our model. Eliminating them and focusing on our numerical columns will give us a much faster result.

In [4]:
# The five per-cause delay columns; their row-wise sum is the target variable.
delay_columns = ['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']

# Year (identical for every row) plus the categorical/text columns we are not modelling.
unused_columns = ['Year', 'UniqueCarrier', 'TailNum', 'Origin', 'Dest', 'CancellationCode']

# Create the target variable, total delay from all delay columns.
# Columns are selected by name rather than by position so that re-running this
# cell on an already-cleaned frame raises a KeyError instead of silently
# producing an all-zero target. sum() skips NaN by default, so a missing delay
# component contributes 0 to the total.
Y = flights[delay_columns].sum(axis=1)

# Remove the delay, year and categorical columns from the feature frame, then
# replace every remaining NaN with 0. A new frame is bound to `flights` rather
# than mutating in place.
flights = flights.drop(delay_columns + unused_columns, axis=1).fillna(0)

In [5]:
# Preview the first five rows of the current `flights` frame.
flights.head(n=5)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,CRSElapsedTime,AirTime,ArrDelay,DepDelay,Distance,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted
0,1,3,4,2003.0,1955,2211.0,2225,335,128.0,150.0,116.0,-14.0,8.0,810,4.0,8.0,0,0,0
1,1,3,4,754.0,735,1002.0,1000,3231,128.0,145.0,113.0,2.0,19.0,810,5.0,10.0,0,0,0
2,1,3,4,628.0,620,804.0,750,448,96.0,90.0,76.0,14.0,8.0,515,3.0,17.0,0,0,0
3,1,3,4,926.0,930,1054.0,1100,1746,88.0,90.0,78.0,-6.0,-4.0,515,3.0,7.0,0,0,0
4,1,3,4,1829.0,1755,1959.0,1925,3920,90.0,90.0,77.0,34.0,34.0,515,3.0,10.0,0,0,0


In [6]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Features: the DepTime and ArrTime columns (NaN was already replaced with 0
# during cleaning).
X = flights[['DepTime', 'ArrTime']]

# Logistic regression is a classifier, so it needs a discrete label. Y holds the
# total delay per flight; the stated goal is to decide whether a flight is more
# than 30 minutes late, so the target is binarised here (1 = late, 0 = not late).
# Fitting on the raw totals would instead treat each distinct delay value as its
# own class.
late_threshold = 30
is_late = (Y > late_threshold).astype(int)

# C is the inverse regularisation strength; a very large value effectively
# switches regularisation off.
lr = LogisticRegression(C = 1e9)
fit = lr.fit(X, is_late)

print('Coefficients')
print(fit.coef_)
print(fit.intercept_)

# In-sample predictions of the late / not-late label.
pred_y_sklearn = lr.predict(X)

In [10]:
# List the dtype of every column in the current `flights` frame.
column_dtypes = flights.dtypes
column_dtypes

Month                  int64
DayofMonth             int64
DayOfWeek              int64
DepTime              float64
CRSDepTime             int64
ArrTime              float64
CRSArrTime             int64
FlightNum              int64
ActualElapsedTime    float64
CRSElapsedTime       float64
AirTime              float64
ArrDelay             float64
DepDelay             float64
Distance               int64
TaxiIn               float64
TaxiOut              float64
Cancelled              int64
CancellationCode      object
Diverted               int64
dtype: object

# Source

Flight Data
http://stat-computing.org/dataexpo/2009/the-data.html 