Airline data sets for 2007 and 2008 was downloaded from  http://stat-computing.org/dataexpo/2009/the-data.html 

These datasets have 29 variables:
Year, Month, DayofMonth, DayOfWeek, DepTime, CRSDepTime, ArrTime, CRSArrTime, UniqueCarrier, FlightNum, TailNum, 
ActualElapsedTime, CRSElapsedTime, AirTime, ArrDelay, DepDelay, Origin, Dest, Distance, TaxiIn, TaxiOut,
Cancelled, CancellationCode, Diverted, CarrierDelay, WeatherDelay, NASDelay, SecurityDelay, LateAircraftDelay



In [1]:
# Importing libraries and the data set
from pandas import Series, DataFrame
import pandas as pd
import glob
import numpy as np
import os
import matplotlib.pylab as plt
%matplotlib inline 
plt.rcParams['figure.figsize'] = 12, 4  # that's default image size for this interactive session
import scipy
from scipy import stats
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.metrics import confusion_matrix,  precision_recall_fscore_support, accuracy_score
from sklearn.preprocessing import Binarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler, OneHotEncoder
scaler = StandardScaler()
from ggplot import *
from sklearn import metrics

### Check data volumes

In [2]:
flt_2007 = pd.read_csv("2007.csv")
flt_2007.shape

(7453215, 29)

In [3]:
flt_2008 = pd.read_csv("2008.csv")
flt_2008.shape

(7009728, 29)

In [3]:
ORD2007 = pd.read_csv("ORD2007.csv")
ORD2007.shape

(359169, 31)

In [4]:
ORD2008 = pd.read_csv("ORD2008.csv")
ORD2008.shape

  interactivity=interactivity, compiler=compiler, result=result)


(335440, 31)

### Prediction Model Using Month

In [15]:
data_2007 = pd.read_csv("ORD2007.csv")
data_2008 = pd.read_csv("ORD2008.csv")

# Create training set and test set
cols = ['Month']
train_y = data_2007['DepDelay']  >= 15
train_x = data_2007[cols]

test_y = data_2008['DepDelay']  >= 15
test_x = data_2008[cols]

print train_x.shape

(359169, 1)


In [10]:
# Create logistic regression model with L2 regularization
clf_lr = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
clf_lr.fit(train_x, train_y)

# Predict output labels on test set
pr = clf_lr.predict(test_x)

# display evaluation metrics
cm = confusion_matrix(test_y, pr)
print("Confusion matrix")
print(pd.DataFrame(cm))
report_lr = precision_recall_fscore_support(list(test_y), list(pr), average='binary')
print "\nprecision = %0.2f, recall = %0.2f, F1 = %0.2f, accuracy = %0.2f\n" % \
        (report_lr[0], report_lr[1], report_lr[2], accuracy_score(list(test_y), list(pr)))

Confusion matrix
        0       1
0  127960  111968
1   38797   56715

precision = 0.34, recall = 0.59, F1 = 0.43, accuracy = 0.55



### Prediction Model Using Month, Day of Month

In [2]:
#cols = ['DepDelay', 'Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'UniqueCarrier', 'Dest']
data_2007 = pd.read_csv("ORD2007.csv")
data_2008 = pd.read_csv("ORD2008.csv")

# Create training set and test set
cols = ['Month', 'DayofMonth']
train_y = data_2007['DepDelay']  >= 15
train_x = data_2007[cols]

test_y = data_2008['DepDelay']  >= 15
test_x = data_2008[cols]

print train_x.shape

(359169, 2)


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Create logistic regression model with L2 regularization
clf_lr = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
clf_lr.fit(train_x, train_y)

# Predict output labels on test set
pr = clf_lr.predict(test_x)

# display evaluation metrics
cm = confusion_matrix(test_y, pr)
print("Confusion matrix")
print(pd.DataFrame(cm))
report_lr = precision_recall_fscore_support(list(test_y), list(pr), average='binary')
print "\nprecision = %0.2f, recall = %0.2f, F1 = %0.2f, accuracy = %0.2f\n" % \
        (report_lr[0], report_lr[1], report_lr[2], accuracy_score(list(test_y), list(pr)))

Confusion matrix
        0       1
0  129131  110797
1   39482   56030

precision = 0.34, recall = 0.59, F1 = 0.43, accuracy = 0.55



### Prediction Model Using Month, Day of Month, Day of Week

In [4]:
#cols = ['DepDelay', 'Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'UniqueCarrier', 'Dest']
data_2007 = pd.read_csv("ORD2007.csv")
data_2008 = pd.read_csv("ORD2008.csv")

# Create training set and test set
cols = ['Month', 'DayofMonth','DayOfWeek']
train_y = data_2007['DepDelay']  >= 15
train_x = data_2007[cols]

test_y = data_2008['DepDelay']  >= 15
test_x = data_2008[cols]

print train_x.shape

(359169, 3)


In [5]:
# Create logistic regression model with L2 regularization
clf_lr = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
clf_lr.fit(train_x, train_y)

# Predict output labels on test set
pr = clf_lr.predict(test_x)

# display evaluation metrics
cm = confusion_matrix(test_y, pr)
print("Confusion matrix")
print(pd.DataFrame(cm))
report_lr = precision_recall_fscore_support(list(test_y), list(pr), average='binary')
print "\nprecision = %0.2f, recall = %0.2f, F1 = %0.2f, accuracy = %0.2f\n" % \
        (report_lr[0], report_lr[1], report_lr[2], accuracy_score(list(test_y), list(pr)))

Confusion matrix
        0       1
0  127526  112402
1   41586   53926

precision = 0.32, recall = 0.56, F1 = 0.41, accuracy = 0.54



### Prediction Model Using Month, Day of Month, Day of Week, Departure Hour

In [6]:
#cols = ['DepDelay', 'Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'UniqueCarrier', 'Dest']
data_2007 = pd.read_csv("ORD2007.csv")
data_2008 = pd.read_csv("ORD2008.csv")

# Create training set and test set
cols = ['Month', 'DayofMonth','DayOfWeek','Dep_Hour']
train_y = data_2007['DepDelay']  >= 15
train_x = data_2007[cols]

test_y = data_2008['DepDelay']  >= 15
test_x = data_2008[cols]

print train_x.shape

(359169, 4)


In [7]:
# Create logistic regression model with L2 regularization
clf_lr = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
clf_lr.fit(train_x, train_y)

# Predict output labels on test set
pr = clf_lr.predict(test_x)

# display evaluation metrics
cm = confusion_matrix(test_y, pr)
print("Confusion matrix")
print(pd.DataFrame(cm))
report_lr = precision_recall_fscore_support(list(test_y), list(pr), average='binary')
print "\nprecision = %0.2f, recall = %0.2f, F1 = %0.2f, accuracy = %0.2f\n" % \
        (report_lr[0], report_lr[1], report_lr[2], accuracy_score(list(test_y), list(pr)))

Confusion matrix
        0      1
0  149204  90724
1   34452  61060

precision = 0.40, recall = 0.64, F1 = 0.49, accuracy = 0.63



### Prediction Model Using Month, Day of Month, Day of Week, Departure Hour, Distance

In [8]:
#cols = ['DepDelay', 'Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'UniqueCarrier', 'Dest']
data_2007 = pd.read_csv("ORD2007.csv")
data_2008 = pd.read_csv("ORD2008.csv")

# Create training set and test set
cols = ['Month', 'DayofMonth','DayOfWeek','Dep_Hour', 'Distance']
train_y = data_2007['DepDelay']  >= 15
train_x = data_2007[cols]

test_y = data_2008['DepDelay']  >= 15
test_x = data_2008[cols]

print train_x.shape

(359169, 5)


In [9]:
# Create logistic regression model with L2 regularization
clf_lr = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
clf_lr.fit(train_x, train_y)

# Predict output labels on test set
pr = clf_lr.predict(test_x)

# display evaluation metrics
cm = confusion_matrix(test_y, pr)
print("Confusion matrix")
print(pd.DataFrame(cm))
report_lr = precision_recall_fscore_support(list(test_y), list(pr), average='binary')
print "\nprecision = %0.2f, recall = %0.2f, F1 = %0.2f, accuracy = %0.2f\n" % \
        (report_lr[0], report_lr[1], report_lr[2], accuracy_score(list(test_y), list(pr)))

Confusion matrix
        0      1
0  149976  89952
1   34662  60850

precision = 0.40, recall = 0.64, F1 = 0.49, accuracy = 0.63



### Prediction Model Using Month, Day of Month, Day of Week, Departure Hour, Distance, Carrier

In [17]:
#cols = ['DepDelay', 'Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'UniqueCarrier', 'Dest']
data_2007 = pd.read_csv("ORD2007.csv")
data_2008 = pd.read_csv("ORD2008.csv")

# Create training set and test set
cols = ['Month', 'DayofMonth','DayOfWeek','Dep_Hour', 'Distance','UniqueCarrier']
train_y = data_2007['DepDelay']  >= 15
train_x = data_2007[cols]
train_x['UniqueCarrier'] = pd.factorize(train_x['UniqueCarrier'])[0]
trainX_scl = scaler.fit_transform(train_x)

test_y = data_2008['DepDelay']  >= 15
test_x = data_2008[cols]
test_x['UniqueCarrier'] = pd.factorize(test_x['UniqueCarrier'])[0]
testX_scl = scaler.fit_transform(test_x)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [18]:
# Create logistic regression model with L2 regularization
clf_lr = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
clf_lr.fit(trainX_scl, train_y)

# Predict output labels on test set
pr = clf_lr.predict(testX_scl)

# display evaluation metrics
cm = confusion_matrix(test_y, pr)
print("Confusion matrix")
print(pd.DataFrame(cm))
report_lr = precision_recall_fscore_support(list(test_y), list(pr), average='binary')
print "\nprecision = %0.2f, recall = %0.2f, F1 = %0.2f, accuracy = %0.2f\n" % \
        (report_lr[0], report_lr[1], report_lr[2], accuracy_score(list(test_y), list(pr)))

Confusion matrix
        0      1
0  149354  90574
1   34465  61047

precision = 0.40, recall = 0.64, F1 = 0.49, accuracy = 0.63



### Prediction Model Using Month, Day of Month, Day of Week, Departure Hour, Distance, Carrier and Destination

In [25]:
#cols = ['DepDelay', 'Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'UniqueCarrier', 'Dest']
data_2007 = pd.read_csv("ORD2007.csv")
data_2008 = pd.read_csv("ORD2008.csv")

# Create training set and test set
cols = ['Month', 'DayofMonth','DayOfWeek','Dep_Hour', 'Distance','UniqueCarrier', 'Dest']
train_y = data_2007['DepDelay']  >= 15
train_x = data_2007[cols]
train_x['UniqueCarrier'] = pd.factorize(train_x['UniqueCarrier'])[0]
train_x['Dest'] = pd.factorize(train_x['Dest'])[0]
trainX_scl = scaler.fit_transform(train_x)

test_y = data_2008['DepDelay']  >= 15
test_x = data_2008[cols]
test_x['UniqueCarrier'] = pd.factorize(test_x['UniqueCarrier'])[0]
test_x['Dest'] = pd.factorize(test_x['Dest'])[0]
testX_scl = scaler.fit_transform(test_x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [26]:
# Create logistic regression model with L2 regularization
clf_lr = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
clf_lr.fit(trainX_scl, train_y)

# Predict output labels on test set
pr = clf_lr.predict(testX_scl)

# display evaluation metrics
cm = confusion_matrix(test_y, pr)
print("Confusion matrix")
print(pd.DataFrame(cm))
report_lr = precision_recall_fscore_support(list(test_y), list(pr), average='binary')
print "\nprecision = %0.2f, recall = %0.2f, F1 = %0.2f, accuracy = %0.2f\n" % \
        (report_lr[0], report_lr[1], report_lr[2], accuracy_score(list(test_y), list(pr)))

Confusion matrix
        0      1
0  149256  90672
1   34377  61135

precision = 0.40, recall = 0.64, F1 = 0.49, accuracy = 0.63

