In [1]:
import the_beginning
from initial_cleaning import initial_cleaning

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn import preprocessing

%matplotlib inline
sns.set_style('white')
pd.options.display.float_format = '{:.3f}'.format
plt.rcParams['figure.figsize'] = (20.0, 10.0)

In [2]:
y2007 = pd.DataFrame(initial_cleaning('2007.csv.bz2'))

   Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  ArrTime  \
0  2007      1           1          1 1232.000        1225 1341.000   
1  2007      1           1          1 1918.000        1905 2043.000   
2  2007      1           1          1 2206.000        2130 2334.000   
3  2007      1           1          1 1230.000        1200 1356.000   
4  2007      1           1          1  831.000         830  957.000   

   CRSArrTime UniqueCarrier  FlightNum        ...         TaxiIn  TaxiOut  \
0        1340            WN       2891        ...              4       11   
1        2035            WN        462        ...              5        6   
2        2300            WN       1229        ...              6        9   
3        1330            WN       1355        ...              3        8   
4        1000            WN       2278        ...              3        9   

   Cancelled  CancellationCode  Diverted  CarrierDelay WeatherDelay NASDelay  \
0          0               NaN

In [3]:
y2007.shape

(7453188, 29)

In [4]:
y2007 = y2007.drop(['CancellationCode'], 1)

In [5]:
y2007 = y2007.dropna()

In [6]:
y2007.shape

(7275261, 28)

In [7]:
y2007['DepTime'].describe()

count   7275261.000
mean       1339.201
std         479.883
min           1.000
25%         930.000
50%        1329.000
75%        1733.000
max        2400.000
Name: DepTime, dtype: float64

In [8]:
y2007['CRSDepTime'].describe()

count   7275261.000
mean       1329.359
std         464.793
min           1.000
25%         926.000
50%        1320.000
75%        1720.000
max        2359.000
Name: CRSDepTime, dtype: float64

In [9]:
# determine if the flight arrived late

y2007['late'] = np.where(y2007['ArrTime'] - y2007['CRSArrTime'] >30, 1, 0)

In [10]:
y2007 = y2007.drop(['Origin', 'Dest', 'TailNum', 'UniqueCarrier'], 1)

In [11]:
# Define the training and test sizes

trainsize = int(y2007.shape[0] / 2)
y2007_test = y2007.iloc[trainsize:, :].copy()
y2007_train = y2007.iloc[:trainsize, :].copy()

# Set up the regression model to predict defaults using all other variables as features.

regr1 = linear_model.LinearRegression()
Y_train = y2007_train['late'].values.reshape(-1, 1)
X_train = y2007_train

Y_test = y2007_test['late'].values.reshape(-1, 1)
X_test = y2007_test

regr1.fit(X_train, Y_train)
print('\nR-squared simple model:')
print(regr1.score(X_train, Y_train))

# Store parameter estimates

origparams = np.append(regr1.coef_, regr1.intercept_)


R-squared simple model:
1.0


In [12]:
Y_test.shape

(3637631, 1)

In [13]:
X_test.shape

(3637631, 25)

In [14]:
print(regr1.score(X_test, Y_test))

print(regr1.score(X_train, Y_train))

1.0
1.0


In [15]:
lass = linear_model.Lasso(alpha=.08)
lassfit = lass.fit(X_train, Y_train)
print('R² for the model with few features:')
print(lass.score(X_train, Y_train))
lassparams = np.append(lassfit.coef_, lassfit.intercept_)
print('\nParameter estimates for the model with few features:')
print(lassparams)

R² for the model with few features:
0.541438213137

Parameter estimates for the model with few features:
[  0.00000000e+00  -0.00000000e+00  -0.00000000e+00   0.00000000e+00
   1.38412690e-04  -8.94075999e-05   3.93619745e-04  -3.88376501e-04
   3.32571298e-06   8.30570872e-04  -3.03629352e-04  -0.00000000e+00
   4.87670850e-03  -2.68760918e-04  -4.38487849e-05   0.00000000e+00
   1.06726353e-03   0.00000000e+00   0.00000000e+00  -0.00000000e+00
  -2.23449513e-04   7.99717371e-04   0.00000000e+00   9.98089199e-04
   8.70504373e-02  -7.67471699e-03]


In [16]:
print(lass.score(X_test, Y_test))

print(lass.score(X_train, Y_train))

0.545525787573
0.541438213137


In [17]:
ridgeregr = linear_model.Ridge(alpha=.57, fit_intercept=False) 
ridgeregr.fit(X_train, Y_train)
print(ridgeregr.score(X_train, Y_train))
ridgeparams = ridgeregr.coef_[0]
print(ridgeparams)


print('\nParameter Estimates for the same predictors for the small model:')
compare = np.column_stack((lassparams, ridgeparams))
prettycompare = np.array2string(
    compare,
    formatter={'float_kind':'{0:.3f}'.format})
print(prettycompare)

0.999999999998
[  6.60616217e-12  -3.49712065e-09  -5.98286883e-10   2.56766683e-09
   3.01532241e-10  -2.03071142e-10   7.83389155e-10  -7.75806100e-10
   7.21953132e-12  -8.40299712e-09   1.66242477e-08  -7.60698492e-09
   2.91327655e-08  -1.96931652e-08  -3.22661173e-11  -7.56322092e-09
  -5.21780745e-09   0.00000000e+00   0.00000000e+00  -6.61360968e-10
  -2.47622955e-09   1.12287743e-09   8.61798000e-09   2.00963613e-09
   9.99998184e-01]

Parameter Estimates for the same predictors for the small model:


ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [18]:
lassparams.shape

(26,)

In [19]:
ridgeparams.shape

(25,)

In [20]:
print(ridgeregr.score(X_test, Y_test))

print(ridgeregr.score(X_train, Y_train))

0.999999999998
0.999999999998


In [21]:
# cross validation scores for 2007 data

from sklearn.model_selection import cross_val_score
cross_val_score(ridgeregr, X_test, Y_test, cv=10)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

**The linear and ridge regression models were both able to account for more than 99% of the variance in the dataset, while the lasso regression model was only able to account for 54%.  The ridge regression model would appear to be the best performer of the 3.  It also appears the ridge and linear regression models are predicting at greater than 99% accuracy.**

In [22]:
y2008 = pd.DataFrame(initial_cleaning('2008.csv.bz2'))

   Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  ArrTime  \
0  2008      1           3          4 2003.000        1955 2211.000   
1  2008      1           3          4  754.000         735 1002.000   
2  2008      1           3          4  628.000         620  804.000   
3  2008      1           3          4  926.000         930 1054.000   
4  2008      1           3          4 1829.000        1755 1959.000   

   CRSArrTime UniqueCarrier  FlightNum        ...         TaxiIn  TaxiOut  \
0        2225            WN        335        ...          4.000    8.000   
1        1000            WN       3231        ...          5.000   10.000   
2         750            WN        448        ...          3.000   17.000   
3        1100            WN       1746        ...          3.000    7.000   
4        1925            WN       3920        ...          3.000   10.000   

   Cancelled  CancellationCode  Diverted  CarrierDelay WeatherDelay NASDelay  \
0          0               NaN

In [23]:
y2008 = y2008.drop(['CancellationCode', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'], 1)

In [24]:
y2008 = y2008.drop(['Origin', 'Dest', 'TailNum', 'UniqueCarrier'], 1)

In [25]:
y2008 = y2008.dropna()

In [26]:
y2008.shape

(6855025, 19)

In [27]:
# determine if the flight arrived late

y2008['late'] = np.where(y2008['ArrTime'] - y2008['CRSArrTime'] >30, 1, 0)

In [28]:
# Define the training and test sizes.

trainsize = int(y2008.shape[0] / 2)
y2008_test = y2008.iloc[trainsize:, :].copy()
y2008_train = y2008.iloc[:trainsize, :].copy()

# Set up the regression model to predict defaults using all other
# variables as features.
regr2 = linear_model.LinearRegression()
Y_train2 = y2008_train['late'].values.reshape(-1, 1)
X_train2 = y2008_train

Y_test2 = y2008_test['late'].values.reshape(-1, 1)
X_test2 = y2008_test

regr2.fit(X_train2, Y_train2)
print('\nR-squared simple model:')
print(regr2.score(X_train2, Y_train2))

#Store the parameter estimates.
newparams = np.append(regr2.coef_, regr2.intercept_)


R-squared simple model:
1.0


In [29]:
lass2 = linear_model.Lasso(alpha=.08)
lassfit2 = lass2.fit(X_train2, Y_train2)
print('R² for the model with few features:')
print(lass2.score(X_train2, Y_train2))
lassparams2 = np.append(lassfit2.coef_, lassfit2.intercept_)
print('\nParameter estimates for the model with few features:')
print(lassparams2)

R² for the model with few features:
0.525377682815

Parameter estimates for the model with few features:
[  0.00000000e+00  -0.00000000e+00  -0.00000000e+00   0.00000000e+00
   1.45391534e-04  -9.59083673e-05   3.83127082e-04  -3.78933082e-04
   2.31461103e-06   1.27514651e-03  -2.62281445e-04  -0.00000000e+00
   5.25481645e-03  -5.54225134e-05  -1.01503924e-04   0.00000000e+00
   9.97119121e-04   0.00000000e+00   0.00000000e+00   7.09211444e-02
  -1.93265264e-02]


In [30]:
print(lass2.score(X_test2, Y_test2))

print(lass2.score(X_train2, Y_train2))

0.528985567939
0.525377682815


In [31]:
ridgeregr2 = linear_model.Ridge(alpha=.57, fit_intercept=False) 
ridgeregr2.fit(X_train2, Y_train2)
print(ridgeregr2.score(X_train2, Y_train2))
ridgeparams2 = ridgeregr2.coef_[0]
print(ridgeparams2)


print('\nParameter Estimates for the same predictors for the small model:')
compare = np.column_stack((lassparams2, ridgeparams2))
prettycompare = np.array2string(
    compare,
    formatter={'float_kind':'{0:.3f}'.format})
print(prettycompare)

0.999999999998
[  3.60645657e-12  -4.25255044e-09  -6.58826997e-10   1.92757174e-09
   3.41935192e-10  -2.36460025e-10   8.04514163e-10  -7.94850627e-10
   5.54514779e-12  -1.29217674e-07   1.82333622e-07  -5.20305550e-08
   1.96359818e-07  -1.85619304e-07  -8.56059485e-11  -5.20244410e-08
  -4.93160615e-08   0.00000000e+00   0.00000000e+00   9.99998045e-01]

Parameter Estimates for the same predictors for the small model:


ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [32]:
print(ridgeregr2.score(X_test2, Y_test2))

print(ridgeregr2.score(X_train2, Y_train2))

0.999999999998
0.999999999998


**In terms of model accuracy and performance, there wasn't much difference in accuracy between the datasets.  The accuracy was slightly lower for lasso regression for the 2008 airline data, however, the difference was nominal.  The ridge regression model predicted >99% for both datasets.**