In [1]:
import the_beginning
from initial_cleaning import initial_cleaning

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn import preprocessing

%matplotlib inline
sns.set_style('white')
pd.options.display.float_format = '{:.3f}'.format
plt.rcParams['figure.figsize'] = (20.0, 10.0)

In [2]:
y2007 = pd.DataFrame(initial_cleaning('2007.csv.bz2'))

   Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  ArrTime  \
0  2007      1           1          1 1232.000        1225 1341.000   
1  2007      1           1          1 1918.000        1905 2043.000   
2  2007      1           1          1 2206.000        2130 2334.000   
3  2007      1           1          1 1230.000        1200 1356.000   
4  2007      1           1          1  831.000         830  957.000   

   CRSArrTime UniqueCarrier  FlightNum        ...         TaxiIn  TaxiOut  \
0        1340            WN       2891        ...              4       11   
1        2035            WN        462        ...              5        6   
2        2300            WN       1229        ...              6        9   
3        1330            WN       1355        ...              3        8   
4        1000            WN       2278        ...              3        9   

   Cancelled  CancellationCode  Diverted  CarrierDelay WeatherDelay NASDelay  \
0          0               NaN

In [3]:
y2007.shape

(7453188, 29)

In [4]:
y2007 = y2007.drop(['CancellationCode'], 1)

In [5]:
y2007 = y2007.dropna()

In [6]:
y2007.shape

(7275261, 28)

In [7]:
y2007['DepTime'].describe()

count   7275261.000
mean       1339.201
std         479.883
min           1.000
25%         930.000
50%        1329.000
75%        1733.000
max        2400.000
Name: DepTime, dtype: float64

In [8]:
y2007['CRSDepTime'].describe()

count   7275261.000
mean       1329.359
std         464.793
min           1.000
25%         926.000
50%        1320.000
75%        1720.000
max        2359.000
Name: CRSDepTime, dtype: float64

In [9]:
# determine if the flight arrived late

y2007['late'] = np.where(y2007['ArrTime'] - y2007['CRSArrTime'] >30, 1, 0)

In [10]:
y2007 = y2007.drop(['Origin', 'Dest', 'TailNum', 'UniqueCarrier'], 1)

In [11]:
# Define the training and test sizes.
trainsize = int(y2007.shape[0] / 2)
y2007_test = y2007.iloc[trainsize:, :].copy()
y2007_train = y2007.iloc[:trainsize, :].copy()

# Set up the regression model to predict defaults using all other
# variables as features.
regr1 = linear_model.LinearRegression()
Y_train = y2007_train['late'].values.reshape(-1, 1)
X_train = y2007_train

Y_test = y2007_test['late'].values.reshape(-1, 1)
X_test = y2007_test

regr1.fit(X_train, Y_train)
print('\nR-squared simple model:')
print(regr1.score(X_train, Y_train))

#Store the parameter estimates.
origparams = np.append(regr1.coef_, regr1.intercept_)


R-squared simple model:
1.0


In [12]:
print(regr1.score(X_test, Y_test))

print(regr1.score(X_train, Y_train))

1.0
1.0


In [13]:
lass = linear_model.Lasso(alpha=.08)
lassfit = lass.fit(X_train, Y_train)
print('R² for the model with few features:')
print(lass.score(X_train, Y_train))
lassparams = np.append(lassfit.coef_, lassfit.intercept_)
print('\nParameter estimates for the model with few features:')
print(lassparams)

R² for the model with few features:
0.541438213137

Parameter estimates for the model with few features:
[  0.00000000e+00  -0.00000000e+00  -0.00000000e+00   0.00000000e+00
   1.38412690e-04  -8.94075999e-05   3.93619745e-04  -3.88376501e-04
   3.32571298e-06   8.30570872e-04  -3.03629352e-04  -0.00000000e+00
   4.87670850e-03  -2.68760918e-04  -4.38487849e-05   0.00000000e+00
   1.06726353e-03   0.00000000e+00   0.00000000e+00  -0.00000000e+00
  -2.23449513e-04   7.99717371e-04   0.00000000e+00   9.98089199e-04
   8.70504373e-02  -7.67471699e-03]


In [14]:
print(lass.score(X_test, Y_test))

print(lass.score(X_train, Y_train))

0.545525787573
0.541438213137


In [15]:
ridgeregr = linear_model.Ridge(alpha=.57, fit_intercept=False) 
ridgeregr.fit(X_train, Y_train)
print(ridgeregr.score(X_train, Y_train))
ridgeparams = ridgeregr.coef_[0]
print(ridgeparams)


print('\nParameter Estimates for the same predictors for the small model:')
compare = np.column_stack((lassparams, ridgeparams))
prettycompare = np.array2string(
    compare,
    formatter={'float_kind':'{0:.3f}'.format})
print(prettycompare)

0.999999999998
[  6.60616217e-12  -3.49712065e-09  -5.98286883e-10   2.56766683e-09
   3.01532241e-10  -2.03071142e-10   7.83389155e-10  -7.75806100e-10
   7.21953132e-12  -8.40299712e-09   1.66242477e-08  -7.60698492e-09
   2.91327655e-08  -1.96931652e-08  -3.22661173e-11  -7.56322092e-09
  -5.21780745e-09   0.00000000e+00   0.00000000e+00  -6.61360968e-10
  -2.47622955e-09   1.12287743e-09   8.61798000e-09   2.00963613e-09
   9.99998184e-01]

Parameter Estimates for the same predictors for the small model:


ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [16]:
print(ridgeregr.score(X_test, Y_test))

print(ridgeregr.score(X_train, Y_train))

0.999999999998
0.999999999998


**The linear and ridge regression models were both able to account for more than 99% of the variance in the dataset, while the lasso regression model was only able to account for 54%.  The ridge regression model would appear to be the best performer of the 3.  It also appears the ridge and linear regression models are predicting at greater than 99% accuracy.**