In [1]:
import the_beginning
from initial_cleaning import initial_cleaning

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn import preprocessing

%matplotlib inline
sns.set_style('white')
pd.options.display.float_format = '{:.3f}'.format
plt.rcParams['figure.figsize'] = (20.0, 10.0)

In [2]:
y2007 = pd.DataFrame(initial_cleaning('/Users/jamilab/Data_Science/Data_Sets/2007.csv.bz2'))

   Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  ArrTime  \
0  2007      1           1          1 1232.000        1225 1341.000   
1  2007      1           1          1 1918.000        1905 2043.000   
2  2007      1           1          1 2206.000        2130 2334.000   
3  2007      1           1          1 1230.000        1200 1356.000   
4  2007      1           1          1  831.000         830  957.000   

   CRSArrTime UniqueCarrier  FlightNum        ...         TaxiIn  TaxiOut  \
0        1340            WN       2891        ...              4       11   
1        2035            WN        462        ...              5        6   
2        2300            WN       1229        ...              6        9   
3        1330            WN       1355        ...              3        8   
4        1000            WN       2278        ...              3        9   

   Cancelled  CancellationCode  Diverted  CarrierDelay WeatherDelay NASDelay  \
0          0               NaN

In [3]:
y2007.shape

(7453188, 29)

In [4]:
y2007 = y2007.drop(['CancellationCode'], 1)

In [5]:
y2007 = y2007.dropna()

In [6]:
y2007.shape

(7275261, 28)

In [7]:
y2007['DepTime'].describe()

count   7275261.000
mean       1339.201
std         479.883
min           1.000
25%         930.000
50%        1329.000
75%        1733.000
max        2400.000
Name: DepTime, dtype: float64

In [8]:
y2007['CRSDepTime'].describe()

count   7275261.000
mean       1329.359
std         464.793
min           1.000
25%         926.000
50%        1320.000
75%        1720.000
max        2359.000
Name: CRSDepTime, dtype: float64

In [9]:
# determine if the flight arrived late

y2007['late'] = np.where(y2007['ArrTime'] - y2007['CRSArrTime'] >30, 1, 0)

In [10]:
y2007 = y2007.drop(['Origin', 'Dest', 'TailNum', 'UniqueCarrier'], 1)

In [11]:
y2007.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,FlightNum,ActualElapsedTime,...,TaxiIn,TaxiOut,Cancelled,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,late
0,2007,1,1,1,1232.0,1225,1341.0,1340,2891,69.0,...,4,11,0,0,0,0,0,0,0,0
1,2007,1,1,1,1918.0,1905,2043.0,2035,462,85.0,...,5,6,0,0,0,0,0,0,0,0
2,2007,1,1,1,2206.0,2130,2334.0,2300,1229,88.0,...,6,9,0,0,3,0,0,0,31,1
3,2007,1,1,1,1230.0,1200,1356.0,1330,1355,86.0,...,3,8,0,0,23,0,0,0,3,0
4,2007,1,1,1,831.0,830,957.0,1000,2278,86.0,...,3,9,0,0,0,0,0,0,0,0


# 2007 Airline data modeled with linear, lasso & ridge regression using raw data

In [12]:
# Define the training and test sizes

trainsize = int(y2007.shape[0] / 2)
y2007_test = y2007.iloc[trainsize:, :].copy()
y2007_train = y2007.iloc[:trainsize, :].copy()

# Set up the linear regression model to predict defaults using all other variables as features.

regr07 = linear_model.LinearRegression()
Y_train = y2007_train['late'].values.reshape(-1, 1)
X_train = y2007_train.drop(['late'], 1)

Y_test = y2007_test['late'].values.reshape(-1, 1)
X_test = y2007_test.drop(['late'], 1)

regr07.fit(X_train, Y_train)
print('\nR-squared for raw data:')
print(regr07.score(X_test, Y_test))

# Store parameter estimates

origparams07 = np.append(regr07.coef_, regr07.intercept_)
print('\nParameter estimates:')
print(origparams07)


R-squared for raw data:
0.455671848126

Parameter estimates:
[  0.00000000e+00  -1.92562839e-03  -3.29436190e-04   1.41384097e-03
   1.66033440e-04  -1.11817561e-04   4.31359518e-04  -4.27184043e-04
   3.97530850e-06  -6.34341121e-03   9.24266838e-03  -2.56102254e-03
   1.61302644e-02  -1.09325140e-02  -1.77667720e-05  -2.53692479e-03
  -1.24546388e-03  -1.99059519e-16   2.22261445e-16  -3.64166845e-04
  -1.36349246e-03   6.18292775e-04   4.74534002e-03   1.10657089e-03
   7.30059820e-03]


In [13]:
# lasso regression model 

lass07 = linear_model.Lasso(alpha=.08)
lassfit07 = lass07.fit(X_train, Y_train)
print('R² for raw data:')
print(lass07.score(X_test, Y_test))

lassparams07 = np.append(lassfit07.coef_, lassfit07.intercept_)
print('\nParameter estimates:')
print(lassparams07)

R² for raw data:
0.455029168363

Parameter estimates:
[  0.00000000e+00  -0.00000000e+00  -0.00000000e+00   0.00000000e+00
   1.52888673e-04  -9.91426455e-05   4.31211994e-04  -4.25570547e-04
   3.67273007e-06   9.13103513e-04  -3.57342840e-04  -0.00000000e+00
   5.40884905e-03  -3.75050929e-04  -4.53144005e-05   0.00000000e+00
   1.17777293e-03   0.00000000e+00   0.00000000e+00  -0.00000000e+00
  -3.15705459e-04   8.76551504e-04   0.00000000e+00   1.12165497e-03
  -7.65963228e-03]


In [14]:
# ridge regression model

ridgeregr07 = linear_model.Ridge(alpha=10, fit_intercept=False) 
ridgeregr07.fit(X_train, Y_train)
print(ridgeregr07.score(X_test, Y_test))

ridgeparams07 = ridgeregr07.coef_[0]
print(ridgeparams07)

0.455671773203
[  3.63757375e-06  -1.92562431e-03  -3.29436230e-04   1.41383339e-03
   1.66033569e-04  -1.11817720e-04   4.31359067e-04  -4.27183568e-04
   3.97530347e-06  -3.77410396e-03   5.81692852e-03  -1.70458987e-03
   1.27045241e-02  -7.50677433e-03  -1.77667879e-05  -1.68048834e-03
  -3.89030722e-04   0.00000000e+00   0.00000000e+00  -3.64166174e-04
  -1.36349033e-03   6.18292444e-04   4.74533831e-03   1.10657129e-03]


# 2007 Airline data with feature engineering using linear, lasso & ridge regression models

In [15]:
# New features to capture potential relationships between features.

y2007_train['ArrDist'] = y2007_train['ArrTime'] * y2007_train['Distance']
y2007_train['DepDist'] = y2007_train['DepTime'] * y2007_train['Distance']
y2007_train['ArrTaxiIn'] = y2007_train['ArrTime'] * y2007_train['TaxiIn']
y2007_train['DepTaxiOut'] = y2007_train['DepTime'] * y2007_train['TaxiOut']
y2007_train['ArrWeather'] = y2007_train['ArrTime'] * y2007_train['WeatherDelay']
y2007_train['DepWeather'] = y2007_train['DepTime'] * y2007_train['WeatherDelay']
y2007_train['ArrSecurity'] = y2007_train['ArrTime'] * y2007_train['SecurityDelay']
y2007_train['DepSecurity'] = y2007_train['DepTime'] * y2007_train['SecurityDelay']
y2007_train['ArrAircraft'] = y2007_train['ArrTime'] * y2007_train['LateAircraftDelay']
y2007_train['DepAircraft'] = y2007_train['DepTime'] * y2007_train['LateAircraftDelay']

X_train2 = y2007_train.drop(['late'], 1)

y2007_test['ArrDist'] = y2007_test['ArrTime'] * y2007_test['Distance']
y2007_test['DepDist'] = y2007_test['DepTime'] * y2007_test['Distance']
y2007_test['ArrTaxiIn'] = y2007_test['ArrTime'] * y2007_test['TaxiIn']
y2007_test['DepTaxiOut'] = y2007_test['DepTime'] * y2007_test['TaxiOut']
y2007_test['ArrWeather'] = y2007_test['ArrTime'] * y2007_test['WeatherDelay']
y2007_test['DepWeather'] = y2007_test['DepTime'] * y2007_test['WeatherDelay']
y2007_test['ArrSecurity'] = y2007_test['ArrTime'] * y2007_test['SecurityDelay']
y2007_test['DepSecurity'] = y2007_test['DepTime'] * y2007_test['SecurityDelay']
y2007_test['ArrAircraft'] = y2007_test['ArrTime'] * y2007_test['LateAircraftDelay']
y2007_test['DepAircraft'] = y2007_test['DepTime'] * y2007_test['LateAircraftDelay']

X_test2 = y2007_test.drop(['late'], 1)

# Re-run linear regression model with new features

regrBig = linear_model.LinearRegression()
regrBig.fit(X_train2, Y_train)
print('\nR-squared for feature engineered data:')
print(regrBig.score(X_test2, Y_test))

# Store the new parameter estimates for the same features.
newparams07 = np.append(
    regrBig.coef_[0,0:(len(origparams07)-1)],
    regrBig.intercept_)
print('\nParameter Estimates for raw data and feature engineered data:')

compare = np.column_stack((origparams07, newparams07))
prettycompare = np.array2string(compare, formatter={'float_kind':'{0:.3f}'.format})
print(prettycompare)


R-squared for feature engineered data:
0.462444607535

Parameter Estimates for raw data and feature engineered data:
[[0.000 -0.000]
 [-0.002 -0.002]
 [-0.000 -0.000]
 [0.001 0.001]
 [0.000 0.000]
 [-0.000 -0.000]
 [0.000 0.000]
 [-0.000 -0.000]
 [0.000 0.000]
 [-0.006 0.003]
 [0.009 -0.002]
 [-0.003 -0.000]
 [0.016 0.005]
 [-0.011 0.000]
 [-0.000 0.000]
 [-0.003 -0.001]
 [-0.001 0.004]
 [-0.000 -0.000]
 [0.000 -0.000]
 [-0.000 -0.000]
 [-0.001 -0.002]
 [0.001 0.001]
 [0.005 0.008]
 [0.001 0.001]
 [0.007 -0.040]]


In [16]:
# lasso regression feature engineered data

lassBig = linear_model.Lasso(alpha=.08)
lassBig.fit(X_train2, Y_train)
print('\nR² for feature engineered data:')
print(lassBig.score(X_test2, Y_test))

lassparamsBig = np.append(lassBig.coef_, lassBig.intercept_)
print('\nParameter estimates for feature engineered data:')
print(lassparamsBig)


R² for feature engineered data:
0.459370654447

Parameter estimates for feature engineered data:
[  0.00000000e+00  -0.00000000e+00  -0.00000000e+00   0.00000000e+00
   2.04693684e-04  -1.01669977e-04   3.32343881e-04  -3.85585897e-04
   3.78396765e-06   8.14182243e-04  -3.23815642e-04   0.00000000e+00
   5.28579720e-03  -2.71525141e-04  -3.19140096e-06  -0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00  -0.00000000e+00
  -0.00000000e+00   7.95293505e-04   0.00000000e+00   0.00000000e+00
   1.92071211e-08  -4.52708293e-08   5.53519724e-07   1.10512645e-06
   7.04203755e-07  -1.29189347e-06   3.07876838e-06  -0.00000000e+00
   1.74556275e-06  -8.63352997e-07   2.57944355e-03]


In [17]:
ridgeregrBig = linear_model.Ridge(alpha=10, fit_intercept=False)
ridgeregrBig.fit(X_train2, Y_train)
print(ridgeregrBig.score(X_test2, Y_test))
ridgeparamsBig = ridgeregrBig.coef_[0, 0:len(ridgeparams07)]
print('\nParameter Estimates for the raw data and feature engineered data:')

compare = np.column_stack((ridgeparams07, ridgeparamsBig))
prettycompare = np.array2string(compare, formatter={'float_kind':'{0:.3f}'.format})
print(prettycompare)

0.462444610106

Parameter Estimates for the raw data and feature engineered data:
[[0.000 -0.000]
 [-0.002 -0.002]
 [-0.000 -0.000]
 [0.001 0.001]
 [0.000 0.000]
 [-0.000 -0.000]
 [0.000 0.000]
 [-0.000 -0.000]
 [0.000 0.000]
 [-0.004 0.003]
 [0.006 -0.002]
 [-0.002 -0.000]
 [0.013 0.005]
 [-0.008 0.000]
 [-0.000 0.000]
 [-0.002 -0.001]
 [-0.000 0.004]
 [0.000 0.000]
 [0.000 0.000]
 [-0.000 -0.000]
 [-0.001 -0.002]
 [0.001 0.001]
 [0.005 0.008]
 [0.001 0.001]]


# Model accuracy comparison between raw data & feature engineered data

In [18]:
# linear regression accuracy comparison between raw data & feature engineered data

print(regr07.score(X_test, Y_test))
print(regrBig.score(X_test2, Y_test))

0.455671848126
0.462444607535


In [19]:
# linear regression cross validation scores for raw data

from sklearn.model_selection import cross_val_score
cross_val_score(regr07, X_test, Y_test, cv=10)

array([ 0.46124522,  0.42896775,  0.44804776,  0.4582674 ,  0.43872102,
        0.46245763,  0.43973763,  0.46029888,  0.47911415,  0.42077351])

In [20]:
# linear regression cross validation scores for feature engineered data

cross_val_score(regrBig, X_test2, Y_test, cv=10)

array([ 0.46896519,  0.43701738,  0.45426426,  0.46563443,  0.44412255,
        0.47010052,  0.44812356,  0.46355955,  0.48383079,  0.42819987])

**The linear regression model with feature engineered data performed slightly better than the raw data.**

In [21]:
# lasso regression accuracy comparison between raw data & feature engineered data

print(lass07.score(X_test, Y_test))
print(lassBig.score(X_test2, Y_test))

0.455029168363
0.459370654447


In [22]:
# cross validation scores for lasso regression with raw data

cross_val_score(lass07, X_test, Y_test, cv=10)

array([ 0.46484485,  0.42601282,  0.44796706,  0.45941506,  0.43562219,
        0.46364741,  0.43680399,  0.46028614,  0.4772016 ,  0.41502799])

In [24]:
# cross validation scores for lasso regression with feature engineered data

cross_val_score(lassBig, X_test2, Y_test, cv=10)

array([ 0.469555  ,  0.43258243,  0.45119391,  0.4647272 ,  0.44022532,
        0.46819195,  0.44314157,  0.46282639,  0.47991883,  0.42013241])

**The lasso regression model using feature engineered data performed slightly better than the model using raw data.**  

In [25]:
# ridge regression accuracy comparison between raw data & feature engineered data

print(ridgeregr07.score(X_test, Y_test))
print(ridgeregrBig.score(X_test2, Y_test))

0.455671773203
0.462444610106


In [26]:
# cross validation scores for ridge regression with raw data

cross_val_score(ridgeregr07, X_test, Y_test, cv=10)

array([ 0.46124868,  0.4289722 ,  0.44805086,  0.45826291,  0.43871652,
        0.46245759,  0.43973354,  0.46029898,  0.479113  ,  0.42077508])

In [27]:
# cross validation scores for ridge regression with feature engineered data

cross_val_score(ridgeregrBig, X_test2, Y_test, cv=10)

array([ 0.46896957,  0.43701796,  0.45426827,  0.46563064,  0.44411845,
        0.4701004 ,  0.44811999,  0.46355935,  0.48382895,  0.4281996 ])

**The ridge regression model using feature engineered data performed slightly better than the model using raw data.**

# 2008 Airline Data Comparison

In [28]:
y2008 = pd.DataFrame(initial_cleaning('/Users/jamilab/Data_Science/Data_Sets/2008.csv.bz2'))

   Year  Month  DayofMonth  DayOfWeek  DepTime  CRSDepTime  ArrTime  \
0  2008      1           3          4 2003.000        1955 2211.000   
1  2008      1           3          4  754.000         735 1002.000   
2  2008      1           3          4  628.000         620  804.000   
3  2008      1           3          4  926.000         930 1054.000   
4  2008      1           3          4 1829.000        1755 1959.000   

   CRSArrTime UniqueCarrier  FlightNum        ...         TaxiIn  TaxiOut  \
0        2225            WN        335        ...          4.000    8.000   
1        1000            WN       3231        ...          5.000   10.000   
2         750            WN        448        ...          3.000   17.000   
3        1100            WN       1746        ...          3.000    7.000   
4        1925            WN       3920        ...          3.000   10.000   

   Cancelled  CancellationCode  Diverted  CarrierDelay WeatherDelay NASDelay  \
0          0               NaN

In [29]:
y2008 = y2008.drop(['CancellationCode', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'], 1)

In [30]:
y2008 = y2008.drop(['Origin', 'Dest', 'TailNum', 'UniqueCarrier'], 1)

In [31]:
y2008 = y2008.dropna()

In [32]:
y2008.shape

(6855025, 19)

In [33]:
# determine if the flight arrived late

y2008['late'] = np.where(y2008['ArrTime'] - y2008['CRSArrTime'] >30, 1, 0)

# 2008 Airline data modeled with linear, lasso & ridge regression using raw data

In [34]:
# Define the training and test sizes.

trainsize = int(y2008.shape[0] / 2)
y2008_test = y2008.iloc[trainsize:, :].copy()
y2008_train = y2008.iloc[:trainsize, :].copy()

# Set up the linear regression model to predict defaults using all other variables as features.
regr08 = linear_model.LinearRegression()
Y_train08 = y2008_train['late'].values.reshape(-1, 1)
X_train08 = y2008_train.drop(['late'], 1)

Y_test08 = y2008_test['late'].values.reshape(-1, 1)
X_test08 = y2008_test.drop(['late'], 1)

regr08.fit(X_train08, Y_train08)
print('\nR-squared simple model:')
print(regr08.score(X_test08, Y_test08))

#Store the parameter estimates.
origparams08 = np.append(regr08.coef_, regr08.intercept_)


R-squared simple model:
0.454240778376


In [35]:
# lasso regression model

lass08 = linear_model.Lasso(alpha=.08)
lassfit08 = lass08.fit(X_train08, Y_train08)
print('R² for raw data:')
print(lass08.score(X_test08, Y_test08))

lassparams08 = np.append(lassfit08.coef_, lassfit08.intercept_)
print('\nParameter estimates for raw data:')
print(lassparams08)

R² for raw data:
0.454397157291

Parameter estimates for raw data:
[  0.00000000e+00  -0.00000000e+00  -0.00000000e+00   0.00000000e+00
   1.57841981e-04  -1.04554308e-04   4.12320238e-04  -4.07750857e-04
   2.51460577e-06   1.35352281e-03  -3.06287857e-04  -0.00000000e+00
   5.72454206e-03  -1.35333038e-04  -1.03994478e-04   0.00000000e+00
   1.09757202e-03   0.00000000e+00   0.00000000e+00  -1.95423095e-02]


In [36]:
# ridge regression model

ridgeregr08 = linear_model.Ridge(alpha=10, fit_intercept=False) 
ridgeregr08.fit(X_train08, Y_train08)
print(ridgeregr08.score(X_test08, Y_test08))

ridgeparams08 = ridgeregr08.coef_[0]
print('\nParameter estimates for raw data:')
print(ridgeparams08)

0.454240823416

Parameter estimates for raw data:
[  1.84321799e-06  -2.17469313e-03  -3.36965991e-04   9.85854236e-04
   1.74881563e-04  -1.20945773e-04   4.11457081e-04  -4.06506267e-04
   2.83617071e-06  -3.32619326e-02   4.53681475e-02  -1.15511111e-02
   5.25417217e-02  -4.70485990e-02  -4.37808907e-05  -1.15479938e-02
  -1.01628174e-02   0.00000000e+00   0.00000000e+00]


**In terms of model accuracy and performance, there wasn't much difference in variance accounted for between the 3 models, with all models at approximately 45%.**

# 2008 Airline data with feature engineering using linear, lasso & ridge regression models

In [37]:
# New features to capture potential relationships between features.

y2008_train['ArrDist'] = y2008_train['ArrTime'] * y2008_train['Distance']
y2008_train['DepDist'] = y2008_train['DepTime'] * y2008_train['Distance']
y2008_train['ArrTaxiIn'] = y2008_train['ArrTime'] * y2008_train['TaxiIn']
y2008_train['DepTaxiOut'] = y2008_train['DepTime'] * y2008_train['TaxiOut']
y2008_train['ArrElapsedTime'] = y2008_train['ArrTime'] * y2008_train['ActualElapsedTime']
y2008_train['DepElapsedTime'] = y2008_train['DepTime'] * y2008_train['ActualElapsedTime']
y2008_train['ArrWeekDay'] = y2008_train['ArrTime'] * y2008_train['DayOfWeek']
y2008_train['DepWeekDay'] = y2008_train['DepTime'] * y2008_train['DayOfWeek']
y2008_train['ArrMonth'] = y2008_train['ArrTime'] * y2008_train['Month']
y2008_train['DepMonth'] = y2008_train['DepTime'] * y2008_train['Month']

X_train08_2 = y2008_train.drop(['late'], 1)

y2008_test['ArrDist'] = y2008_test['ArrTime'] * y2008_test['Distance']
y2008_test['DepDist'] = y2008_test['DepTime'] * y2008_test['Distance']
y2008_test['ArrTaxiIn'] = y2008_test['ArrTime'] * y2008_test['TaxiIn']
y2008_test['DepTaxiOut'] = y2008_test['DepTime'] * y2008_test['TaxiOut']
y2008_test['ArrElapsedTime'] = y2008_test['ArrTime'] * y2008_test['ActualElapsedTime']
y2008_test['DepElapsedTime'] = y2008_test['DepTime'] * y2008_test['ActualElapsedTime']
y2008_test['ArrWeekDay'] = y2008_test['ArrTime'] * y2008_test['DayOfWeek']
y2008_test['DepWeekDay'] = y2008_test['DepTime'] * y2008_test['DayOfWeek']
y2008_test['ArrMonth'] = y2008_test['ArrTime'] * y2008_test['Month']
y2008_test['DepMonth'] = y2008_test['DepTime'] * y2008_test['Month']

X_test08_2 = y2008_test.drop(['late'], 1)

# Re-run linear regression model with new features

regrBig08 = linear_model.LinearRegression()
regrBig08.fit(X_train08_2, Y_train08)
print('\nR-squared for feature engineered data:')
print(regrBig08.score(X_test08_2, Y_test08))

# Store the new parameter estimates for the feature engineered data
newparams08 = np.append(
    regrBig08.coef_[0,0:(len(origparams08)-1)],
    regrBig08.intercept_)
print('\nParameter Estimates for raw data and feature engineered data:')

compare = np.column_stack((origparams08, newparams08))
prettycompare = np.array2string(compare, formatter={'float_kind':'{0:.3f}'.format})
print(prettycompare)


R-squared for feature engineered data:
0.455984668064

Parameter Estimates for raw data and feature engineered data:
[[0.000 -0.000]
 [-0.002 -0.003]
 [-0.000 -0.000]
 [0.001 -0.001]
 [0.000 0.000]
 [-0.000 -0.000]
 [0.000 0.000]
 [-0.000 -0.000]
 [0.000 0.000]
 [-0.074 -0.104]
 [0.099 0.140]
 [-0.025 -0.036]
 [0.107 0.147]
 [-0.101 -0.141]
 [-0.000 0.000]
 [-0.025 -0.037]
 [-0.024 -0.031]
 [0.000 -0.000]
 [0.000 -0.000]
 [0.004 -0.008]]


In [38]:
# lasso regression feature engineered data

lassBig08 = linear_model.Lasso(alpha=.08)
lassBig08.fit(X_train08_2, Y_train08)
print('\nR² for feature engineered data:')
print(lassBig08.score(X_test08_2, Y_test08))

lassparamsBig08 = np.append(lassBig08.coef_, lassBig08.intercept_)
print('\nParameter estimates for feature engineered data:')
print(lassparamsBig08)




R² for feature engineered data:
0.455454141878

Parameter estimates for feature engineered data:
[  0.00000000e+00  -0.00000000e+00  -0.00000000e+00  -0.00000000e+00
   2.09111305e-04  -1.14848180e-04   3.24963334e-04  -3.97664516e-04
   2.46120298e-06   0.00000000e+00  -2.70960988e-04   0.00000000e+00
   5.57579751e-03  -1.44872016e-05   7.69067210e-05   0.00000000e+00
   0.00000000e+00   0.00000000e+00   0.00000000e+00  -1.87704157e-07
   6.75072247e-08   1.94347668e-07   9.17920030e-07   1.70251156e-06
  -8.74600796e-07  -3.56667802e-07   1.22072818e-06  -1.42603975e-06
   4.35774722e-07   3.64363774e-02]


In [39]:
ridgeregrBig08 = linear_model.Ridge(alpha=10, fit_intercept=False)
ridgeregrBig08.fit(X_train08_2, Y_train08)
print(ridgeregrBig08.score(X_test08_2, Y_test08))
ridgeparamsBig08 = ridgeregrBig08.coef_[0, 0:len(ridgeparams08)]
print('\nParameter Estimates for the raw data and feature engineered data:')

compare = np.column_stack((ridgeparams08, ridgeparamsBig08))
prettycompare = np.array2string(compare, formatter={'float_kind':'{0:.3f}'.format})
print(prettycompare)

0.455984524026

Parameter Estimates for the raw data and feature engineered data:
[[0.000 -0.000]
 [-0.002 -0.003]
 [-0.000 -0.000]
 [0.001 -0.001]
 [0.000 0.000]
 [-0.000 -0.000]
 [0.000 0.000]
 [-0.000 -0.000]
 [0.000 0.000]
 [-0.033 -0.047]
 [0.045 0.064]
 [-0.012 -0.017]
 [0.053 0.071]
 [-0.047 -0.066]
 [-0.000 0.000]
 [-0.012 -0.018]
 [-0.010 -0.012]
 [0.000 0.000]
 [0.000 0.000]]


# Model accuracy comparison between raw data & feature engineered data 

In [40]:
# linear regression accuracy comparison between raw data & feature engineered data

print(regr08.score(X_test08, Y_test08))
print(regrBig08.score(X_test08_2, Y_test08))

0.454240778376
0.455984668064


In [41]:
# linear regression cross validation scores for raw data

cross_val_score(regr08, X_test08, Y_test08, cv=10)

array([ 0.47855221,  0.42153833,  0.47418089,  0.4322856 ,  0.44477572,
        0.44720516,  0.40910252,  0.47595388,  0.4678902 ,  0.41008312])

In [42]:
# linear regression cross validation scores for feature engineered data

cross_val_score(regrBig08, X_test08_2, Y_test08, cv=10)

array([ 0.48238067,  0.42447666,  0.47532247,  0.43308534,  0.44510076,
        0.44685807,  0.40976333,  0.47782069,  0.47008957,  0.41346887])

In [43]:
# lasso regression accuracy comparison between raw and feature engineered data

print(lass08.score(X_test08, Y_test08))
print(lassBig08.score(X_test08_2, Y_test08))

0.454397157291
0.455454141878


In [44]:
# lasso regression cross validation scores for raw data

cross_val_score(lass08, X_test08, Y_test08, cv=10)

array([ 0.48174577,  0.42286147,  0.47512065,  0.43205495,  0.44411385,
        0.44811147,  0.40862419,  0.4758653 ,  0.46666177,  0.40892329])

In [45]:
# lasso regression cross validation scores for feature engineered data

cross_val_score(lassBig08, X_test08_2, Y_test08, cv=10)



array([ 0.48045737,  0.4234053 ,  0.47492624,  0.432588  ,  0.44538594,
        0.44736963,  0.40794609,  0.47467016,  0.46751945,  0.40886014])

In [46]:
# ridge regression accuracy comparison between raw & feature engineered data

print(ridgeregr08.score(X_test08, Y_test08))
print(ridgeregrBig08.score(X_test08_2, Y_test08))

0.454240823416
0.455984524026


In [47]:
# ridge regression cross validation scores for raw data

cross_val_score(ridgeregr08, X_test, Y_test, cv=10)

array([ 0.46124868,  0.4289722 ,  0.44805086,  0.45826291,  0.43871652,
        0.46245759,  0.43973354,  0.46029898,  0.479113  ,  0.42077508])

In [48]:
# ridge regression cross validation scores for feature engineered data

cross_val_score(ridgeregrBig08, X_test08_2, Y_test08, cv=10)

array([ 0.48238067,  0.42447666,  0.47532247,  0.43308534,  0.44510076,
        0.44685807,  0.40976333,  0.47782069,  0.47008958,  0.41346887])

**All 3 models performed slightly better with feature engineered data than with the raw data.  Accuracy was constant at 45% for all models, which were linear regression, lasso regression and ridge regression.**