## Implementing Lasso/Ridge
We will use Lasso and Ridge to help us with feature selection.

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [23]:
## first load the data
coffee = pd.read_csv('../data/one_hot_coffee.csv')
coffee = coffee.copy()

In [24]:
## next perform the train test split
coffee_train, coffee_test = train_test_split(coffee,
                                            shuffle=True,
                                            random_state=47,
                                            test_size = .2)

In [25]:
## make a baseline
baseline = coffee['rating'].mean()
print(baseline)

90.4599101988454


In [26]:
## import the LinearRegression object
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [27]:
predictors = ['region_africa_arabia', 'region_caribbean',
       'region_central_america', 'region_hawaii', 'region_asia_pacific',
       'region_south_america', 'type_espresso', 'type_organic',
       'type_fair_trade', 'type_decaffeinated', 'type_pod_capsule',
       'type_blend', 'type_estate', 'Dark', 'Light', 'Medium', 'Medium-Dark', 'Medium-Light', 'Very Dark']

We are keeping the first linear regression model for comparison to the Lasso and Ridge Methods

In [36]:
reg = LinearRegression(copy_X = True)

reg.fit(coffee_train[predictors], 
        coffee_train['rating'])

LinearRegression()

In [37]:
## make the predictions
preds = reg.predict(coffee_train[predictors])
preds_baseline = baseline * np.ones(len(coffee_train))

In [38]:
## check the mean squared error
mse = mean_squared_error(coffee_train['rating'], preds)
mse_baseline = mean_squared_error(coffee_train['rating'], preds_baseline)
print("The mean squared error for multiple linear regression is", mse)
print("The mean squared error for the baseline is", mse_baseline)

The mean squared error for multiple linear regression is 9.931262169506294
The mean squared error for the baseline is 15.565458489578745


Now we make the Lasso and Ridge Models.

In [59]:
## set values for alpha
alphas = [0.000001,.00001,.0001,.001,.01,.1,.25,.5,.75,1,5,10,25,50,100,1000,10000]

## These will hold our coefficient estimates
ridge_coefs = np.empty((len(alphas),len(predictors)))
lasso_coefs = np.empty((len(alphas),len(predictors)))

## These will hold our mse estimates
ridge_mse = np.empty(len(alphas))
lasso_mse = np.empty(len(alphas))

## for each alpha value
for i in range(len(alphas)):
    ## set up the ridge pipeline
    ## first scale
    ## then make polynomial features
    ## then fit the ridge regression model
    ## max_iter=5000000
    ridge_pipe = Pipeline([#('scale' ,StandardScaler()),
                              ('ridge',Ridge(alpha=alphas[i], max_iter=5000000))])
    
    ## set up the lasso pipeline
    ## same steps as with ridge
    lasso_pipe = Pipeline([#('scale' ,StandardScaler()),
                              ('lasso',Lasso(alpha=alphas[i], max_iter=5000000))])
    
    
    ## fit the ridge
    ridge_pipe.fit(coffee_train[predictors], 
        coffee_train['rating'])
    
    ## fit the lasso
    lasso_pipe.fit(coffee_train[predictors], 
        coffee_train['rating'])

    ## make the predictions
    ridge_preds = ridge_pipe.predict(coffee_train[predictors])
    lasso_preds = lasso_pipe.predict(coffee_train[predictors])
    
    ridge_mse[i] = mean_squared_error(coffee_train['rating'], ridge_preds)
    lasso_mse[i] = mean_squared_error(coffee_train['rating'], lasso_preds)
    
    # record the coefficients
    ridge_coefs[i,:] = ridge_pipe['ridge'].coef_
    lasso_coefs[i,:] = lasso_pipe['lasso'].coef_

In [60]:
print("Ridge Coefficients")

pd.DataFrame(np.round(ridge_coefs,8),
            columns = [str(i) for i in predictors],
            index = ["alpha=" + str(a) for a in alphas])

Ridge Coefficients


Unnamed: 0,region_africa_arabia,region_caribbean,region_central_america,region_hawaii,region_asia_pacific,region_south_america,type_espresso,type_organic,type_fair_trade,type_decaffeinated,type_pod_capsule,type_blend,type_estate,Dark,Light,Medium,Medium-Dark,Medium-Light,Very Dark
alpha=1e-06,1.881073,-1.733478,0.431393,0.931024,0.225973,0.078407,1.7595,-0.034572,0.096751,0.258168,-1.864663,0.223573,0.607377,-3.089036,2.948697,1.467998,-1.043889,2.825813,-3.109582
alpha=1e-05,1.881073,-1.733477,0.431393,0.931024,0.225973,0.078407,1.7595,-0.034572,0.096751,0.258168,-1.864663,0.223573,0.607377,-3.089036,2.948697,1.467998,-1.043889,2.825813,-3.109582
alpha=0.0001,1.881073,-1.733469,0.431393,0.931023,0.225973,0.078407,1.7595,-0.034571,0.096751,0.258167,-1.864662,0.223573,0.607377,-3.089035,2.948696,1.467998,-1.043889,2.825812,-3.109581
alpha=0.001,1.881074,-1.733382,0.431395,0.931012,0.225971,0.078409,1.759495,-0.03457,0.096749,0.258156,-1.864652,0.223572,0.607377,-3.089025,2.948683,1.467993,-1.043891,2.825806,-3.109566
alpha=0.01,1.881081,-1.732519,0.431414,0.930903,0.225958,0.078428,1.759447,-0.034558,0.096726,0.258052,-1.864552,0.223566,0.607375,-3.088924,2.948556,1.467944,-1.043909,2.825748,-3.109415
alpha=0.1,1.881153,-1.723934,0.431605,0.929811,0.225824,0.078623,1.758969,-0.034432,0.096497,0.257007,-1.863556,0.2235,0.607358,-3.087915,2.947288,1.467458,-1.044085,2.825168,-3.107913
alpha=0.25,1.881269,-1.709817,0.43192,0.927995,0.225597,0.078945,1.758172,-0.034223,0.096115,0.255277,-1.861898,0.22339,0.60733,-3.086236,2.945176,1.466648,-1.044377,2.824203,-3.105415
alpha=0.5,1.881455,-1.686805,0.432433,0.924977,0.225213,0.079469,1.756841,-0.033875,0.095481,0.252426,-1.859141,0.223202,0.607291,-3.083441,2.941665,1.465302,-1.04486,2.822597,-3.101263
alpha=0.75,1.88163,-1.664418,0.432932,0.921971,0.224821,0.07998,1.755508,-0.033529,0.094848,0.249614,-1.856393,0.223007,0.607258,-3.080651,2.938164,1.46396,-1.04534,2.820996,-3.097128
alpha=1,1.881796,-1.642631,0.433418,0.918977,0.224421,0.080479,1.754171,-0.033184,0.094218,0.246842,-1.853653,0.222807,0.607233,-3.077868,2.934672,1.462622,-1.045815,2.819398,-3.093008


In [61]:
print("Lasso Coefficients")

pd.DataFrame(np.round(lasso_coefs,8),
            columns = [str(i) for i in predictors],
            index = ["alpha=" + str(a) for a in alphas])

Lasso Coefficients


Unnamed: 0,region_africa_arabia,region_caribbean,region_central_america,region_hawaii,region_asia_pacific,region_south_america,type_espresso,type_organic,type_fair_trade,type_decaffeinated,type_pod_capsule,type_blend,type_estate,Dark,Light,Medium,Medium-Dark,Medium-Light,Very Dark
alpha=1e-06,1.881059,-1.733286,0.431378,0.930962,0.225949,0.078386,1.759487,-0.034527,0.096702,0.258055,-1.864633,0.223554,0.607375,-4.40748,1.630227,0.149535,-2.362342,1.507353,-4.428021
alpha=1e-05,1.880936,-1.731552,0.431248,0.9304,0.225738,0.078203,1.759368,-0.034124,0.096261,0.25704,-1.864361,0.223388,0.607353,-4.407366,1.630112,0.149477,-2.362306,1.507325,-4.427856
alpha=0.0001,1.879699,-1.71422,0.429945,0.92478,0.223621,0.076369,1.758177,-0.030094,0.091846,0.246884,-1.861647,0.221729,0.607136,-4.406218,1.628954,0.148896,-2.361943,1.507049,-4.42621
alpha=0.001,1.867806,-1.540518,0.417186,0.869024,0.202867,0.058721,1.746357,-0.0,0.055696,0.146217,-1.834803,0.205193,0.605028,-4.394316,1.617513,0.143102,-2.358282,1.50429,-4.409872
alpha=0.01,1.778886,-0.0,0.319172,0.353566,0.017605,0.0,1.637223,0.0,0.0,0.0,-1.599597,0.04297,0.572975,-4.267247,1.509792,0.088099,-2.330645,1.478362,-4.242951
alpha=0.1,1.396951,-0.0,0.0,0.0,-0.0,0.0,0.539974,-0.0,-0.0,-0.0,-0.0,-0.0,0.110903,-2.535805,0.765107,0.0,-1.560849,1.537958,-2.04838
alpha=0.25,0.909523,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,-0.322282,1.506155,-0.0
alpha=0.5,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.656982,-0.0
alpha=0.75,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0
alpha=1,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0


In [64]:
ridge_mse

array([ 9.93090465,  9.93090465,  9.93090465,  9.93090465,  9.93090466,
        9.93090561,  9.9309106 ,  9.93092813,  9.93095677,  9.93099609,
        9.93282083,  9.9374467 ,  9.96145701, 10.02301419, 10.18468621,
       12.32553232, 14.89728647])

In [65]:
lasso_mse

array([ 9.93090465,  9.93090472,  9.93091121,  9.93152269,  9.96890211,
       10.97326893, 13.33703348, 14.80837099, 15.56493585, 15.56493585,
       15.56493585, 15.56493585, 15.56493585, 15.56493585, 15.56493585,
       15.56493585, 15.56493585])

## Summary
So it appears both organic and fair trade have little effect on our models.

In [66]:
predictors = ['region_africa_arabia', 'region_caribbean',
       'region_central_america', 'region_hawaii', 'region_asia_pacific',
       'region_south_america', 'type_espresso',
       'type_decaffeinated', 'type_pod_capsule',
       'type_blend', 'type_estate', 'Dark', 'Light', 'Medium', 'Medium-Dark', 'Medium-Light', 'Very Dark']

In [67]:
reg = LinearRegression(copy_X = True)

reg.fit(coffee_train[predictors], 
        coffee_train['rating'])

## make the predictions
preds = reg.predict(coffee_train[predictors])
preds_baseline = baseline * np.ones(len(coffee_train))

## check the mean squared error
mse = mean_squared_error(coffee_train['rating'], preds)
mse_baseline = mean_squared_error(coffee_train['rating'], preds_baseline)
print("The mean squared error for multiple linear regression is", mse)
print("The mean squared error for the baseline is", mse_baseline)

The mean squared error for multiple linear regression is 9.93170332430646
The mean squared error for the baseline is 15.565458489578745
