# Load Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression as lm
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


Check sklearn version

In [None]:
# Print the installed scikit-learn version string.
import sklearn
print(sklearn.__version__)

1.5.2


# Load Data

In [None]:
# Read the tab-separated prostate data set, then discard two columns by
# position (0 and 10). NOTE(review): presumably the row-number column and the
# train/test indicator shipped with the file -- confirm against the raw header.
prostate_url = "https://hastie.su.domains/ElemStatLearn/datasets/prostate.data"
prostate_raw = pd.read_csv(prostate_url, sep='\t')
unwanted = [prostate_raw.columns[i] for i in (0, 10)]
myData = prostate_raw.drop(columns=unwanted)
myData.head()

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa
0,-0.579818,2.769459,50,-1.386294,0,-1.386294,6,0,-0.430783
1,-0.994252,3.319626,58,-1.386294,0,-1.386294,6,0,-0.162519
2,-0.510826,2.691243,74,-1.386294,0,-1.386294,7,20,-0.162519
3,-1.203973,3.282789,58,-1.386294,0,-1.386294,6,0,-0.162519
4,0.751416,3.432373,62,-1.386294,0,-1.386294,6,0,0.371564


# Train/Test Split

In [None]:
# Predictors are every column except the last; the response is the last column.
n_cols = myData.shape[1]
X = myData.iloc[:, : n_cols - 1]
Y = myData.iloc[:, n_cols - 1]
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1
)


# Full Model

These results agree with those from https://liangfgithub.github.io/Python_W3_VarSel_RidgeLasso.html

In [None]:
# Ordinary least squares on all predictors (no penalty).
full_model = lm().fit(X_train, Y_train)
# Mean squared prediction error over the held-out test rows.
full_test_pred = full_model.predict(X_test)
mean_squared_error(Y_test, full_test_pred)

0.4031994553977937

# Ridge Regression with Normalization = True

Center and scale the data first, and then call Ridge. The prediction error agrees with the one from https://liangfgithub.github.io/Python_W3_VarSel_RidgeLasso.html

Note that to match the result from sklearn 1.0 with `normalize=True`, we need to multiply the old `alpha` by the sample size `n` (the number of training rows, shown below).

In [None]:
# (rows, columns) of the training predictors; the row count is the sample size n.
X_train.shape

(77, 8)

In [None]:
# The alpha below was chosen under the legacy `normalize=True` convention.
# To reproduce that fit on standardized data it is multiplied by the number of
# training rows, which is read from the data instead of being hard-coded.
n_train = X_train.shape[0]
model = make_pipeline(
    StandardScaler(),  # center/scale using training-set statistics only
    Ridge(alpha=0.26560877829466867 * n_train),
)
model.fit(X_train, Y_train)

In [None]:
# Test-set MSE of the standardize-then-ridge pipeline.
ridge_test_pred = model.predict(X_test)
mean_squared_error(Y_test, ridge_test_pred)

0.5057873761573644

Alternatively, instead of using `make_pipeline`, we can just standardize both the train and test data before calling Ridge.

In [None]:
# Learn the per-feature centering and scaling from the training rows only,
# then apply that same transformation to both splits.
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(X_train)
newX_train = scaler.transform(X_train)
newX_test = scaler.transform(X_test)

In [None]:
# Same ridge fit as the pipeline version, on the pre-standardized predictors.
# The legacy alpha is multiplied by the number of training rows, taken from the
# data so it stays correct if the split changes.
n_train = newX_train.shape[0]
myridge = Ridge(alpha=0.26560877829466867 * n_train)
myridge.fit(newX_train, Y_train)

Next, let's compare the ridge coefficients. The direct output `myridge.coef_` won't match those from https://liangfgithub.github.io/Python_W3_VarSel_RidgeLasso.html, as they are the coefficients for `newX`. However, once we rescale the coefficients (similar to what we did in Coding Assignment 1, Part 1) by dividing by `scaler.scale_`, so that they refer to the original X matrix, the coefficients do match.

In [None]:
# Ridge coefficients on the standardized predictors (newX_train).
myridge.coef_

array([ 0.39324695,  0.20921068, -0.03779262,  0.09719885,  0.28834127,
        0.08373455,  0.11298982,  0.07026156])

In [None]:
# Divide each coefficient by the matching entry of scaler.scale_ so it is
# expressed per unit of the original (unstandardized) predictor.
myridge.coef_ / scaler.scale_

array([ 0.32824306,  0.49154153, -0.00515117,  0.06812984,  0.69518028,
        0.06186817,  0.16494969,  0.0026527 ])

# Lasso

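The Lasso prediction error and coefficient estimates agree with those from https://liangfgithub.github.io/Python_W3_VarSel_RidgeLasso.html. As with ridge, the old `alpha` is rescaled to account for the standardization; for Lasso the factor is `sqrt(n)` rather than `n`.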

In [None]:
# Lasso on the standardized predictors. The legacy alpha is multiplied by
# sqrt(n), where n is the number of training rows, taken from the data rather
# than hard-coded.
n_train = newX_train.shape[0]
lasso_model = Lasso(alpha=0.00572236765935022 * np.sqrt(n_train))
lasso_model.fit(newX_train, Y_train)
# Mean squared prediction error on the standardized test predictors.
mean_squared_error(Y_test, lasso_model.predict(newX_test))

0.43426555165941333

In [None]:
# Divide by scaler.scale_ to express the Lasso coefficients per unit of the
# original (unstandardized) predictors.
lasso_model.coef_ / scaler.scale_

array([ 0.4289285 ,  0.4692587 , -0.        ,  0.05252947,  0.7751224 ,
        0.        ,  0.13352423,  0.00100985])