Splitting the data into a training set and a test set

In [119]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
# One seed for the whole notebook: it seeds NumPy's global RNG here and is also
# passed as random_state to the scikit-learn objects created further down.
SEED  = 0
np.random.seed(SEED)
def split_data():
    """Load ``prostate.data`` and split it using its train/test indicator column.

    The file is read from the current working directory as a
    whitespace-delimited table. The last column is the indicator
    ('T' = training row, 'F' = test row), the second-to-last column is the
    response, and every remaining column is a predictor.

    Returns
    -------
    X_train, X_test : numpy.ndarray
        2-D float arrays holding the predictor columns of each subset.
    y_train, y_test : numpy.ndarray
        1-D float arrays holding the response of each subset.
    DATA : pandas.DataFrame
        The complete table exactly as read from disk.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    DATA = pd.read_csv('prostate.data', sep=r'\s+')

    indicator = DATA.iloc[:, -1]
    train_rows = DATA[indicator == 'T']
    test_rows = DATA[indicator == 'F']

    # Slice the numeric columns *before* converting to NumPy. Converting the
    # whole frame (string indicator included) would produce object-dtype arrays.
    X_train = train_rows.iloc[:, :-2].to_numpy(dtype=float)
    X_test = test_rows.iloc[:, :-2].to_numpy(dtype=float)
    y_train = train_rows.iloc[:, -2].to_numpy(dtype=float)
    y_test = test_rows.iloc[:, -2].to_numpy(dtype=float)
    return X_train, X_test, y_train, y_test, DATA
# Load the data and split it with the file's own train/test indicator.
X_train, X_test, y_train, y_test, DATA = split_data()

# Learn the standardisation from the training rows only, then apply that same
# transform to both subsets.
sc = StandardScaler().fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

# Cross-validation uses k = 5 rather than 10 because the data set is small.
# An explicit shuffled KFold is passed to cross_val_score so the folds are
# drawn at random; with a plain integer cv the rows would not be shuffled.
kfold = KFold(n_splits=5, random_state=SEED, shuffle=True)

Step 1: fit a Lasso model and select the best $\lambda$ for it by cross-validation.

In [120]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
# Search grid: 100 log-spaced penalties from 1e-10 up to 1e1.
lambda_range = np.logspace(-10, 1, 100)

# Lowest mean CV error seen so far and the penalty that produced it.
MIN_STEP_1, MIN_LAMB_STEP_1 = np.inf, None
for candidate in lambda_range:
    # Scaling sits inside the pipeline so it is part of what gets cross-validated.
    cv_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('lasso', Lasso(alpha=candidate, fit_intercept=True, random_state=SEED)),
    ])
    # The scorer returns negated MSE; flip the sign to get the error per fold.
    fold_mse = -cross_val_score(
        cv_pipeline, X_train, y_train, scoring='neg_mean_squared_error', cv=kfold
    )
    mean_mse = fold_mse.mean()
    if mean_mse < MIN_STEP_1:
        MIN_STEP_1, MIN_LAMB_STEP_1 = mean_mse, candidate

# Note: the selected lambda is consistent with question 1 (when looking at the CV criterion).
# Refit on the full standardised training set with the selected penalty.
step_1_lasso_model = Lasso(
    alpha=MIN_LAMB_STEP_1, fit_intercept=True, random_state=SEED
).fit(X_train_scaled, y_train)
COEF_STEP_1 = step_1_lasso_model.coef_

# Test-set error of the step-1 model.
y_pred = step_1_lasso_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(mse)

# Column indices whose step-1 coefficient is exactly zero.
INDEX_OF_ZERO = np.flatnonzero(COEF_STEP_1 == 0)

# Drop those columns from the scaled and unscaled versions of both subsets.
(
    X_train_scaled_modified,
    X_train_modified,
    X_test_scaled_modified,
    X_test_modified,
) = (
    np.delete(matrix, INDEX_OF_ZERO, axis=1)
    for matrix in (X_train_scaled, X_train, X_test_scaled, X_test)
)


0.49399783092864197


Step 2: repeat the Lasso $\lambda$ search using only the variables kept in Step 1.

In [121]:
# Search grid for the second pass: 100 log-spaced penalties from 1e-10 up to 1e0.
lambda_range = np.logspace(-10, 0, 100)

# Lowest mean CV error seen so far and the penalty that produced it.
MIN_STEP_2, MIN_LAMB_STEP_2 = np.inf, None
for candidate in lambda_range:
    # Same scaler + Lasso pipeline as in step 1, now fed the reduced predictor matrix.
    cv_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('lasso', Lasso(alpha=candidate, fit_intercept=True, random_state=SEED)),
    ])
    # The scorer returns negated MSE; flip the sign to get the error per fold.
    fold_mse = -cross_val_score(
        cv_pipeline, X_train_modified, y_train, scoring='neg_mean_squared_error', cv=kfold
    )
    mean_mse = fold_mse.mean()
    if mean_mse < MIN_STEP_2:
        MIN_STEP_2, MIN_LAMB_STEP_2 = mean_mse, candidate

# Refit on the full standardised, reduced training set with the selected penalty.
step_2_lasso_model = Lasso(
    alpha=MIN_LAMB_STEP_2, fit_intercept=True, random_state=SEED
).fit(X_train_scaled_modified, y_train)
COEF_STEP_2 = step_2_lasso_model.coef_

Comparison between $\lambda_1$ and $\lambda_2$ (and the coefficients)

In [122]:
# Report the two selected penalties side by side.
print(f"The lambda 1 value =  {MIN_LAMB_STEP_1}, as for the lambda 2 value = {MIN_LAMB_STEP_2}")

# Derive the relation from the values instead of hard-coding "bigger than", so
# the sentence stays true if a re-run (other data, seed or grid) selects
# different penalties.
if MIN_LAMB_STEP_1 > MIN_LAMB_STEP_2:
    lambda_relation = "bigger than"
elif MIN_LAMB_STEP_1 < MIN_LAMB_STEP_2:
    lambda_relation = "smaller than"
else:
    lambda_relation = "equal to"
print(f"So we can clearly see that the lambda 1 is {lambda_relation} the lambda 2, difference = {MIN_LAMB_STEP_1-MIN_LAMB_STEP_2}")

The lambda 1 value =  0.012915496650148827, as for the lambda 2 value = 3.2745491628777316e-09
So we can clearly see that the lambda 1 is bigger than the lambda 2, difference = 0.012915493375599664


To predict on new data, we define the following function.

In [123]:
from sklearn.metrics import mean_squared_error
def predict(X_NEW):
    """Predict the response for new observations with the two-step Lasso.

    X_NEW must be the raw predictor matrix: unscaled, and still containing
    every original column. The function standardises it with the fitted
    scaler ``sc``, removes the columns listed in ``INDEX_OF_ZERO`` (the
    variables dropped after the first Lasso), and returns the predictions of
    ``step_2_lasso_model`` on what remains.
    """
    return step_2_lasso_model.predict(
        np.delete(sc.transform(X_NEW), INDEX_OF_ZERO, axis=1)
    )

# Test-set error of the two-step procedure, obtained through predict().
y_pred = predict(X_test)
MSE_ERROR = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(MSE_ERROR)


0.5165134813206721
