# Regularization

## Imports

In [1]:
from utilities.std_imports import *
from sklearn.linear_model import LinearRegression, Ridge, Lasso 
from sklearn.model_selection import train_test_split, cross_val_score 
from statistics import mean 

## Loading and cleaning the Data

In [9]:
# Loading the data into a Pandas DataFrame 
data = pd.read_csv('D:/data/csv/kc_house_data.csv') 
  
# Dropping the numerically non-sensical variables 
dropColumns = ['id', 'date', 'zipcode'] 
data = data.drop(dropColumns, axis = 1) 
  
# Separating the dependent and independent variables 
y = data['price'] 
X = data.drop('price', axis = 1) 
  
# Dividing the data into training and testing set 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25) 
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((16209, 17), (16209,), (5404, 17), (5404,))

## a) Linear Regression:

In [3]:
lr = LinearRegression() 
lr.fit(X_train, y_train) 
  
score = lr.score(X_test, y_test)
print('Score : ', '{:.2f}'.format(score)) 

Score :  0.70


## b) Ridge(L2) Regression:

In [10]:
scores = [] 
alphas = [] 
  
for i in range(1, 9): 
    alpha = i * 0.25
    ridge = Ridge(alpha = alpha) 
    ridge.fit(X_train, y_train) 
    scrs = cross_val_score(ridge, X, y, cv = 10) 
    avg_score = mean(scrs) * 100
    scores.append(avg_score) 
    alphas.append(alpha) 

# Loop to print the different values of cross-validation scores 
print('alpha\tscore')
for i in range(0, len(alphas)): 
    print(str(alphas[i]) + '\t' + '{:.5f}'.format(scores[i])) 

# Building and fitting the Ridge Regression model with the best alpha = 2
best_ridge = Ridge(alpha = 2) 
best_ridge.fit(X_train, y_train) 
  
# Evaluating the Ridge Regression model 
score = best_ridge.score(X_test, y_test) 
print('Final score : ', '{:.5f}'.format(score))

alpha	score
0.25	69.09030
0.5	69.09047
0.75	69.09063
1.0	69.09077
1.25	69.09088
1.5	69.09098
1.75	69.09106
2.0	69.09112
Final score :  0.70274


## c) Lasso(L1) Regression:

In [None]:
scores = [] 
lambdas = [] 
  
# Loop to compute the cross-validation scores 
for i in range(1, 9): 
    lamda = i * 0.25
    lasso = Lasso(alpha = lamda, tol = 0.0925, max_iter = 1e5) 
    lasso.fit(X_train, y_train) 
    scrs = cross_val_score(lasso, X, y, cv = 10) 
    avg_score = mean(scores) * 100
    scores.append(avg_score) 
    lambdas.append(lamda) 
  
# Loop to print the different values of cross-validation scores 
print('lambda\tscore')
for i in range(0, len(lambdas)): 
    print(str(lambdas[i]) + '\t' + '{:.5f}'.format(scores[i])) 

  positive)


In [None]:
# Building and fitting the Lasso Regression Model with the best lambda = 2
best_lasso = Lasso(alpha = 2, tol = 0.0925) 
best_lasso.fit(X_train, y_train) 
  
# Evaluating the Lasso Regression model 
score = best_lasso.score(X_test, y_test)
print('Final score : ', '{:.5f}'.format(score))

## Step 4: Comparing and Visualizing the results

In [None]:
# Building the two lists for visualization 
models = ['Linear', 'Ridge', 'Lasso'] 
scores = [lr.score(X_test, y_test), best_ridge.score(X_test, y_test), best_lasso.score(X_test, y_test)] 
  
# Building the dictionary to compare the scores 
mapping = {} 
mapping['Linear'] = linearModel.score(X_test, y_test) 
mapping['Ridge'] = ridgeModelChosen.score(X_test, y_test) 
mapping['Lasso'] = lassoModelChosen.score(X_test, y_test) 
  
# Printing the scores for different models 
for key, val in mapping.items(): 
    print(str(key)+' : '+str(val)) 

# Plotting the scores 
plt.bar(models, scores) 
plt.xlabel('\nRegression Models\n') 
plt.ylabel('Score') 
plt.show() 

## Credits & Links

https://www.geeksforgeeks.org/ml-implementing-l1-and-l2-regularization-using-sklearn/
https://www.bogotobogo.com/python/scikit-learn/scikit-learn_logistic_regression.php