# L1 and L2 Regularization
 (Lasso and Ridge Regression)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [33]:
data =pd.read_csv("MELBOURNE_HOUSE_PRICES_LESS.csv")
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Postcode,Regionname,Propertycount,Distance,CouncilArea
0,Abbotsford,49 Lithgow St,3,h,1490000.0,S,Jellis,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
1,Abbotsford,59A Turner St,3,h,1220000.0,S,Marshall,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
2,Abbotsford,119B Yarra St,3,h,1420000.0,S,Nelson,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
3,Aberfeldie,68 Vida St,3,h,1515000.0,S,Barry,1/04/2017,3040,Western Metropolitan,1543,7.5,Moonee Valley City Council
4,Airport West,92 Clydesdale Rd,2,h,670000.0,S,Nelson,1/04/2017,3042,Western Metropolitan,3464,10.4,Moonee Valley City Council


In [34]:
data.nunique()

Suburb             380
Address          57754
Rooms               14
Type                 3
Price             3417
Method               9
SellerG            476
Date               112
Postcode           225
Regionname           8
Propertycount      368
Distance           180
CouncilArea         34
dtype: int64

In [36]:
# let's use limited columns which makes more sense for serving our purpose
cols_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 
               'Distance', 'CouncilArea',]
data = data[cols_to_use]

In [37]:
data.head()



Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea
0,Abbotsford,3,h,S,Jellis,Northern Metropolitan,4019,3.0,Yarra City Council
1,Abbotsford,3,h,S,Marshall,Northern Metropolitan,4019,3.0,Yarra City Council
2,Abbotsford,3,h,S,Nelson,Northern Metropolitan,4019,3.0,Yarra City Council
3,Aberfeldie,3,h,S,Barry,Western Metropolitan,1543,7.5,Moonee Valley City Council
4,Airport West,2,h,S,Nelson,Western Metropolitan,3464,10.4,Moonee Valley City Council


In [38]:
data.shape

(63023, 9)

In [39]:
data.isna().sum()

Suburb           0
Rooms            0
Type             0
Method           0
SellerG          0
Regionname       0
Propertycount    0
Distance         0
CouncilArea      0
dtype: int64

Drop NA values of Price, since it's our predictive variable we won't impute it

# Let's one hot encode the categorical features

In [43]:
data = pd.get_dummies(data, drop_first=True)
data.head()

Unnamed: 0,Rooms,Propertycount,Distance,Suburb_Aberfeldie,Suburb_Airport West,Suburb_Albanvale,Suburb_Albert Park,Suburb_Albion,Suburb_Alphington,Suburb_Altona,...,CouncilArea_Moreland City Council,CouncilArea_Murrindindi Shire Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
0,3,4019,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,3,4019,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,4019,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,3,1543,7.5,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,3464,10.4,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's bifurcate our dataset into train and test dataset

In [47]:
X = data.drop('CouncilArea_Yarra Ranges Shire Council', axis=1)
y = data['CouncilArea_Yarra Ranges Shire Council']

In [48]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=2)

Let's train our Linear Regression Model on training dataset and check the accuracy on test set

In [49]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(train_X, train_y)
reg.score(test_X, test_y)

0.9882686063569261

In [50]:
reg.score(train_X, train_y)

1.0

Here training score is 68% but test score is 13.85% which is very low

# Normal Regression is clearly overfitting the data, let's try other models
Using Lasso (L1 Regularized) Regression Model

In [51]:
from sklearn import linear_model
lasso_reg = linear_model.Lasso(alpha=50, max_iter=100, tol=0.1)
lasso_reg.fit(train_X, train_y)

In [52]:
lasso_reg.score(test_X, test_y)


-0.0002605720030750547

In [53]:
lasso_reg.score(train_X, train_y)

0.0

# Using Ridge (L2 Regularized) Regression Model

In [54]:
from sklearn.linear_model import Ridge
ridge_reg= Ridge(alpha=50, max_iter=100, tol=0.1)
ridge_reg.fit(train_X, train_y)



In [55]:
ridge_reg.score(test_X, test_y)

0.780988811293843

In [56]:
ridge_reg.score(train_X, train_y)

0.8204074348653875

We see that Lasso and Ridge Regularizations prove to be beneficial when our Simple Linear Regression Model overfits. These results may not be that contrast but significant in most cases.Also that L1 & L2 Regularizations are used in Neural Networks too