# **Regularization**

In [118]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LinearRegression as LR

# Regularized linear models (Lasso = L1 penalty, Ridge = L2 penalty)
from sklearn.linear_model import Lasso as L1
from sklearn.linear_model import Ridge as L2

#### Suppressing warnings

In [2]:
import warnings
warnings.filterwarnings("ignore")

### Importing the dataset

In [54]:
# Location of the Melbourne housing CSV (raw file served from GitHub).
src = r"https://raw.githubusercontent.com/codebasics/py/master/ML/16_regularization/Melbourne_housing_FULL.csv"

# Download and parse the CSV into a DataFrame; this needs network access.
df = pd.read_csv(filepath_or_buffer=src)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [55]:
# (rows, columns) of the frame as loaded, before any preprocessing.
df.shape

(34857, 21)

## **Preprocessing**

#### Dropping unimportant columns

In [56]:
# Remove columns that are not used as model features. "Lattitude" and
# "Longtitude" are spelled exactly as they appear in the dataset header.
df.drop(
    columns=["Date", "Lattitude", "Longtitude", "Address", "Postcode", "YearBuilt"],
    inplace=True,
)

# Confirm the reduced column count.
df.shape

(34857, 15)

#### Unique value counts

In [57]:
# Number of distinct values in each remaining column.
df.nunique()

Unnamed: 0,0
Suburb,351
Rooms,12
Type,3
Price,2871
Method,9
SellerG,388
Distance,215
Bedroom2,15
Bathroom,11
Car,15


#### **Missing value imputation**

In [58]:
# Count of missing (NaN) entries per column, before imputation.
df.isna().sum()

Unnamed: 0,0
Suburb,0
Rooms,0
Type,0
Price,7610
Method,0
SellerG,0
Distance,1
Bedroom2,8217
Bathroom,8226
Car,8728


In [59]:
# Impute the two area features with their column means. The filled Series is
# assigned back to the column: calling fillna(inplace=True) on `df.Landsize`
# acts on an intermediate Series and does not update `df` under pandas
# Copy-on-Write, which would leave the NaNs in place for the later dropna().
df["Landsize"] = df["Landsize"].fillna(df["Landsize"].mean())
df["BuildingArea"] = df["BuildingArea"].fillna(df["BuildingArea"].mean())

# The remaining numeric features have their missing entries replaced by 0.
df[["Car", "Propertycount", "Distance", "Bedroom2", "Bathroom"]] = (
    df[["Car", "Propertycount", "Distance", "Bedroom2", "Bathroom"]].fillna(0)
)

In [60]:
# Discard every row that still has at least one missing value. Price was not
# imputed above, so rows without a Price are removed here.
df.dropna(axis=0, how="any", inplace=True)

# Re-check: every per-column missing count should now be zero.
df.isna().sum()

Unnamed: 0,0
Suburb,0
Rooms,0
Type,0
Price,0
Method,0
SellerG,0
Distance,0
Bedroom2,0
Bathroom,0
Car,0


#### **One Hot Encoding**

In [64]:
# Expand the categorical columns into integer 0/1 indicator columns, dropping
# the first level of each category.
df = pd.get_dummies(data=df, dtype=int, drop_first=True)

df.head()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Propertycount,Suburb_Aberfeldie,...,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
1,2,1480000.0,2.5,2.0,1.0,1.0,202.0,160.2564,4019.0,0,...,0,1,0,0,1,0,0,0,0,0
2,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,4019.0,0,...,0,1,0,0,1,0,0,0,0,0
4,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,4019.0,0,...,0,1,0,0,1,0,0,0,0,0
5,3,850000.0,2.5,3.0,2.0,1.0,94.0,160.2564,4019.0,0,...,0,1,0,0,1,0,0,0,0,0
6,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,4019.0,0,...,0,1,0,0,1,0,0,0,0,0


In [68]:
# Target is the sale price; the feature matrix is every other column.
y = df["Price"]
X = df.drop(columns="Price")

X.shape

(27244, 744)

## **Regression analysis**

#### Utility function

In [122]:
def reg_results(train_score, test_score):
  """Print the training and testing scores, each rounded to 5 decimal places.

  Args:
    train_score: Score obtained on the training split.
    test_score: Score obtained on the testing split.

  Returns:
    None. The two scores are written to stdout on separate lines.
  """
  training_line = f"Training score : {round(train_score, 5)}"
  testing_line = f"Testing score : {round(test_score, 5)}"
  print(training_line, testing_line, sep="\n")

#### Splitting the dataset

In [100]:
# Split configuration: 30% of the rows are held out, with a fixed seed so the
# shuffle is repeatable.
test_size = 0.3
random_state = 2

X_train, X_test, y_train, y_test = tts(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
    shuffle=True,
)

# Shapes of the resulting feature matrices.
X_train.shape, X_test.shape

((19070, 745), (8174, 745))

### **Ordinary** Linear Regression (no regularization)

#### Model training

In [109]:
# Unregularized baseline: linear regression on all features.
lr = LR()
lr.fit(X=X_train, y=y_train)

#### Regression scores

In [116]:
# Score the baseline model on the training split and on the held-out split.
train_score, test_score = lr.score(X_train, y_train), lr.score(X_test, y_test)

reg_results(train_score, test_score)

Training score : 0.68278
Testing score : 0.13854

The model is overfitting: the training score (0.683) is far higher than the testing score (0.139).


### **Lasso** Regression

#### Model training

In [123]:
# Lasso hyperparameters: penalty strength, iteration cap, stopping tolerance.
# NOTE(review): tol=0.1 with max_iter=100 looks like a loose stopping rule, and
# the features are not standardized in the visible cells, so the penalty acts
# unevenly across differently scaled columns — confirm both are intended.
alpha = 50
max_iter = 100
tolerance = 0.1

l1_reg = L1(
    alpha=alpha,
    max_iter=max_iter,
    tol=tolerance,
)
l1_reg.fit(X=X_train, y=y_train)

#### Regression scores

In [124]:
# Score the Lasso model on the training split and on the held-out split.
train_score_lasso, test_score_lasso = (
    l1_reg.score(X_train, y_train),
    l1_reg.score(X_test, y_test),
)

reg_results(train_score_lasso, test_score_lasso)

Training score : 0.67674
Testing score : 0.66377


### **Ridge** Regression

#### Model training

In [125]:
# Ridge hyperparameters; the values match those used for the Lasso model.
alpha2 = 50
max_iter2 = 100
tolerance2 = 0.1

l2_reg = L2(
    alpha=alpha2,
    max_iter=max_iter2,
    tol=tolerance2,
)
l2_reg.fit(X=X_train, y=y_train)

#### Regression scores

In [126]:
# Score the Ridge model on the training split and on the held-out split.
train_score_ridge, test_score_ridge = (
    l2_reg.score(X_train, y_train),
    l2_reg.score(X_test, y_test),
)

reg_results(train_score_ridge, test_score_ridge)

Training score : 0.66224
Testing score : 0.66708
