# Regularization

Dataset credit: the Melbourne housing data used in this notebook comes from the codebasics repository: [Melbourne_housing_FULL.csv](https://github.com/codebasics/py/blob/master/ML/16_regularization/Melbourne_housing_FULL.csv)

## Import Necessary Libraries

In [88]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge

## Data Preparation

In [67]:
# Load the full Melbourne housing CSV (the path is relative to the working directory)
# and preview the first five rows.
dataframe = pd.read_csv(filepath_or_buffer='../data/Melbourne_housing_FULL.csv')
dataframe.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [68]:
# Columns that are not used as model inputs; every other column is kept unchanged.
# NOTE: this cell rebinds `dataframe`, so running it a second time raises a KeyError
# because the listed columns are already gone.
unused_columns = ['Address', 'Date', 'Postcode', 'YearBuilt', 'Lattitude', 'Longtitude']
dataframe = dataframe.drop(labels=unused_columns, axis='columns')
dataframe.head(n=5)

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Regionname,Propertycount
0,Abbotsford,2,h,,SS,Jellis,2.5,2.0,1.0,1.0,126.0,,Yarra City Council,Northern Metropolitan,4019.0
1,Abbotsford,2,h,1480000.0,S,Biggin,2.5,2.0,1.0,1.0,202.0,,Yarra City Council,Northern Metropolitan,4019.0
2,Abbotsford,2,h,1035000.0,S,Biggin,2.5,2.0,1.0,0.0,156.0,79.0,Yarra City Council,Northern Metropolitan,4019.0
3,Abbotsford,3,u,,VB,Rounds,2.5,3.0,2.0,1.0,0.0,,Yarra City Council,Northern Metropolitan,4019.0
4,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3.0,2.0,0.0,134.0,150.0,Yarra City Council,Northern Metropolitan,4019.0


In [69]:
# (rows, columns) of the frame at this point in the notebook.
dataframe.shape

(34857, 15)

In [70]:
# Number of missing values in each column.
dataframe.isna().sum()

Suburb               0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Distance             1
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
CouncilArea          3
Regionname           3
Propertycount        3
dtype: int64

In [71]:
# Columns whose missing entries are replaced with 0.
columns_fill_zero = ['Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car']

# A per-column fill mapping restricts the replacement to the listed columns only;
# NaNs in every other column are left untouched.
dataframe = dataframe.fillna(value={column: 0 for column in columns_fill_zero})

In [72]:
# Re-check the per-column missing-value counts after the zero fill.
dataframe.isna().sum()

Suburb               0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Distance             0
Bedroom2             0
Bathroom             0
Car                  0
Landsize         11810
BuildingArea     21115
CouncilArea          3
Regionname           3
Propertycount        0
dtype: int64

In [73]:
# Replace missing area measurements with the mean of the observed values in that column.
# NOTE(review): the mean is computed over every row currently in the frame, i.e. before any
# train/test split -- consider fitting the imputation on training rows only.
for area_column in ('Landsize', 'BuildingArea'):
    column_mean = dataframe[area_column].mean()
    dataframe[area_column] = dataframe[area_column].fillna(column_mean)



In [74]:
# Re-check the per-column missing-value counts after the mean fill.
dataframe.isna().sum()

Suburb              0
Rooms               0
Type                0
Price            7610
Method              0
SellerG             0
Distance            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
CouncilArea         3
Regionname          3
Propertycount       0
dtype: int64

In [75]:
# Discard every row that still contains at least one missing value, then confirm
# that no column has any NaN left.
dataframe = dataframe.dropna(axis='index', how='any')
dataframe.isna().sum()

Suburb           0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Distance         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
CouncilArea      0
Regionname       0
Propertycount    0
dtype: int64

In [76]:
# Preview the first rows of the frame after the missing-value handling.
dataframe.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Regionname,Propertycount
1,Abbotsford,2,h,1480000.0,S,Biggin,2.5,2.0,1.0,1.0,202.0,160.2564,Yarra City Council,Northern Metropolitan,4019.0
2,Abbotsford,2,h,1035000.0,S,Biggin,2.5,2.0,1.0,0.0,156.0,79.0,Yarra City Council,Northern Metropolitan,4019.0
4,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3.0,2.0,0.0,134.0,150.0,Yarra City Council,Northern Metropolitan,4019.0
5,Abbotsford,3,h,850000.0,PI,Biggin,2.5,3.0,2.0,1.0,94.0,160.2564,Yarra City Council,Northern Metropolitan,4019.0
6,Abbotsford,4,h,1600000.0,VB,Nelson,2.5,3.0,1.0,2.0,120.0,142.0,Yarra City Council,Northern Metropolitan,4019.0


In [77]:
# One-hot encode the categorical columns. drop_first=True omits the first level of each
# category so the indicator columns of one feature are not perfectly collinear.
dataframe = pd.get_dummies(data=dataframe, drop_first=True)
dataframe.head(n=5)

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Propertycount,Suburb_Aberfeldie,...,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
1,2,1480000.0,2.5,2.0,1.0,1.0,202.0,160.2564,4019.0,False,...,False,True,False,False,True,False,False,False,False,False
2,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,4019.0,False,...,False,True,False,False,True,False,False,False,False,False
4,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,4019.0,False,...,False,True,False,False,True,False,False,False,False,False
5,3,850000.0,2.5,3.0,2.0,1.0,94.0,160.2564,4019.0,False,...,False,True,False,False,True,False,False,False,False,False
6,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,4019.0,False,...,False,True,False,False,True,False,False,False,False,False


## Assigning X and y

In [78]:
# Feature matrix: every column except the target.
X = dataframe.drop(labels='Price', axis='columns')
X.head(n=5)

Unnamed: 0,Rooms,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Propertycount,Suburb_Aberfeldie,Suburb_Airport West,...,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
1,2,2.5,2.0,1.0,1.0,202.0,160.2564,4019.0,False,False,...,False,True,False,False,True,False,False,False,False,False
2,2,2.5,2.0,1.0,0.0,156.0,79.0,4019.0,False,False,...,False,True,False,False,True,False,False,False,False,False
4,3,2.5,3.0,2.0,0.0,134.0,150.0,4019.0,False,False,...,False,True,False,False,True,False,False,False,False,False
5,3,2.5,3.0,2.0,1.0,94.0,160.2564,4019.0,False,False,...,False,True,False,False,True,False,False,False,False,False
6,4,2.5,3.0,1.0,2.0,120.0,142.0,4019.0,False,False,...,False,True,False,False,True,False,False,False,False,False


In [79]:
# Target vector: the sale price column.
y = dataframe.loc[:, 'Price']
y.head(n=5)

1    1480000.0
2    1035000.0
4    1465000.0
5     850000.0
6    1600000.0
Name: Price, dtype: float64

## Splitting the Data into Training and Test Data

In [80]:
# Hold out 30% of the rows for testing. random_state is fixed so that the same rows land in
# the train and test sets on every run; without it the split (and the scores derived from it)
# changes each time the notebook is executed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Finding the best Model with Regularization

In [81]:
# Candidate estimators and the hyper-parameter grid that is searched for each of them.
# Lasso and Ridge sweep the same set of regularization strengths.
alpha_grid = [0.001, 0.01, 0.1, 1, 10, 100]

model_list = {
    'linear_regression': dict(
        model=LinearRegression(),
        params={'fit_intercept': [True, False]},
    ),
    'lasso': dict(
        model=Lasso(),
        params={'alpha': list(alpha_grid)},
    ),
    'ridge': dict(
        model=Ridge(),
        params={'alpha': list(alpha_grid)},
    ),
}

In [82]:
# Run a 5-fold grid search for every candidate on the training data and record only the
# winning parameter combination together with its mean cross-validated score.
scores = []

for model_name in model_list:
    model_params = model_list[model_name]
    grid_search = GridSearchCV(
        estimator=model_params['model'],
        param_grid=model_params['params'],
        cv=5,
        return_train_score=False,
    )
    grid_search.fit(X_train, y_train)

    best_result = dict(
        model=model_name,
        model_params=grid_search.best_params_,
        best_score=grid_search.best_score_,
    )
    scores.append(best_result)

In [83]:
# Tabulate the grid-search results, one row per candidate model.
score_columns = ['model', 'model_params', 'best_score']
score_data_frame = pd.DataFrame(data=scores, columns=score_columns)
score_data_frame

Unnamed: 0,model,model_params,best_score
0,linear_regression,{'fit_intercept': False},0.637609
1,lasso,{'alpha': 100},0.640753
2,ridge,{'alpha': 10},0.642849


## Model Creation

In [84]:
# Final estimator. alpha=10 is hard-coded here rather than read from the grid-search
# results, so it must be updated by hand if the search selects different parameters.
model = Ridge(alpha=10)

## Model Training

In [85]:
# Fit the chosen estimator on the training split only.
model.fit(X_train,y_train)

## Model Prediction

In [87]:
# Predict prices for the held-out test rows and display the resulting array.
y_pred = model.predict(X_test)
y_pred

array([ 540301.12619142,  780712.86501705, 1347144.60532067, ...,
       1095173.94486961,  597740.19550892, 1334749.37786245])

## Model Evaluation

In [89]:
# Evaluate on the held-out test split.
# `score` is the estimator's default metric (R^2 for scikit-learn regressors);
# `mse` is the mean squared error between the true and predicted prices.
score = model.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)

In [94]:
# Report the test-set metrics with two decimals. The format spec is `.2f` without the
# space sign flag: a spec of ` .2f` pads positive numbers with a leading blank, which
# printed the score with a double space after the colon.
print(f'Score: {score * 100:.2f}%')
print(f'MSE: {mse:.2f}')

Score:  66.97%
MSE: 134093278143.35
