# Regularization

## Import Necessary Libraries

In [39]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge

## Data Preparation

In [40]:
dataframe = pd.read_csv('../data/data.csv')
dataframe.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [41]:
dataframe.shape

(4600, 18)

In [42]:
dataframe.nunique()

date               70
price            1741
bedrooms           10
bathrooms          26
sqft_living       566
sqft_lot         3113
floors              6
waterfront          2
view                5
condition           5
sqft_above        511
sqft_basement     207
yr_built          115
yr_renovated       60
street           4525
city               44
statezip           77
country             1
dtype: int64

## Dropping of Tables

- Date
- Year Built
- Year Renovated
- Street
- StateZip
- Country

In [43]:
new_dataframe = dataframe.drop(columns=['date', 'yr_built', 'yr_renovated', 'street', 'statezip', 'country'])
new_dataframe.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,city
0,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,Shoreline
1,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,Seattle
2,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,Kent
3,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,Bellevue
4,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,Redmond


In [44]:
new_dataframe.isna().sum()

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
city             0
dtype: int64

In [45]:
new_dataframe.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement
count,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0
mean,551963.0,3.40087,2.160815,2139.346957,14852.52,1.512065,0.007174,0.240652,3.451739,1827.265435,312.081522
std,563834.7,0.908848,0.783781,963.206916,35884.44,0.538288,0.084404,0.778405,0.67723,862.168977,464.137228
min,0.0,0.0,0.0,370.0,638.0,1.0,0.0,0.0,1.0,370.0,0.0
25%,322875.0,3.0,1.75,1460.0,5000.75,1.0,0.0,0.0,3.0,1190.0,0.0
50%,460943.5,3.0,2.25,1980.0,7683.0,1.5,0.0,0.0,3.0,1590.0,0.0
75%,654962.5,4.0,2.5,2620.0,11001.25,2.0,0.0,0.0,4.0,2300.0,610.0
max,26590000.0,9.0,8.0,13540.0,1074218.0,3.5,1.0,4.0,5.0,9410.0,4820.0


In [46]:
(new_dataframe['price'] == 0).sum()

49

In [47]:
new_dataframe[new_dataframe['price'] == 0] = new_dataframe['price'].mean()
new_dataframe

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,city
0,3.130000e+05,3.0,1.50,1340.0,7912.0,1.5,0.0,0.0,3.0,1340.0,0.0,Shoreline
1,2.384000e+06,5.0,2.50,3650.0,9050.0,2.0,0.0,4.0,5.0,3370.0,280.0,Seattle
2,3.420000e+05,3.0,2.00,1930.0,11947.0,1.0,0.0,0.0,4.0,1930.0,0.0,Kent
3,4.200000e+05,3.0,2.25,2000.0,8030.0,1.0,0.0,0.0,4.0,1000.0,1000.0,Bellevue
4,5.500000e+05,4.0,2.50,1940.0,10500.0,1.0,0.0,0.0,4.0,1140.0,800.0,Redmond
...,...,...,...,...,...,...,...,...,...,...,...,...
4595,3.081667e+05,3.0,1.75,1510.0,6360.0,1.0,0.0,0.0,4.0,1510.0,0.0,Seattle
4596,5.343333e+05,3.0,2.50,1460.0,7573.0,2.0,0.0,0.0,3.0,1460.0,0.0,Bellevue
4597,4.169042e+05,3.0,2.50,3010.0,7014.0,2.0,0.0,0.0,3.0,3010.0,0.0,Renton
4598,2.034000e+05,4.0,2.00,2090.0,6630.0,1.0,0.0,0.0,3.0,1070.0,1020.0,Seattle


In [48]:
new_dataframe.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement
count,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0
mean,557842.6,5882.964225,5881.737812,7989.263573,20556.86,5881.101834,5879.612269,5879.837921,5883.018355,7682.416834,6186.452486
std,560918.0,56669.335946,56669.463215,56458.762508,65732.62,56669.529211,56669.683788,56669.660376,56669.330326,56488.991876,56639.705592
min,7800.0,0.0,0.0,370.0,638.0,1.0,0.0,0.0,1.0,370.0,0.0
25%,328158.9,3.0,1.75,1470.0,5002.75,1.0,0.0,0.0,3.0,1190.0,0.0
50%,468750.0,3.0,2.25,1980.0,7700.0,1.5,0.0,0.0,3.0,1600.0,0.0
75%,654962.5,4.0,2.5,2632.5,11200.0,2.0,0.0,0.0,4.0,2320.0,620.0
max,26590000.0,551962.988473,551962.988473,551962.988473,1074218.0,551962.988473,551962.988473,551962.988473,551962.988473,551962.988473,551962.988473


In [51]:
new_dataframe.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,city
0,313000.0,3.0,1.5,1340.0,7912.0,1.5,0.0,0.0,3.0,1340.0,0.0,Shoreline
1,2384000.0,5.0,2.5,3650.0,9050.0,2.0,0.0,4.0,5.0,3370.0,280.0,Seattle
2,342000.0,3.0,2.0,1930.0,11947.0,1.0,0.0,0.0,4.0,1930.0,0.0,Kent
3,420000.0,3.0,2.25,2000.0,8030.0,1.0,0.0,0.0,4.0,1000.0,1000.0,Bellevue
4,550000.0,4.0,2.5,1940.0,10500.0,1.0,0.0,0.0,4.0,1140.0,800.0,Redmond


In [53]:
new_dataframe = pd.get_dummies(new_dataframe, drop_first=True)
new_dataframe.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,...,city_SeaTac,city_Seattle,city_Shoreline,city_Skykomish,city_Snoqualmie,city_Snoqualmie Pass,city_Tukwila,city_Vashon,city_Woodinville,city_Yarrow Point
0,313000.0,3.0,1.5,1340.0,7912.0,1.5,0.0,0.0,3.0,1340.0,...,False,False,True,False,False,False,False,False,False,False
1,2384000.0,5.0,2.5,3650.0,9050.0,2.0,0.0,4.0,5.0,3370.0,...,False,True,False,False,False,False,False,False,False,False
2,342000.0,3.0,2.0,1930.0,11947.0,1.0,0.0,0.0,4.0,1930.0,...,False,False,False,False,False,False,False,False,False,False
3,420000.0,3.0,2.25,2000.0,8030.0,1.0,0.0,0.0,4.0,1000.0,...,False,False,False,False,False,False,False,False,False,False
4,550000.0,4.0,2.5,1940.0,10500.0,1.0,0.0,0.0,4.0,1140.0,...,False,False,False,False,False,False,False,False,False,False


## Assigning X and y