### Goal
To predict the sales price for each house. For each Id in the test set, you must predict the value of the SalePrice variable. 


### Step 1 - Data Ingestion

In [2]:
import pandas as pd
df = pd.read_csv("training_set.csv", na_values=["","NA"], keep_default_na=False)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Step 2 - Perform Basic Data Quality Checks

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
m = df.isna().sum()
m

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [5]:
m[m>0]


LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [6]:
df.duplicated().sum()

np.int64(0)

### Step 3 - Seperate X and Y(SalePrice)
Id is Stastically insignificant


In [7]:
X = df.drop(columns=["Id","SalePrice"])
Y = df[["SalePrice"]]

In [8]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [9]:
Y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


### Step 4 - Create a preprocessing pipeline for feature selection
Categorical - Ordinal Encoding

In [10]:
cat = list(X.columns[X.dtypes == "object"])
con = list(X.columns[X.dtypes != "object"])

In [11]:
cat

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [12]:
con

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

### Below pipeline is used for feature selection

In [13]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [14]:
num_pipe1 = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)

In [15]:
cat_pipe1 = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder()
)

In [16]:
pre1 = ColumnTransformer(
    [
        ("num", num_pipe1, con),
        ("cat", cat_pipe1, cat)
    ]
).set_output(transform="pandas")

In [17]:
X_pre = pre1.fit_transform(X)
X_pre.head()

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,cat__GarageType,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,0.073375,-0.229372,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.511418,0.575425,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
1,-0.872563,0.451936,-0.091886,-0.071836,2.179628,0.156734,-0.429577,-0.57441,1.171992,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
2,0.073375,-0.09311,0.07348,0.651479,-0.5172,0.984752,0.830215,0.32306,0.092907,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0
3,0.309859,-0.456474,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.57441,-0.499274,-0.288653,...,5.0,2.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,0.0
4,0.073375,0.633618,0.375148,1.374795,-0.5172,0.951632,0.733308,1.36457,0.463568,-0.288653,...,1.0,1.0,4.0,4.0,2.0,2.0,2.0,2.0,8.0,4.0


### Apply Feature Selection

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

In [19]:
base_model = LinearRegression()

sel = SequentialFeatureSelector(base_model, direction="forward", n_features_to_select = "auto")

sel.fit(X_pre, Y)

In [20]:
imp_cols = sel.get_feature_names_out()
imp_cols

array(['num__MSSubClass', 'num__LotArea', 'num__OverallQual',
       'num__OverallCond', 'num__YearBuilt', 'num__MasVnrArea',
       'num__BsmtFinSF1', 'num__GrLivArea', 'num__BsmtFullBath',
       'num__KitchenAbvGr', 'num__TotRmsAbvGrd', 'num__Fireplaces',
       'num__GarageCars', 'num__WoodDeckSF', 'num__OpenPorchSF',
       'num__ScreenPorch', 'num__PoolArea', 'num__YrSold', 'cat__Street',
       'cat__LandContour', 'cat__Utilities', 'cat__Neighborhood',
       'cat__BldgType', 'cat__HouseStyle', 'cat__RoofStyle',
       'cat__RoofMatl', 'cat__Exterior1st', 'cat__MasVnrType',
       'cat__ExterQual', 'cat__BsmtQual', 'cat__BsmtCond',
       'cat__BsmtExposure', 'cat__HeatingQC', 'cat__KitchenQual',
       'cat__Functional', 'cat__GarageCond', 'cat__PavedDrive',
       'cat__Fence', 'cat__MiscFeature'], dtype=object)

In [21]:
len(imp_cols)

39

In [22]:
sel_cols = [col.split("__")[1] for col in imp_cols]
sel_cols

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'MasVnrArea',
 'BsmtFinSF1',
 'GrLivArea',
 'BsmtFullBath',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'OpenPorchSF',
 'ScreenPorch',
 'PoolArea',
 'YrSold',
 'Street',
 'LandContour',
 'Utilities',
 'Neighborhood',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'MasVnrType',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'KitchenQual',
 'Functional',
 'GarageCond',
 'PavedDrive',
 'Fence',
 'MiscFeature']

In [23]:
X_sel = X[sel_cols]
X_sel

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,MasVnrArea,BsmtFinSF1,GrLivArea,BsmtFullBath,KitchenAbvGr,...,BsmtQual,BsmtCond,BsmtExposure,HeatingQC,KitchenQual,Functional,GarageCond,PavedDrive,Fence,MiscFeature
0,60,8450,7,5,2003,196.0,706,1710,1,1,...,Gd,TA,No,Ex,Gd,Typ,TA,Y,,
1,20,9600,6,8,1976,0.0,978,1262,0,1,...,Gd,TA,Gd,Ex,TA,Typ,TA,Y,,
2,60,11250,7,5,2001,162.0,486,1786,1,1,...,Gd,TA,Mn,Ex,Gd,Typ,TA,Y,,
3,70,9550,7,5,1915,0.0,216,1717,1,1,...,TA,Gd,No,Gd,Gd,Typ,TA,Y,,
4,60,14260,8,5,2000,350.0,655,2198,1,1,...,Gd,TA,Av,Ex,Gd,Typ,TA,Y,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,7917,6,5,1999,0.0,0,1647,0,1,...,Gd,TA,No,Ex,TA,Typ,TA,Y,,
1456,20,13175,6,6,1978,119.0,790,2073,1,1,...,Gd,TA,No,TA,TA,Min1,TA,Y,MnPrv,
1457,70,9042,7,9,1941,0.0,275,2340,0,1,...,TA,Gd,No,Ex,Gd,Typ,TA,Y,GdPrv,Shed
1458,20,9717,5,6,1950,0.0,49,1078,1,1,...,TA,TA,Mn,Gd,Gd,Typ,TA,Y,,


### Feature Selection is Done

### Create a final preprocessing pipeline on X_sel
Categorical - OneHotEncoding

In [24]:
cat_sel = list(X_sel.columns[X_sel.dtypes == "object"])
con_sel = list(X_sel.columns[X_sel.dtypes != "object"])

In [25]:
cat_sel

['Street',
 'LandContour',
 'Utilities',
 'Neighborhood',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'MasVnrType',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'KitchenQual',
 'Functional',
 'GarageCond',
 'PavedDrive',
 'Fence',
 'MiscFeature']

In [26]:
con_sel

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'MasVnrArea',
 'BsmtFinSF1',
 'GrLivArea',
 'BsmtFullBath',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'OpenPorchSF',
 'ScreenPorch',
 'PoolArea',
 'YrSold']

In [27]:
from sklearn.preprocessing import OneHotEncoder

In [28]:
num_pipe2 = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)

In [29]:
cat_pipe2 = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False)
)

In [30]:
pre2 = ColumnTransformer(
    [
        ("num", num_pipe2, con_sel),
        ("cat", cat_pipe2, cat_sel)
    ]
).set_output(transform="pandas")

In [31]:
x_sel_pre = pre2.fit_transform(X_sel)
x_sel_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtFinSF1,num__GrLivArea,num__BsmtFullBath,num__KitchenAbvGr,...,cat__PavedDrive_P,cat__PavedDrive_Y,cat__Fence_GdPrv,cat__Fence_GdWo,cat__Fence_MnPrv,cat__Fence_MnWw,cat__MiscFeature_Gar2,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC
0,0.073375,-0.207142,0.651479,-0.5172,1.050994,0.511418,0.575425,0.370333,1.10781,-0.211454,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,-0.872563,-0.091886,-0.071836,2.179628,0.156734,-0.57441,1.171992,-0.482512,-0.819964,-0.211454,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.073375,0.07348,0.651479,-0.5172,0.984752,0.32306,0.092907,0.515013,1.10781,-0.211454,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.309859,-0.096897,0.651479,-0.5172,-1.863632,-0.57441,-0.499274,0.383659,1.10781,-0.211454,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.073375,0.375148,1.374795,-0.5172,0.951632,1.36457,0.463568,1.299326,1.10781,-0.211454,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


### Step 5 - Apply Train test split

In [32]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(x_sel_pre, Y, test_size=0.2, random_state=21)

In [33]:
xtrain.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtFinSF1,num__GrLivArea,num__BsmtFullBath,num__KitchenAbvGr,...,cat__PavedDrive_P,cat__PavedDrive_Y,cat__Fence_GdPrv,cat__Fence_GdWo,cat__Fence_MnPrv,cat__Fence_MnWw,cat__MiscFeature_Gar2,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC
710,-0.636078,-0.640101,-2.241782,0.381743,-1.201217,-0.57441,-0.973018,-1.497169,-0.819964,-0.211454,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1098,-0.163109,-0.452686,-1.518467,0.381743,-1.168096,-0.57441,0.500854,-0.364484,-0.819964,-0.211454,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1286,-0.872563,-0.072844,-0.071836,-0.5172,-0.273836,1.924104,0.274948,-0.35687,1.10781,-0.211454,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
992,0.073375,-0.075851,-0.071836,2.179628,-0.240715,1.30917,0.20257,0.638751,-0.819964,-0.211454,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
631,1.492282,-0.593999,1.374795,-0.5172,1.150356,0.023903,-0.92038,0.073361,-0.819964,-0.211454,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [34]:
ytrain.head()

Unnamed: 0,SalePrice
710,52000
1098,128000
1286,143000
992,187000
631,209500


In [35]:
xtest.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__MasVnrArea,num__BsmtFinSF1,num__GrLivArea,num__BsmtFullBath,num__KitchenAbvGr,...,cat__PavedDrive_P,cat__PavedDrive_Y,cat__Fence_GdPrv,cat__Fence_GdWo,cat__Fence_MnPrv,cat__Fence_MnWw,cat__MiscFeature_Gar2,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC
880,-0.872563,-0.350058,-0.795151,-0.5172,1.117235,-0.57441,1.176379,-0.809944,1.10781,-0.211454,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
605,0.073375,0.309002,0.651479,0.381743,-0.207594,0.40062,0.022723,0.895747,-0.819964,-0.211454,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1166,-0.872563,-0.004192,1.374795,-0.5172,1.216598,-0.175535,-0.973018,0.339875,-0.819964,-0.211454,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
216,-0.872563,-0.207142,0.651479,-0.5172,1.084115,0.899214,1.101808,-0.151273,1.10781,-0.211454,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
970,-0.163109,0.02838,-1.518467,-1.416142,-0.737526,-0.57441,-0.973018,-0.615769,-0.819964,-0.211454,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [36]:
ytest.head()

Unnamed: 0,SalePrice
880,157000
605,205000
1166,245350
216,210000
970,135000


In [37]:
xtrain.shape

(1168, 145)

In [38]:
xtest.shape

(292, 145)

### Step 6 - Model Building
#### 1. Linear Regression

In [39]:
model1 = LinearRegression()
model1.fit(xtrain, ytrain)

In [40]:
model1.intercept_

array([36782.0424289])

In [41]:
model1.coef_

array([[-2.74787534e+03,  5.21667058e+03,  8.69866560e+03,
         6.33000306e+03,  1.39863531e+04,  3.72157538e+03,
         8.26648870e+03,  3.77434696e+04,  1.08022318e+03,
        -2.53103175e+03,  9.01057054e+02,  7.14536812e+02,
         4.10812435e+03,  1.33865753e+03,  1.97862879e+03,
         2.73135980e+03,  4.81275325e+03, -1.97553785e+02,
        -2.18693139e+04,  2.18693139e+04, -2.44963343e+03,
         5.80254289e+03, -4.48269015e+03,  1.12978070e+03,
         1.64632814e+04, -1.64632814e+04,  6.50615261e+03,
         2.77312083e+03,  6.76129958e+03,  1.65343027e+03,
        -1.00489722e+04, -3.81117142e+03,  1.27098062e+04,
        -1.11329240e+04, -7.97194056e+03, -6.11255361e+03,
        -8.53761237e+03, -1.54942970e+04, -1.30218451e+04,
         1.10793822e+04, -1.43507687e+04,  3.00862207e+04,
         2.04159266e+04, -7.46876976e+03, -1.06046920e+04,
        -8.04602690e+03, -4.90121709e+03,  7.95008858e+03,
         3.01610455e+04, -9.50073977e+03,  9.07057535e+0

In [42]:
model1.score(xtrain, ytrain)

0.9255777970832799

In [43]:
model1.score(xtest, ytest)

0.8011814786173144

#### 2. Ridge Regression

In [44]:
from sklearn.linear_model import Ridge

model2 = Ridge(alpha = 1)

model2.fit(xtrain, ytrain)


In [45]:
model2.score(xtrain, ytrain)

0.9124763278467363

In [46]:
model2.score(xtest, ytest)

0.8366285003140893

#### Hyperparameter Tuning

In [47]:
import numpy as np 

params = {
    "alpha" : np.arange(start=0.1, stop=100, step = 0.1)
}

params

{'alpha': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ,  1.1,
         1.2,  1.3,  1.4,  1.5,  1.6,  1.7,  1.8,  1.9,  2. ,  2.1,  2.2,
         2.3,  2.4,  2.5,  2.6,  2.7,  2.8,  2.9,  3. ,  3.1,  3.2,  3.3,
         3.4,  3.5,  3.6,  3.7,  3.8,  3.9,  4. ,  4.1,  4.2,  4.3,  4.4,
         4.5,  4.6,  4.7,  4.8,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.5,
         5.6,  5.7,  5.8,  5.9,  6. ,  6.1,  6.2,  6.3,  6.4,  6.5,  6.6,
         6.7,  6.8,  6.9,  7. ,  7.1,  7.2,  7.3,  7.4,  7.5,  7.6,  7.7,
         7.8,  7.9,  8. ,  8.1,  8.2,  8.3,  8.4,  8.5,  8.6,  8.7,  8.8,
         8.9,  9. ,  9.1,  9.2,  9.3,  9.4,  9.5,  9.6,  9.7,  9.8,  9.9,
        10. , 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7, 10.8, 10.9, 11. ,
        11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8, 11.9, 12. , 12.1,
        12.2, 12.3, 12.4, 12.5, 12.6, 12.7, 12.8, 12.9, 13. , 13.1, 13.2,
        13.3, 13.4, 13.5, 13.6, 13.7, 13.8, 13.9, 14. , 14.1, 14.2, 14.3,
        14.4, 14.5, 14.6, 14.

In [48]:
from sklearn.model_selection import GridSearchCV

base_ridge = Ridge()

gscv_ridge = GridSearchCV(base_ridge, param_grid=params, cv = 5, scoring="r2", n_jobs = -1)

gscv_ridge.fit(xtrain, ytrain)

In [49]:
gscv_ridge.best_params_

{'alpha': np.float64(21.1)}

In [50]:
gscv_ridge.best_score_

np.float64(0.8462959370086441)

In [51]:
best_ridge = gscv_ridge.best_estimator_

best_ridge

In [52]:
best_ridge.score(xtrain, ytrain)

0.8835571237174773

In [53]:
best_ridge.score(xtest, ytest)

0.830584096427463

#### 3. Lasso Regression

In [54]:
from sklearn.linear_model import Lasso

model3 = Lasso(alpha=1)

model3.fit(xtrain, ytrain)

In [55]:
model3.score(xtrain, ytrain)

0.9255662573162222

In [56]:
model3.score(xtest, ytest)

0.8056977677576639

#### Hyperparameter tuning for lasso model

In [57]:
base_lasso = Lasso()

gscv_lasso = GridSearchCV(base_lasso, param_grid = params, cv=5,scoring = "r2",n_jobs=-1)

gscv_lasso.fit(xtrain, ytrain)

In [58]:
gscv_lasso.best_params_

{'alpha': np.float64(99.9)}

In [59]:
gscv_lasso.best_score_

np.float64(0.8543782476496858)

In [60]:
best_lasso = gscv_lasso.best_estimator_

In [61]:
best_lasso

In [62]:
best_lasso.score(xtrain, ytrain)

0.9158752668385003

In [63]:
best_lasso.score(xtest, ytest)

0.8224697279016915

### Step 7 - Model evaluation

In [64]:
from sklearn.metrics import(
    root_mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    r2_score
)

In [65]:
def evaluate_model(model, x, y):
    ypred = model.predict(x)
    rmse = root_mean_squared_error(y, ypred)
    mae = mean_absolute_error(y, ypred)
    mape = mean_absolute_percentage_error(y, ypred)
    r2 = r2_score(y, ypred)
    print(f"RMSE : {rmse:.2f}")
    print(f"MAE : {mae:.2f}")
    print(f"MAPE : {mape:.2%}")
    print(f"R2 Score : {r2:.2%}")

In [66]:
print("Linear Regression Train Results")
evaluate_model(model1, xtrain, ytrain)

Linear Regression Train Results
RMSE : 21461.44
MAE : 14608.24
MAPE : 8.79%
R2 Score : 92.56%


In [67]:
print("Linear Regression Test Results")
evaluate_model(model1, xtest, ytest)

Linear Regression Test Results
RMSE : 36707.70
MAE : 18386.15
MAPE : 10.72%
R2 Score : 80.12%


In [68]:
print("Ridge Train Results")
evaluate_model(best_ridge, xtrain, ytrain)

Ridge Train Results
RMSE : 26845.04
MAE : 16110.13
MAPE : 9.55%
R2 Score : 88.36%


In [69]:
print("Ridge Test Results")
evaluate_model(best_ridge, xtest, ytest)

Ridge Test Results
RMSE : 33884.87
MAE : 17665.46
MAPE : 10.12%
R2 Score : 83.06%


In [70]:
print("Lasso Train Results")
evaluate_model(best_lasso, xtrain, ytrain)

Lasso Train Results
RMSE : 22817.57
MAE : 15384.13
MAPE : 9.20%
R2 Score : 91.59%


In [71]:
print("Lasso Test Results")
evaluate_model(best_lasso, xtest, ytest)

Lasso Test Results
RMSE : 34686.86
MAE : 17493.73
MAPE : 10.08%
R2 Score : 82.25%


### From Above Results, use The Ridge model as it has highest R2 score of 83.06