# Regularization

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings 
warnings.filterwarnings("ignore")

In [2]:
# Load the raw car data set from cars.csv (path is relative to the working directory).
cars = pd.read_csv("cars.csv")

In [3]:
# Preview the first rows of the raw data; missing values show up as the string "?".
cars.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [4]:
def _to_float_mean_imputed(series):
    """Return `series` as float with "?" placeholders replaced by the column mean.

    The "?" strings are first turned into NaN so the column can be cast to
    float; the NaNs are then filled with the mean of the remaining values.
    A new Series is returned; the input is not modified.
    """
    numeric = series.replace("?", np.nan).astype("float")
    return numeric.fillna(numeric.mean())


# Assign the cleaned columns back to the frame. Calling replace/fillna with
# inplace=True on `cars[col]` operates on a column selection; with pandas
# Copy-on-Write that selection is a temporary object and `cars` stays unchanged.
cars["normalized-losses"] = _to_float_mean_imputed(cars["normalized-losses"])
cars["horsepower"] = _to_float_mean_imputed(cars["horsepower"])

cars.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27,13495
1,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27,16500
2,1,122.0,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154.0,19,26,16500
3,2,164.0,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102.0,24,30,13950
4,2,164.0,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115.0,18,22,17450


In [5]:
# Split the columns by dtype: text (object) columns vs. integer/float columns.
# .copy() makes each subset an independent frame, so the column assignments
# done on cars_cat later in the notebook neither warn about nor touch `cars`.
cars_cat = cars.select_dtypes("object").copy()

cars_num = cars.select_dtypes(["int64","float64"]).copy()

In [6]:
# Preview the numeric subset.
cars_num.head()

Unnamed: 0,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,122.0,64.1,48.8,130,111.0,21,27,13495
1,3,122.0,64.1,48.8,130,111.0,21,27,16500
2,1,122.0,65.5,52.4,152,154.0,19,26,16500
3,2,164.0,66.2,54.3,109,102.0,24,30,13950
4,2,164.0,66.4,54.3,136,115.0,18,22,17450


In [8]:
# Preview the categorical subset before encoding.
cars_cat.head()

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,alfa-romero,gas,convertible,rwd,front,dohc
1,alfa-romero,gas,convertible,rwd,front,dohc
2,alfa-romero,gas,hatchback,rwd,front,ohcv
3,audi,gas,sedan,fwd,front,ohc
4,audi,gas,sedan,4wd,front,ohc


In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
# Create a label encoder instance (the encoding loop in the next cell rebinds `le` per column).
le = LabelEncoder()

In [11]:
# Replace each categorical column with integer codes, using a fresh encoder per column.
for column_name in cars_cat.columns:
    le = LabelEncoder()
    le.fit(cars_cat[column_name])
    cars_cat[column_name] = le.transform(cars_cat[column_name])

In [12]:
# Preview the categorical subset after encoding: every column now holds integer codes.
cars_cat.head()

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,0,1,0,2,0,0
1,0,1,0,2,0,0
2,0,1,2,2,0,5
3,1,1,3,1,0,3
4,1,1,3,0,0,3


In [13]:
# Put the numeric columns and the encoded categorical columns side by side.
df_new = pd.concat(objs=[cars_num, cars_cat], axis="columns")

In [14]:
# Preview the combined modelling frame (numeric columns followed by encoded categoricals).
df_new.head()

Unnamed: 0,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg,price,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,3,122.0,64.1,48.8,130,111.0,21,27,13495,0,1,0,2,0,0
1,3,122.0,64.1,48.8,130,111.0,21,27,16500,0,1,0,2,0,0
2,1,122.0,65.5,52.4,152,154.0,19,26,16500,0,1,2,2,0,5
3,2,164.0,66.2,54.3,109,102.0,24,30,13950,1,1,3,1,0,3
4,2,164.0,66.4,54.3,136,115.0,18,22,17450,1,1,3,0,0,3


# Baseline Model

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [18]:
# The target is the price column; every other column is a predictor.
y = df_new["price"]
x = df_new.drop(columns=["price"])

In [19]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [20]:
# Baseline: ordinary least squares with no regularization, fitted on the training split.
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [23]:
# R^2 of the baseline model on the training split
lr.score(x_train,y_train)              # R^2 score (coefficient of determination)

0.8504573774895474

In [24]:
# R^2 of the baseline model on the held-out test split
lr.score(x_test,y_test)

0.796556678039738

In [26]:
## The model scores higher on the training data (0.85) than on the test data (0.80), which points to mild overfitting.

# Ridge and Lasso Regularization

In [32]:
from sklearn.linear_model import Lasso  # lambda * sum(abs(coef))
from sklearn.linear_model import Ridge  # lambda * sum(square(coef))

In [33]:
# Coefficients of the unregularized baseline, for comparison with the Ridge and Lasso fits.
lr.coef_

array([ 4.51384957e+01,  1.53127607e+00,  7.89452171e+02,  3.62663990e+02,
        9.83682875e+01, -1.08169245e+01,  3.08017854e+02, -4.17024371e+02,
       -2.00099087e+02, -6.22650015e+02, -1.70235175e+02,  1.86860719e+03,
        1.64133620e+04,  2.83174279e+02])

In [35]:
# Ridge regression with the regularization strength (lambda, a hyperparameter) set to 10.
l2 = Ridge(alpha=10)

In [36]:
# Fit the Ridge model on the training split.
l2.fit(x_train,y_train)

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

In [37]:
# Ridge coefficients, to compare against the baseline coefficients shown earlier.
l2.coef_

array([ 2.08658930e+02, -5.60173023e-01,  3.64420144e+02,  5.72916414e+02,
        1.04441215e+02,  2.21332730e+01,  2.11271281e+02, -2.72864381e+02,
       -1.86340249e+02, -9.06610516e+02, -6.30655861e+02,  1.56860422e+03,
        2.57047785e+03,  5.15948757e+02])

In [39]:
# Lasso regression with a large regularization strength (lambda = 1000), fitted on the training split.
l1 = Lasso(alpha=1000)
l1.fit(X=x_train, y=y_train)

Lasso(alpha=1000, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [41]:
l1.coef_
# Feature selection: coefficients that Lasso set to exactly zero mark features it has dropped.

array([   0.        ,    3.96310671,    0.        ,  276.93279232,
        135.34169218,   16.00129073,   -0.        , -113.03311963,
       -126.40695244,   -0.        ,   -0.        ,    0.        ,
          0.        ,    0.        ])

In [43]:
# How do we choose the right value of lambda (the regularization hyperparameter)?

In [44]:
# Sweep lambda over 0..49 and print the test-split R^2 for each value, in order.
# NOTE(review): lambda is compared on the test split, so the chosen value is tuned
# to that split; a separate validation split or cross-validation would avoid this.
for alpha in range(50):
    l2 = Ridge(alpha).fit(x_train, y_train)
    print(l2.score(x_test, y_test))

0.7965566780397382
0.8074518758147277
0.8110292248150512
0.8126933383890032
0.8136148645029302
0.8141745853539419
0.8145301242133359
0.8147582608502814
0.8149010602831952
0.814983694925305
0.8150222867376524
0.8150277245431793
0.8150076788279419
0.8149677381788262
0.8149120868051175
0.8148439278252517
0.814765758494739
0.8146795554128128
0.8145869029046833
0.8144890843369584
0.8143871485445551
0.8142819591129825
0.8141742315788492
0.8140645619421536
0.8139534488166059
0.8138413108452037
0.8137285005403768
0.8136153153884843
0.8135020068362637
0.8133887876197213
0.8132758377831448
0.813163309653795
0.8130513319772293
0.8129400133729978
0.8128294452363019
0.8127197041851854
0.8126108541327796
0.812502948048576
0.8123960294605083
0.8122901337400303
0.8121852892047277
0.8120815180669056
0.8119788372516914
0.8118772591041921
0.8117767920020494
0.8116774408870528
0.811579207727327
0.8114820919198034
0.8113860906412016
0.8112911991545092


In [45]:
# Sweep lambda over 200, 250, ..., 450 and print the test-split R^2 for each value, in order.
# NOTE(review): lambda is compared on the test split, so the chosen value is tuned
# to that split; a separate validation split or cross-validation would avoid this.
for alpha in range(200, 500, 50):
    l1 = Lasso(alpha).fit(x_train, y_train)
    print(l1.score(x_test, y_test))

0.813920135802378
0.812421909078023
0.8085057299003378
0.8036053753129062
0.7977229768452245
0.7950465607641612


## Final Models

<p>Ridge with lambda 2 score - 0.81 </p>
<p>Lasso with lambda 200 score - 0.81</p>

# Cross Validation

In [47]:
# Cross-validation splits the data into k folds; the model is trained on k-1 folds and scored on the remaining fold, once per fold.
# Averaging the k scores gives a more reliable picture of which model performs best on the data as a whole.

In [50]:
from sklearn.model_selection import cross_val_score

In [51]:
# Final candidates chosen from the sweeps: Ridge with lambda = 2 and Lasso with lambda = 200.
# NOTE(review): the names are the reverse of earlier cells -- here `l1` holds the
# Ridge model and `l2` holds the Lasso model.
l1, l2 = Ridge(alpha=2), Lasso(alpha=200)

In [52]:
# 4-fold cross-validation scores for the model bound to `l1` (the Ridge model in this section).
l1_cross = cross_val_score(estimator=l1, X=x, y=y, cv=4)

In [53]:
# Per-fold scores for the `l1` model.
l1_cross

array([0.71176474, 0.86474228, 0.37640664, 0.47020196])

In [54]:
# Average score across the four folds for the `l1` model.
l1_cross.mean()

0.6057789059244738

In [55]:
# 4-fold cross-validation scores for the model bound to `l2` (the Lasso model in this section).
l2_cross = cross_val_score(estimator=l2, X=x, y=y, cv=4)

In [56]:
# Per-fold scores for the `l2` model.
l2_cross

array([0.76560829, 0.81872367, 0.43344753, 0.448364  ])

In [57]:
# Average score across the four folds for the `l2` model.
l2_cross.mean()

0.6165358705430694