In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

In [2]:
df = sns.load_dataset('mpg')

In [3]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [4]:
df.tail()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
393,27.0,4,140.0,86.0,2790,15.6,82,usa,ford mustang gl
394,44.0,4,97.0,52.0,2130,24.6,82,europe,vw pickup
395,32.0,4,135.0,84.0,2295,11.6,82,usa,dodge rampage
396,28.0,4,120.0,79.0,2625,18.6,82,usa,ford ranger
397,31.0,4,119.0,82.0,2720,19.4,82,usa,chevy s-10


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [6]:
df.shape

(398, 9)

In [7]:
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,398.0,398.0,398.0,392.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,104.469388,2970.424623,15.56809,76.01005
std,7.815984,1.701004,104.269838,38.49116,846.841774,2.757689,3.697627
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.5,4.0,104.25,75.0,2223.75,13.825,73.0
50%,23.0,4.0,148.5,93.5,2803.5,15.5,76.0
75%,29.0,8.0,262.0,126.0,3608.0,17.175,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [8]:
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64

In [9]:
value = df['horsepower'].median() # since we haven't did outlier treatement

In [10]:
df['horsepower'].fillna(value , inplace = True)

In [11]:
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
name            0
dtype: int64

In [12]:
df['origin'].value_counts()

origin
usa       249
japan      79
europe     70
Name: count, dtype: int64

In [13]:
df['origin'].replace(['usa','japan','europe'] , ['1','2','3'], inplace = True)

In [14]:
df['origin'].value_counts()

origin
1    249
2     79
3     70
Name: count, dtype: int64

In [15]:
df['origin'] = df['origin'].astype(int)

In [16]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin            int32
name             object
dtype: object

In [17]:
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'name'],
      dtype='object')

In [18]:
# as it is identity type of column it will not help in learning the pattern
df.drop('name', axis = 1 , inplace = True)

In [19]:
df.shape

(398, 8)

In [20]:
x = df.drop('mpg' , axis = 1)
y = df['mpg']

In [21]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size = 0.2)

In [22]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model

In [23]:
model.fit(x_train,y_train)

In [24]:
model.coef_

array([-0.36138146,  0.02174714, -0.00367576, -0.00735229,  0.13925132,
        0.83093737,  1.447414  ])

In [25]:
for i , col in enumerate(x_train.columns):
    print(f"for {col} the coefficient value is {model.coef_[i]:.5f}")

for cylinders the coefficient value is -0.36138
for displacement the coefficient value is 0.02175
for horsepower the coefficient value is -0.00368
for weight the coefficient value is -0.00735
for acceleration the coefficient value is 0.13925
for model_year the coefficient value is 0.83094
for origin the coefficient value is 1.44741


In [26]:
# obervation is -> coeff are very small .. if any one coef changes there will be almost no change this is called as smoother model

In [27]:
print(f"the train accuracy of our model is : {model.score(x_train,y_train)*100:.2f}%")

the train accuracy of our model is : 82.20%


In [28]:
print(f"the train accuracy of our model is : {model.score(x_test,y_test)*100:.2f}%")
y_pred = model.predict(x_test)

the train accuracy of our model is : 81.07%


In [29]:
from sklearn.metrics import r2_score
print(f"r2 for our model is {r2_score(y_test,y_pred)*100:.2f}")

r2 for our model is 81.07


In [30]:
 # implementing in regularized models
# 1-> ridge
from sklearn.linear_model import Ridge
# we learnt about something called as lemda here it is named as alpha
ridge_model = Ridge(alpha=0.5)
ridge_model

In [31]:
ridge_model.fit(x_train,y_train)

In [32]:
for i , col in enumerate(x_train.columns):
    print(f"for {col} the coefficient value if ridge model is {ridge_model.coef_[i]:.5f}")

for cylinders the coefficient value if ridge model is -0.35884
for displacement the coefficient value if ridge model is 0.02165
for horsepower the coefficient value if ridge model is -0.00362
for weight the coefficient value if ridge model is -0.00735
for acceleration the coefficient value if ridge model is 0.13924
for model_year the coefficient value if ridge model is 0.83062
for origin the coefficient value if ridge model is 1.44069


In [33]:
y_pred_ridge = ridge_model.predict(x_test)
print(f"r2 for our ridge_model is {r2_score(y_test,y_pred_ridge)*100:.2f}")

r2 for our ridge_model is 81.08


In [86]:
from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha = 0.5)
lasso_model

In [88]:
lasso_model.fit(x_train,y_train)

In [109]:
for i , col in enumerate(x_train.columns):
    print(f"for {col} the coefficient value if lasso model is {lasso_model.coef_[i]:.5f}")

for cylinders the coefficient value if lasso model is -0.00000
for displacement the coefficient value if lasso model is 0.00137
for horsepower the coefficient value if lasso model is -0.00297
for weight the coefficient value if lasso model is -0.00668
for acceleration the coefficient value if lasso model is 0.00000
for model_year the coefficient value if lasso model is 0.73905
for origin the coefficient value if lasso model is 0.00000


In [111]:
#  the 3 least important features are removed by lasso 

In [113]:
y_pred_lasso = ridge_model.predict(x_test)
print(f"r2 for our lasso_model is {r2_score(y_test,y_pred_lasso)*100:.2f}")

r2 for our lasso_model is 81.08


In [115]:
from sklearn.linear_model import ElasticNet
elastic_model = ElasticNet(alpha = 1 , l1_ratio = 0.5)
elastic_model

In [117]:
elastic_model.fit(x_train ,y_train)

In [131]:
y_pred_elastic_net = elastic_model.predict(x_test)
print(f"r2 for our elastic_net_model is {r2_score(y_test,y_pred_elastic_net)*100:.2f}")

r2 for our elastic_net_model is 80.50


In [133]:
for i , col in enumerate(x_train.columns):
    print(f"for {col} the coefficient value if elastic net model is {elastic_model.coef_[i]:.5f}")


for cylinders the coefficient value if elastic net model is -0.00000
for displacement the coefficient value if elastic net model is 0.00106
for horsepower the coefficient value if elastic net model is -0.00466
for weight the coefficient value if elastic net model is -0.00663
for acceleration the coefficient value if elastic net model is 0.00000
for model_year the coefficient value if elastic net model is 0.70494
for origin the coefficient value if elastic net model is 0.00000


In [155]:
#  regularization with lasso
from sklearn.linear_model import LassoCV
lasso_model_CV = LassoCV(cv =  5, verbose=2)
lasso_model_CV.fit(x_train,y_train)

Path: 000 out of 100
Path: 001 out of 100
Path: 002 out of 100
Path: 003 out of 100
Path: 004 out of 100
Path: 005 out of 100
Path: 006 out of 100
Path: 007 out of 100
Path: 008 out of 100
Path: 009 out of 100
Path: 010 out of 100
Path: 011 out of 100
Path: 012 out of 100
Path: 013 out of 100
Path: 014 out of 100
Path: 015 out of 100
Path: 016 out of 100
Path: 017 out of 100
Path: 018 out of 100
Path: 019 out of 100
Path: 020 out of 100
Path: 021 out of 100
Path: 022 out of 100
Path: 023 out of 100
Path: 024 out of 100
Path: 025 out of 100
Path: 026 out of 100
Path: 027 out of 100
Path: 028 out of 100
Path: 029 out of 100
Path: 030 out of 100
Path: 031 out of 100
Path: 032 out of 100
Path: 033 out of 100
Path: 034 out of 100
Path: 035 out of 100
Path: 036 out of 100
Path: 037 out of 100
Path: 038 out of 100
Path: 039 out of 100
Path: 040 out of 100
Path: 041 out of 100
Path: 042 out of 100
Path: 043 out of 100
Path: 044 out of 100
Path: 045 out of 100
Path: 046 out of 100
Path: 047 out

In [141]:
for i , col in enumerate(x_train.columns):
    print(f"for {col} the coefficient value if elastic net model {lasso_model_CV.coef_[i]:.5f}")
    

for cylinders the coefficient value if elastic net model -0.00000
for displacement the coefficient value if elastic net model -0.00261
for horsepower the coefficient value if elastic net model -0.01362
for weight the coefficient value if elastic net model -0.00645
for acceleration the coefficient value if elastic net model 0.00000
for model_year the coefficient value if elastic net model 0.26296
for origin the coefficient value if elastic net model 0.00000
r2 for our lasso_model_CV is 80.50


In [143]:
y_pred_lasso_cv = lasso_model_CV.predict(x_test)
print(f"r2 for our lasso_model_CV is {r2_score(y_test,y_pred_lasso_cv)*100:.2f}")

r2 for our lasso_model_CV is 76.20


In [147]:
#  regularization with lasso
from sklearn.linear_model import RidgeCV
ridge_model_CV = RidgeCV(cv =  5)
ridge_model_CV.fit(x_train,y_train)

for i , col in enumerate(x_train.columns):
    print(f"for {col} the coefficient value if elastic net model {ridge_model_CV.coef_[i]:.5f}")
    

for cylinders the coefficient value if elastic net model -0.31632
for displacement the coefficient value if elastic net model 0.02001
for horsepower the coefficient value if elastic net model -0.00272
for weight the coefficient value if elastic net model -0.00734
for acceleration the coefficient value if elastic net model 0.13880
for model_year the coefficient value if elastic net model 0.82499
for origin the coefficient value if elastic net model 1.32377
r2 for our ridge_model_CV is 81.11


In [149]:
y_pred_ridge_cv = ridge_model_CV.predict(x_test)
print(f"r2 for our ridge_model_CV is {r2_score(y_test,y_pred_ridge_cv)*100:.2f}")

r2 for our ridge_model_CV is 81.11


In [157]:
#  regularization with lasso
from sklearn.linear_model import ElasticNetCV
elastic_net_model_CV = ElasticNetCV(cv =  5)
elastic_net_model_CV.fit(x_train,y_train)

for i , col in enumerate(x_train.columns):
    print(f"for {col} the coefficient value if elastic net model {elastic_net_model_CV.coef_[i]:.5f}")
    

for cylinders the coefficient value if elastic net model -0.00000
for displacement the coefficient value if elastic net model -0.00353
for horsepower the coefficient value if elastic net model -0.01798
for weight the coefficient value if elastic net model -0.00630
for acceleration the coefficient value if elastic net model 0.00000
for model_year the coefficient value if elastic net model 0.16944
for origin the coefficient value if elastic net model 0.00000


In [159]:
y_pred_elastic_cv = ridge_model_CV.predict(x_test)
print(f"r2 for our ridge_model_CV is {r2_score(y_test,y_pred_elastic_cv)*100:.2f}")

r2 for our ridge_model_CV is 81.11
