The "mpg" dataset

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load Dataset
data = sns.load_dataset("mpg")

In [3]:
# target features is mpg no need name features
data.head(2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320


In [4]:
data.drop(columns=["name"],inplace=True)
data.head(2)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [6]:
# check Null Value
data.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [7]:
# Since we have not done outlier treatment, then better idea would be to replace the missing value with median
data["horsepower"].median()

93.5

In [8]:
data["horsepower"].fillna(data["horsepower"].median(),inplace=True)

In [9]:
data.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [10]:
data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin           object
dtype: object

In [11]:
data["origin"].value_counts()

usa       249
japan      79
europe     70
Name: origin, dtype: int64

In [12]:
# There is Object Data Feactures
data["origin"] = data["origin"].map({"usa":1,"japan":2,"europe":3})

In [13]:
data.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin            int64
dtype: object

In [14]:
# Separate into X and Y
X = data.drop(columns=["mpg"])
y = data["mpg"]

In [15]:
X.head(2)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,8,307.0,130.0,3504,12.0,70,1
1,8,350.0,165.0,3693,11.5,70,1


In [16]:
y.head()

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64

In [17]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [18]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(318, 7)
(80, 7)
(318,)
(80,)


In [19]:
# Simple Linear Regression Model
from sklearn.linear_model import LinearRegression

regression_model = LinearRegression()
regression_model

LinearRegression()

In [20]:
# Train the model
regression_model.fit(X_train,y_train)

LinearRegression()

In [21]:
regression_model.coef_

array([-0.2165378 ,  0.01987994, -0.01270482, -0.0071647 ,  0.09047637,
        0.84604207,  1.42800007])

In [22]:
# enumerate() combines both the index and the element into one iterator.
for i, col in enumerate(X_train.columns):
    print(f"{col} : {regression_model.coef_[i]}")


cylinders : -0.21653780216832877
displacement : 0.019879936208875095
horsepower : -0.012704816241841353
weight : -0.007164698157663284
acceleration : 0.09047636715999383
model_year : 0.8460420672591435
origin : 1.4280000651600915


In [23]:
# Coefficient are relatively smaller, if one independent variable changes
# There will be not much differences inprediction
# This is sometimes called as smoother model

# These features might not be contributiing in model traing

In [24]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

y_pred_linear = regression_model.predict(X_test)

print(f"R2 Score : {r2_score(y_test,y_pred_linear)}")
print(f"MAE : {mean_absolute_error(y_test,y_pred_linear)}")
print(f"MSE : {mean_squared_error(y_test,y_pred_linear)}")

R2 Score : 0.8563012451456409
MAE : 2.2040552125486697
MSE : 8.0961492850645


#### **Ridge Regression**

In [25]:
from sklearn.linear_model import Ridge

ridge_model = Ridge(alpha=0.1)
ridge_model

Ridge(alpha=0.1)

In [26]:
ridge_model.fit(X_train,y_train)

Ridge(alpha=0.1)

In [27]:
# enumerate() combines both the index and the element into one iterator.
for i, col in enumerate(X_train.columns):
    print(f"{col} : {ridge_model.coef_[i]}")


cylinders : -0.21615305026026044
displacement : 0.01986221581111451
horsepower : -0.012697324116592825
weight : -0.007164428584491189
acceleration : 0.09045740164184239
model_year : 0.845974707406231
origin : 1.426669100478145


In [28]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

y_pred_linear = ridge_model.predict(X_test)

print(f"R2 Score : {r2_score(y_test,y_pred_linear)}")
print(f"MAE : {mean_absolute_error(y_test,y_pred_linear)}")
print(f"MSE : {mean_squared_error(y_test,y_pred_linear)}")

R2 Score : 0.8563026607885914
MAE : 2.2039603875258953
MSE : 8.096069526150282


In [29]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

y_pred_linear = regression_model.predict(X_test)

print(f"R2 Score : {r2_score(y_test,y_pred_linear)}")
print(f"MAE : {mean_absolute_error(y_test,y_pred_linear)}")
print(f"MSE : {mean_squared_error(y_test,y_pred_linear)}")

R2 Score : 0.8563012451456409
MAE : 2.2040552125486697
MSE : 8.0961492850645


In [30]:
# We don't see much variation in coeff of ridge regression as compared to linear regression

#### **Lasso Regression**

In [31]:
from sklearn.linear_model import Lasso

lasso_model = Lasso(alpha=0.5)
lasso_model

Lasso(alpha=0.5)

In [32]:
lasso_model.fit(X_train,y_train)

Lasso(alpha=0.5)

In [33]:
# enumerate() combines both the index and the element into one iterator.
for i, col in enumerate(X_train.columns):
    print(f"{col} : {lasso_model.coef_[i]}")


# Three features coefficients is 0, lasso helps in feature selection

cylinders : -0.0
displacement : 0.0019772716551741043
horsepower : -0.009291692111718762
weight : -0.006589928565464688
acceleration : 0.0
model_year : 0.7532080062488491
origin : 0.0


In [34]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

y_pred_linear = lasso_model.predict(X_test)

print(f"R2 Score : {r2_score(y_test,y_pred_linear)}")
print(f"MAE : {mean_absolute_error(y_test,y_pred_linear)}")
print(f"MSE : {mean_squared_error(y_test,y_pred_linear)}")

R2 Score : 0.8457765614927952
MAE : 2.2267896108555623
MSE : 8.689121785890116


In [35]:
# enumerate() combines both the index and the element into one iterator.
for i, col in enumerate(X_train.columns):
    print(f"{col} : {ridge_model.coef_[i]}")


cylinders : -0.21615305026026044
displacement : 0.01986221581111451
horsepower : -0.012697324116592825
weight : -0.007164428584491189
acceleration : 0.09045740164184239
model_year : 0.845974707406231
origin : 1.426669100478145


##### **Elastic Net Regression**

In [43]:
from sklearn.linear_model import ElasticNet

elastic_model = ElasticNet(alpha=1,l1_ratio=0.5)
elastic_model

ElasticNet(alpha=1)

In [48]:
elastic_model.fit(X_train,y_train)

ElasticNet(alpha=1)

In [47]:
# enumerate() combines both the index and the element into one iterator.
for i, col in enumerate(X_train.columns):
    print(f"{col} : {elastic_model.coef_[i]}")


cylinders : -0.0
displacement : 0.0015384959190967644
horsepower : -0.01064195700207186
weight : -0.006532608594610211
acceleration : 0.0
model_year : 0.7203541472156387
origin : 0.0


In [45]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

y_pred_linear = elastic_model.predict(X_test)

print(f"R2 Score : {r2_score(y_test,y_pred_linear)}")
print(f"MAE : {mean_absolute_error(y_test,y_pred_linear)}")
print(f"MSE : {mean_squared_error(y_test,y_pred_linear)}")

R2 Score : 0.8477301010951965
MAE : 2.2011646588506513
MSE : 8.579057170011176


##### **Regularisation with Crossvalidation**

In [59]:
from sklearn.linear_model import LassoCV
# LassoCV, ElasticNetCV, RidgeCV It is way to building model here each regression technique internaly take care of cross validataion

Lass_CV_model = LassoCV(cv = 5, verbose=1.4)
Lass_CV_model

LassoCV(cv=5, verbose=1.4)

In [60]:
Lass_CV_model.fit(X_train,y_train)

Path: 000 out of 100
Path: 001 out of 100
Path: 002 out of 100
Path: 003 out of 100
Path: 004 out of 100
Path: 005 out of 100
Path: 006 out of 100
Path: 007 out of 100
Path: 008 out of 100
Path: 009 out of 100
Path: 010 out of 100
Path: 011 out of 100
Path: 012 out of 100
Path: 013 out of 100
Path: 014 out of 100
Path: 015 out of 100
Path: 016 out of 100
Path: 017 out of 100
Path: 018 out of 100
Path: 019 out of 100
Path: 020 out of 100
Path: 021 out of 100
Path: 022 out of 100
Path: 023 out of 100
Path: 024 out of 100
Path: 025 out of 100
Path: 026 out of 100
Path: 027 out of 100
Path: 028 out of 100
Path: 029 out of 100
Path: 030 out of 100
Path: 031 out of 100
Path: 032 out of 100
Path: 033 out of 100
Path: 034 out of 100
Path: 035 out of 100
Path: 036 out of 100
Path: 037 out of 100
Path: 038 out of 100
Path: 039 out of 100
Path: 040 out of 100
Path: 041 out of 100
Path: 042 out of 100
Path: 043 out of 100
Path: 044 out of 100
Path: 045 out of 100
Path: 046 out of 100
Path: 047 out

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


LassoCV(cv=5, verbose=1.4)

In [62]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

y_pred_linear = Lass_CV_model.predict(X_test)

print(f"R2 Score LassoCV : {r2_score(y_test,y_pred_linear)}")
print(f"MAE : {mean_absolute_error(y_test,y_pred_linear)}")
print(f"MSE : {mean_squared_error(y_test,y_pred_linear)}")

R2 Score LassoCV : 0.8410032034906063
MAE : 2.184388010512513
MSE : 8.95805813830282


##### **RidgeCV**

In [63]:
from sklearn.linear_model import RidgeCV

Ridge_CV_model = RidgeCV(cv=5)
Ridge_CV_model

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=5)

In [64]:
Ridge_CV_model.fit(X_train,y_train) 

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=5)

In [65]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

y_pred_linear = Ridge_CV_model.predict(X_test)

print(f"R2 Score Ridge CV : {r2_score(y_test,y_pred_linear)}")
print(f"MAE : {mean_absolute_error(y_test,y_pred_linear)}")
print(f"MSE : {mean_squared_error(y_test,y_pred_linear)}")

R2 Score Ridge CV : 0.8563494248817896
MAE : 2.1951288255207437
MSE : 8.09343478460296


##### **ElasticNetCV**

In [68]:
from sklearn.linear_model import ElasticNetCV

ElasticNet_CV_model = ElasticNetCV(cv=5)
ElasticNet_CV_model

ElasticNetCV(cv=5)

In [69]:
ElasticNet_CV_model.fit(X_train,y_train)

ElasticNetCV(cv=5)

In [70]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

y_pred_linear = ElasticNet_CV_model.predict(X_test)

print(f"R2 Score Elastic Net CV : {r2_score(y_test,y_pred_linear)}")
print(f"MAE : {mean_absolute_error(y_test,y_pred_linear)}")
print(f"MSE : {mean_squared_error(y_test,y_pred_linear)}")

R2 Score Elastic Net CV : 0.8269714661183813
MAE : 2.2939311790844146
MSE : 9.748621985633926


When you set `verbose=1.4` in `LassoCV`, it provides detailed information about the training process. Each cross-validation fold's progress is tracked, displaying how well the model performs with each set of data.

This level of detail can be especially useful if you want to see:
- Which fold of the cross-validation process is currently being evaluated.
- How the model's performance metrics change over time.

In `LassoCV`, this can help you understand if there's any inconsistency or irregularity in training, providing a transparent view of your model's training behavior.

Here's a snippet of how you might see the verbose output:

```python
LassoCV(cv=5, verbose=1.4)
```

This will output:
- Progress bar or detailed epoch information based on the library's settings.
- Regular updates during each cross-validation fold.

Verbose levels:
- `0`: Silent mode, no output.
- `1`: Progress bar for cross-validation steps.
- Values > `1`: More detailed logging.

If you have more specific questions about the output or need further assistance with your model training process, feel free to ask!