In [None]:
# Copyright 2023 The ML Notebooks Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Ridge Regression


This notebook uses the classic Auto MPG dataset and demonstrates how to build a linear regression model with L2 regularization (i.e., ridge regression) to predict the fuel efficiency of late-1970s and early-1980s automobiles.


## Setup


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import StandardScaler

# Reaching this line means every import above succeeded.
print("Finished importing...")

## Data Collection


### The Auto MPG dataset

The dataset is available from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/).


### Get the data

Download and import the dataset using pandas.


In [None]:
# Location of the raw Auto MPG data file. HTTPS is used so that the download
# cannot be silently altered in transit.
# pylint: disable-next=line-too-long
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

# Column labels are supplied explicitly via `names=`, so pandas does not try
# to read a header row from the file.
column_names = [
    "MPG",
    "Cylinders",
    "Displacement",
    "Horsepower",
    "Weight",
    "Acceleration",
    "Model Year",
    "Origin",
]

df = pd.read_csv(
    path,
    names=column_names,
    # "?" entries are parsed as NaN.
    na_values="?",
    # Everything from the first tab to the end of a line is discarded
    # (presumably the trailing free-text field of each record -- TODO confirm).
    comment="\t",
    # Fields are separated by spaces; skipinitialspace collapses the runs of
    # spaces used for alignment so they do not produce empty columns.
    sep=" ",
    skipinitialspace=True,
)
df.head()

## Exploratory Data Analysis (EDA)


In [None]:
# Pairwise relationships between the target (MPG) and three of the features.
sns.pairplot(df[["MPG", "Cylinders", "Displacement", "Weight"]])

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().transpose()

## Data Preparation


### Clean the data

Check data types.


In [None]:
# Print each column's dtype and non-null count.
df.info()

Check for missing/unknown values.


In [None]:
# Number of NaN entries per column (this includes values read in as "?").
df.isna().sum()

Drop rows with unknown values.


In [None]:
# Keep only the rows that have no NaN in any column; df is rebound to the result.
df = df.dropna()

The `"Origin"` column is categorical, not numeric. So, the next step is to one-hot encode the values.


In [None]:
# Replace "Origin" with one indicator column per distinct value, each named
# "Origin_<value>". dummy_na=False means no extra indicator is added for NaN.
# NOTE(review): df is rebound here, so re-running this cell on its own fails
# because the "Origin" column no longer exists -- re-run from the load cell.
df = pd.get_dummies(df, prefix=["Origin"], columns=["Origin"], dummy_na=False)
df.head()

### Split features from labels


In [None]:
# Model inputs: every column except the target.
features = df.drop(columns=["MPG"])
# Model target: the "MPG" column.
labels = df["MPG"]

### Split the data into development and test sets

Now split the dataset into a development set and a test set. You will use the test set in the final evaluation of your models.


In [None]:
# Hold out 20% of the rows as the test set. The fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
dev_features, test_features, dev_labels, test_labels = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

### Normalize the data


In [None]:
# Learn the per-feature mean and standard deviation from the development set
# only, then apply that same transformation to the test set so that no
# test-set statistics influence training.
# With scikit-learn's default output settings both results are NumPy arrays,
# so the column labels of `features` are not carried over.
scaler = StandardScaler()
dev_features = scaler.fit_transform(dev_features)
test_features = scaler.transform(test_features)

## Build a ridge regression model


In [None]:
class RidgeRegression:
    # pylint: disable=invalid-name
    """
    Linear regression model with L2-regularization (i.e. ridge regression).

    The weights are computed in closed form by solving the regularized normal
    equations ``(X^T X + alpha * I') w = X^T y``, where ``X`` is augmented
    with a leading column of ones for the intercept and ``I'`` is the identity
    matrix with its first diagonal entry set to zero, so the intercept itself
    is not penalized.

    The class exposes ``fit``, ``predict``, ``score``, ``get_params`` and
    ``set_params`` so that it can be passed to scikit-learn utilities such as
    ``cross_val_score`` and ``GridSearchCV``.

    Args:
        alpha: Regularization strength multiplying the L2 penalty.

    Attributes:
        alpha: The regularization strength currently in use.
        coef_: Fitted feature weights (``None`` until ``fit`` is called).
        intercept_: Fitted intercept (``None`` until ``fit`` is called).
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.coef_ = None
        self.intercept_ = None

    def fit(self, X, y):
        """
        Fit the model to the training data.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of targets with n_samples entries.

        Returns:
            This estimator, to allow call chaining.

        Raises:
            numpy.linalg.LinAlgError: If the regularized system is singular
                (possible when ``alpha`` is 0 and the columns are collinear).
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        # Prepend a column of 1s so that the first weight acts as the
        # intercept. hstack allocates a new array, so the caller's X is
        # never modified.
        X_ = np.hstack([np.ones((X.shape[0], 1)), X])

        # Penalty matrix: identity with a zero in the intercept position.
        I = np.identity(X_.shape[1])
        I[0, 0] = 0

        # Solve the linear system directly rather than forming an explicit
        # inverse; this is cheaper and numerically more stable when the
        # system is close to singular (e.g. collinear columns, tiny alpha).
        w = np.linalg.solve(X_.T @ X_ + self.alpha * I, X_.T @ y)

        self.intercept_ = w[0]
        self.coef_ = w[1:]

        return self

    def predict(self, X):
        """
        Predict targets for X using the fitted weights.

        Args:
            X: Object of shape (n_samples, n_features) that provides a
                ``dot`` method (e.g. a NumPy array or a pandas DataFrame).

        Returns:
            The value of ``X.dot(coef_) + intercept_``.
        """
        return X.dot(self.coef_) + self.intercept_

    def score(self, X, y):
        """
        Return the coefficient of determination (R^2) of the predictions.

        Args:
            X: Features, in the same form accepted by ``predict``.
            y: True target values.

        Returns:
            The R^2 score computed by ``r2_score(y, self.predict(X))``.
        """
        return r2_score(y, self.predict(X))

    def get_params(self, deep=True):
        """
        Return the hyperparameters as a dict.

        Args:
            deep: Accepted for scikit-learn compatibility and ignored.

        Returns:
            ``{"alpha": <current alpha>}``.
        """
        # pylint: disable=unused-argument
        return {"alpha": self.alpha}

    def set_params(self, **params):
        """
        Update hyperparameters from keyword arguments.

        Only ``alpha`` is recognized; any other keyword is silently ignored.

        Returns:
            This estimator, to allow call chaining.
        """
        # pylint: disable=redefined-outer-name
        if "alpha" in params:
            self.alpha = params["alpha"]

        return self


model = RidgeRegression()

## Train and evaluate your model


Evaluate model performance with default hyperparameters using cross-validation.


In [None]:
# 5-fold cross-validation on the development set, scored with R^2.
# error_score="raise" makes a failure inside any fold raise immediately
# instead of being recorded as a score.
scores = cross_val_score(
    model, dev_features, dev_labels, scoring="r2", cv=5, error_score="raise"
)

print("R^2 scores:\n", scores)
print("Mean R^2:\n", scores.mean())

Tune the regularization strength `alpha` using grid search.


In [None]:
# Candidate regularization strengths: 20 values spaced logarithmically
# from 1e-5 to 1e1.
params = {"alpha": np.logspace(-5, 1, 20)}

# Evaluate every candidate with 5-fold cross-validation, scoring by R^2.
reg = GridSearchCV(model, param_grid=params, scoring="r2", cv=5)
reg.fit(dev_features, dev_labels)

best_params = reg.best_params_

# best_score_ is the mean cross-validated score of the best candidate.
print("Best score:", reg.best_score_)
print("Best params:", best_params)

Fit a ridge regression model using the best hyperparameters.


In [None]:
# Apply the alpha chosen by the grid search, then fit on the whole
# development set.
model.set_params(**best_params)
model.fit(dev_features, dev_labels)

print("Coefficients:\n", model.coef_)
print("Intercept:\n", model.intercept_)

Evaluate your model on the test data.


In [None]:
# R^2 on the held-out test set, which was not used for fitting or tuning.
score = model.score(test_features, test_labels)
print("R^2:\n", score)

In [None]:
test_predictions = model.predict(test_features)

# Predicted vs. true values; both axes use the same fixed range.
plt.scatter(test_labels, test_predictions)
plt.xlabel("True Values [MPG]")
plt.ylabel("Predictions [MPG]")
lims = [0, 50]
plt.xlim(lims)
plt.ylim(lims)
# Diagonal reference line: points lying on it are perfect predictions.
plt.plot(lims, lims)

In [None]:
# Mean squared error of the test-set predictions.
mse = mean_squared_error(test_labels, test_predictions)
print("MSE:\n", mse)

# Signed residuals: a positive value means the model over-predicted.
error = test_predictions - test_labels
plt.hist(error, bins=25)
plt.xlabel("Prediction Error [MPG]")
plt.ylabel("Count")

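Inspect the fitted coefficients as a rough measure of feature importance. Because the features were standardized, the coefficient magnitudes are comparable across features; the sign gives the direction of the effect.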


In [None]:
# Pair each input column name with its fitted coefficient.
feature_importance_df = pd.DataFrame(
    data={"Attribute": features.columns, "Importance": model.coef_}
)
# Order by signed coefficient, largest first, so strongly negative
# coefficients appear at the right-hand end of the chart.
feature_importance_df = feature_importance_df.sort_values(
    by="Importance", ascending=False
)

plt.figure(figsize=(10, 6))
# Pass the frame by keyword: not every seaborn release accepts `data` as the
# first positional argument of barplot.
ax = sns.barplot(data=feature_importance_df, x="Attribute", y="Importance")
ax.tick_params(axis="x", rotation=45)

## Conclusion

You have trained a linear regression model with L2 regularization (i.e. ridge regression).
