# Train a Ridge Regression Model on the Diabetes Dataset

This notebook loads the Diabetes dataset from scikit-learn, splits it into training and validation sets, trains a Ridge regression model on the training set, evaluates it on the held-out validation set using mean squared error, and saves the trained model to disk with joblib.

In [1]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import joblib
import pandas as pd

## Load Data

In [6]:
# Pull the diabetes dataset that ships with scikit-learn.
sample_data = load_diabetes()

# One frame for everything: a column per feature, plus the regression
# target appended under the name 'Y'.
df = pd.DataFrame(
    sample_data.data, columns=sample_data.feature_names
).assign(Y=sample_data.target)

In [7]:
# Sanity check: print the (rows, columns) dimensions of the assembled frame.
# NOTE(review): df holds the feature columns plus the appended 'Y' target, so
# this should report one more column than the saved output shows — the saved
# output looks stale; re-run from a fresh kernel to confirm.
print(df.shape)

(442, 10)


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
# of the combined frame — the features and the 'Y' target alike.
df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Y
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-3.634285e-16,1.308343e-16,-8.045349e-16,1.281655e-16,-8.835316000000001e-17,1.327024e-16,-4.574646e-16,3.777301e-16,-3.830854e-16,-3.412882e-16,152.133484
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,77.093005
min,-0.1072256,-0.04464164,-0.0902753,-0.1123996,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260974,-0.1377672,25.0
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665645,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324879,-0.03317903,87.0
50%,0.00538306,-0.04464164,-0.007283766,-0.005670611,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947634,-0.001077698,140.5
75%,0.03807591,0.05068012,0.03124802,0.03564384,0.02835801,0.02984439,0.0293115,0.03430886,0.03243323,0.02791705,211.5
max,0.1107267,0.05068012,0.1705552,0.1320442,0.1539137,0.198788,0.1811791,0.1852344,0.133599,0.1356118,346.0


## Split Data into Training and Validation Sets

In [12]:
# Features are every column except the target 'Y'; both are pulled out of the
# frame as plain arrays before being handed to scikit-learn.
X = df.drop(columns=['Y']).values
y = df['Y'].values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2)

# Bundle the partitions so later cells can look them up by split name.
data = {
    "train": dict(X=X_train, y=y_train),
    "test": dict(X=X_test, y=y_test),
}

## Train Model on Training Set

In [16]:
# Hyperparameters for this experiment, kept in a dict so they are forwarded
# to the estimator as keyword arguments.
args = dict(alpha=0.5)

# Build the Ridge estimator and fit it on the training split; fit() hands back
# the estimator itself, so the fitted model is what gets bound to reg_model.
reg_model = Ridge(**args).fit(data["train"]["X"], data["train"]["y"])

# Leave the fitted estimator as the cell's last expression so it is displayed.
reg_model

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

## Validate Model on Validation Set

In [18]:
# Score the fitted model on the held-out split.
preds = reg_model.predict(data["test"]["X"])

# mean_squared_error takes (y_true, y_pred) in that order. MSE is symmetric, so
# the value is unchanged, but the correct order keeps the call valid if the
# metric is later swapped for an asymmetric one. The targets are read from the
# same `data` dict as the features rather than from a separate global.
mse = mean_squared_error(data["test"]["y"], preds)

# Collect metrics in a dict keyed by metric name and print them.
metrics = {"mse": mse}
print(metrics)

{'mse': 3298.9096058070622}


## Save Model

In [7]:
# File the fitted estimator is serialized to (path is relative to the current
# working directory).
model_name = "sklearn_regression_model.pkl"

# Persist the trained Ridge model. The estimator fitted earlier in this
# notebook is bound to `reg_model`; the previous reference to `reg` named a
# variable that is never defined here and fails with NameError on a fresh
# kernel. joblib.dump's return value (the list of files written) is left as the
# cell's displayed output.
joblib.dump(value=reg_model, filename=model_name)

['sklearn_regression_model.pkl']