Copyright (c) Microsoft Corporation. All rights reserved.  
Licensed under the MIT License.

# Perform a regression on the diabetes data

## Import libraries

In [None]:
import argparse
import json
import os
import pickle
import subprocess
from typing import Tuple, List

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# scikit-learn removed its vendored copy of joblib (sklearn.externals.joblib)
# in version 0.23; prefer the standalone package and fall back to the vendored
# one only on old scikit-learn installs that lack a separate joblib.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

## Load diabetes dataset

In [None]:
# Fetch the diabetes dataset as a (features, target) pair instead of a Bunch.
X, y = load_diabetes(return_X_y=True)

# Labels intended for the ten feature columns of X.
columns = [
    "age", "gender", "bmi", "bp",
    "s1", "s2", "s3", "s4", "s5", "s6",
]

## Prepare data

In [None]:
# X = X[:,0:5]

## Split the data into a training and test dataset

In [None]:
# Hold out 20% of the samples for testing; random_state=0 fixes the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Bundle both splits in one nested dict keyed by split name, then "X"/"y".
data = {
    "train": {"X": X_train, "y": y_train},
    "test": {"X": X_test, "y": y_test},
}

print("Data contains", len(X_train), "training samples and", len(X_test), "test samples")

In [None]:
# Notebook display: shapes of the training and test feature arrays.
X_train.shape, X_test.shape

## Set a random alpha (regularization strength) for the Ridge regression

In [None]:
# Candidate regularization strengths: 0.0 up to (but excluding) 1.0 in
# steps of 0.05.
alphas = np.arange(0.0, 1.0, 0.05)

# Draw one random position in the grid and use the strength stored there.
alpha = alphas[np.random.choice(len(alphas), size=1, replace=False)[0]]
print(alpha)

## Create a Ridge regression and train on the training dataset

In [None]:
# Build a Ridge regressor with the sampled regularization strength and fit it
# on the training split (X_train / y_train are the same arrays stored under
# data["train"]).
reg = Ridge(alpha=alpha)
reg.fit(X_train, y_train)

## Save the trained model to disk

In [None]:
# File name (relative to the working directory) for the serialized model.
model_name = "sklearn_regression_model.pkl"

# Serialize the fitted estimator through the handle we opened. Previously the
# handle was opened but never used while joblib.dump re-opened the same path by
# name, leaving two writers on one file; joblib.dump accepts a file object, so
# hand it the open handle instead.
with open(model_name, "wb") as model_file:
    joblib.dump(value=reg, filename=model_file)

## Load the model from disk

In [None]:
# Deserialize the estimator stored at model_name into a separate variable.
# NOTE: joblib.load is pickle-based and can execute arbitrary code; only load
# files from a trusted source.
reg2 = joblib.load(model_name)

## Run inference with the reloaded model on the test set and print the MSE (mean squared error)

In [None]:
# Predict targets for the held-out test features with the reloaded model.
preds = reg2.predict(data["test"]["X"])

# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric,
# so the printed value is unchanged, but the documented order keeps this call
# correct if it is later swapped for an asymmetric metric.
print("mse", mean_squared_error(data["test"]["y"], preds))

In [None]:
# Notebook display: the predictions computed for the test split.
preds

# Next:

[Configure Azure ML](./01-aml-configuration.ipynb)