## Diabetes dataset

https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

The dataset contains 10 features (already mean-centered and scaled) and one target value: a measure of disease progression one year after baseline.

In [None]:
# Load the scikit-learn diabetes dataset and split it into features and target.
import pandas as pd
from sklearn.datasets import load_diabetes

# Despite its name, `df` is the Bunch container returned by load_diabetes,
# not a pandas DataFrame; the arrays are read from its .data / .target attributes.
df = load_diabetes()
X, y = df.data, df.target

# Report the shape of the feature matrix, then of the target vector (one per line).
print(X.shape, y.shape, sep="\n")

(442, 10)
(442,)


In [None]:
# Hold out 25% of the samples as a test set; the fixed random_state keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)


In [None]:
# Print the shape of each split, in the order: train features, train target,
# test features, test target.
for split_array in (X_train, y_train, X_test, y_test):
    print(split_array.shape)

(331, 10)
(331,)
(111, 10)
(111,)


In [None]:
# Display the training targets (a bare last expression is rendered as the cell output).
# NOTE(review): this shows the whole array; a slice such as y_train[:10] would keep the output short.
y_train


array([ 68., 109.,  94., 118., 275., 275., 127., 281.,  71.,  42.,  71.,
       128., 272., 135.,  51., 220., 167.,  78., 131., 212., 182., 174.,
       259.,  77.,  91., 310.,  84., 134., 102., 128., 306., 245., 201.,
       183., 111.,  96., 125., 182., 177.,  48.,  97., 259., 288., 242.,
        69.,  31., 154., 150.,  52., 261., 118., 102., 139.,  51.,  58.,
       144., 178.,  97.,  78., 129., 258., 124., 198., 185.,  66., 237.,
       178., 275., 268., 242., 200., 214., 246., 236.,  85., 114.,  93.,
        99.,  72., 270., 111.,  83.,  87.,  42., 172.,  65., 259., 279.,
       141., 144., 220.,  90., 101.,  53.,  67.,  72., 121., 303., 232.,
       140., 190., 221.,  71., 116., 111., 280., 233.,  78., 150., 283.,
        64., 140.,  65., 225., 206.,  63., 296., 173.,  85., 141.,  50.,
        25., 153.,  55., 139., 336.,  73.,  95., 109.,  44., 180., 263.,
       148.,  79.,  65., 102., 220., 277., 246., 200., 262., 191.,  97.,
       184.,  85., 248., 150., 268.,  59.,  70.,  8

In [None]:
# Task: fit a linear regression model on the training data
# (the training and test set scores are reported in a later cell).
# ANSWER:
from sklearn.linear_model import LinearRegression

# Ordinary least squares with an intercept term.
model = LinearRegression(fit_intercept=True)

# fit() is left as the last expression so the fitted estimator is shown as the cell output.
model.fit(X=X_train, y=y_train)

LinearRegression()

In [None]:
# Coefficients of the linear regression fitted on the training data (one value per feature).
model.coef_

array([ -43.26774487, -208.67053951,  593.39797213,  302.89814903,
       -560.27689824,  261.47657106,   -8.83343952,  135.93715156,
        703.22658427,   28.34844354])

In [None]:
# Intercept term of the linear regression fitted on the training data.
model.intercept_

153.06798218266258

In [None]:
# Summarise the fitted linear model: first coefficient, intercept, and the score on each split.
# NOTE(review): the model has one coefficient per feature; only coef_[0] is reported
# under the "slope" label here.
first_coefficient = model.coef_[0]
train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)

print("Model slope:    ", first_coefficient)
print("Model intercept:", model.intercept_)
print("R^2 value for training set:", train_r2)
print("R^2 value for test set:", test_r2)



Model slope:     -43.26774487031572
Model intercept: 153.06798218266258
R^2 value for training set: 0.555437148935302
R^2 value for test set: 0.35940090989715534


In [None]:
# Task: fit a Lasso regression model with the default value alpha=1,
# determine the training set score and test set score,
# then find the optimal alpha with a grid search over the following list.
alpha = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

from sklearn.linear_model import Lasso


# ANSWER:
from sklearn import linear_model

# NOTE(review): this fit uses alpha=0.001, not the default alpha=1 named in the task above.
reg = linear_model.Lasso(alpha=0.001)
reg.fit(X_train, y_train)

print("Lasso regression coefficients", reg.coef_)
print("Lasso regression intercept", reg.intercept_)

# score() is evaluated on the training split first, then on the held-out split.
print("Training set score: {:.2f}".format(reg.score(X_train, y_train)))
print("Test set score: {:.2f}".format(reg.score(X_test, y_test)))

# Count the non-zero coefficients with the array's own .sum(); this avoids relying on
# an `np` name, which this cell never imports (NameError on a fresh kernel otherwise).
print("Number of features used under lasso regression:", (reg.coef_ != 0).sum())

Lasso regression coefficients [ -42.21298561 -207.49613042  594.22301174  301.66896349 -515.50921726
  226.19941607  -28.47772917  129.43346834  686.65194314   28.08013654]
Lasso regression intercept 153.05975628987463
Training set score: 0.56
Test set score: 0.36
Number of features used under lasso regression: 10


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths to try for the Lasso estimator.
Parameters = dict(alpha=[0.001, 0.01, 0.1, 1, 10, 100, 1000])

# No cv/scoring arguments are passed, so GridSearchCV's defaults apply. fit() returns the
# fitted search object, which lets construction and fitting be chained.
gslasso = GridSearchCV(estimator=reg, param_grid=Parameters).fit(X_train, y_train)

# Parameter setting selected by the search (shown as the cell output).
gslasso.best_params_

{'alpha': 0.1}

In [None]:
# Refit Lasso with alpha=0.1 (the value the grid search reported as best) and
# report its coefficients, intercept and scores on both splits.
lb = linear_model.Lasso(alpha=0.1)
lb.fit(X_train, y_train)

print("Lasso regression coefficients", lb.coef_)
print("Lasso regression intercept", lb.intercept_)

print("Training set score: {:.2f}".format(lb.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lb.score(X_test, y_test)))

# Count the non-zero coefficients with the array's own .sum(); this avoids relying on
# an `np` name, which this cell never imports (NameError on a fresh kernel otherwise).
print("Number of features used under lasso regression:", (lb.coef_ != 0).sum())

Lasso regression coefficients [  -0.         -129.78400011  592.20328049  240.12404875  -41.64058526
  -47.62797321 -219.10436344    0.          507.36252305    0.        ]
Lasso regression intercept 152.98795183809946
Training set score: 0.55
Test set score: 0.36
Number of features used under lasso regression: 7
