<font color = green >

## Set interactive backend and import necessary packages
    
</font>

In [1]:
%matplotlib notebook

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

<font color = green >

## Load diabetes data set 
</font>

In [4]:
from sklearn.datasets import load_diabetes

In [15]:
feature_names = load_diabetes()['feature_names']
load_diabetes().DESCR

'.. _diabetes_dataset:\n\nDiabetes dataset\n----------------\n\nTen baseline variables, age, sex, body mass index, average blood\npressure, and six blood serum measurements were obtained for each of n =\n442 diabetes patients, as well as the response of interest, a\nquantitative measure of disease progression one year after baseline.\n\n**Data Set Characteristics:**\n\n  :Number of Instances: 442\n\n  :Number of Attributes: First 10 columns are numeric predictive values\n\n  :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n\n  :Attribute Information:\n      - age     age in years\n      - sex\n      - bmi     body mass index\n      - bp      average blood pressure\n      - s1      tc, total serum cholesterol\n      - s2      ldl, low-density lipoproteins\n      - s3      hdl, high-density lipoproteins\n      - s4      tch, total cholesterol / HDL\n      - s5      ltg, possibly log of serum triglycerides level\n      - s6      glu, blood sugar

In [6]:
def get_X_y(features= None, verbose= False):
    X, y = load_diabetes(return_X_y=True)

    if features is None:
        print ('Selecting all features')
        
    elif type(features) == int or (type(features) == list and len(features)==1):
        print ('Selecting one feature: {}'.format(features))
        X = X[:,features].reshape(-1,1) # single column 
    elif type(features) == list: 
        print ('Selecting features list: {}'.format(features))
        X = X[:,features]
    else: 
        print ('wrong format of parameter "features"')
        return


    X_train, X_test, y_train, y_test =  train_test_split(X, y, random_state=2021)
    if verbose:
        print ('X_train.shape= ',X_train.shape)
        print ('y_train.shape= ',y_train.shape)
        print ('X_train [:5] = \n{}'.format(X_train[:5]))
        print ('y_train [:5] = \n{}'.format(y_train[:5]))
    return X_train, X_test, y_train, y_test

In [7]:
# split to train and test
X_train, X_test, y_train, y_test = get_X_y(verbose= True)

Selecting all features
X_train.shape=  (331, 10)
y_train.shape=  (331,)
X_train [:5] = 
[[-0.06363517 -0.04464164 -0.03315126 -0.03321358  0.00118295  0.02405115
  -0.02499266 -0.00259226 -0.02251217 -0.05906719]
 [ 0.01264814 -0.04464164 -0.02560657 -0.04009932 -0.03046397 -0.04515466
   0.0780932  -0.0763945  -0.07212845  0.01134862]
 [ 0.03807591  0.05068012  0.00888341  0.04252958 -0.04284755 -0.02104223
  -0.03971921 -0.00259226 -0.01811827  0.00720652]
 [-0.07816532  0.05068012  0.07786339  0.05285819  0.07823631  0.0644473
   0.02655027 -0.00259226  0.04067226 -0.00936191]
 [-0.07453279 -0.04464164 -0.0105172  -0.00567061 -0.06623874 -0.0570543
  -0.00290283 -0.03949338 -0.0425721  -0.0010777 ]]
y_train [:5] = 
[214.  98. 127. 233. 168.]


In [8]:
# normalization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

<font color='green'>

## Linear Regression

<font>

In [16]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
lin_params = pd.Series(lin_reg.coef_, index=feature_names)

print('Linear Regression')
print ('R2 train score =', lin_reg.score(X_train, y_train))
print ('R2 test score =', lin_reg.score(X_test, y_test))
print (f'b: {lin_reg.intercept_}')
print(lin_params)

Linear Regression
R2 train score = 0.5073702774872662
R2 test score = 0.5281739698266457
b: 148.99287782144947
age    -19.685248
sex   -240.180431
bmi    557.914664
bp     251.500900
s1    -500.396234
s2     275.581227
s3     -11.607946
s4     154.014790
s5     651.170356
s6      77.512585
dtype: float64


<font color='green'>

## Ridge

<font>

In [17]:
ridge_reg = Ridge()
ridge_reg.fit(X_train, y_train)
ridge_params = pd.Series(ridge_reg.coef_, index=feature_names)

print('Ridge')
print ('R2 train score =', ridge_reg.score(X_train, y_train))
print ('R2 test score =', ridge_reg.score(X_test, y_test))
print (f'b: {ridge_reg.intercept_}')
print(ridge_params)

Ridge
R2 train score = 0.4227500042714355
R2 test score = 0.4342970082842498
b: 148.99988868218784
age     31.071354
sex    -67.812586
bmi    284.120464
bp     158.308117
s1      25.343029
s2     -14.631665
s3    -130.286878
s4     116.412804
s5     239.503502
s6     108.524335
dtype: float64


<font color='green'>

## Lasso

<font>

In [19]:
lasso_reg = Lasso()
lasso_reg.fit(X_train, y_train)
lasso_params = pd.Series(lasso_reg.coef_, index=feature_names)

print('Lasso')
print ('R2 train score =', lasso_reg.score(X_train, y_train))
print ('R2 test score =', lasso_reg.score(X_test, y_test))
print (f'b: {lasso_reg.intercept_}')
print(lasso_params)

Lasso
R2 train score = 0.366020102437113
R2 test score = 0.3392074106660581
b: 149.4852586610367
age      0.000000
sex     -0.000000
bmi    379.304704
bp       0.000000
s1       0.000000
s2       0.000000
s3      -0.000000
s4       0.000000
s5     317.427638
s6       0.000000
dtype: float64


<font color = green >

## Polynomial + Linear Regression

</font>

In [20]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

<font color = green >

### scale features

</font>

In [21]:
scaler = StandardScaler()
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)

In [22]:
poly_lin_reg = LinearRegression()
poly_lin_reg.fit(X_train_poly, y_train)

print ('Polynomial + Linear Regression')
print ('R2 train score =', poly_lin_reg.score(X_train_poly, y_train))
print ('R2 test score =', poly_lin_reg.score(X_test_poly, y_test))
print (f'b: {poly_lin_reg.intercept_}')

Polynomial + Linear Regression
R2 train score = 0.6207797301635398
R2 test score = 0.3471990138057319
b: 56.795251863713204


In [23]:
# with scaled features
poly_lin_reg_scaled = LinearRegression()
poly_lin_reg_scaled.fit(X_train_poly_scaled, y_train)

print ('Polynomial + Linear Regression')
print ('R2 train score =', poly_lin_reg_scaled.score(X_train_poly_scaled, y_train))
print ('R2 test score =', poly_lin_reg_scaled.score(X_test_poly_scaled, y_test))
print (f'b: {poly_lin_reg_scaled.intercept_}')

Polynomial + Linear Regression
R2 train score = 0.6173506235243134
R2 test score = 0.3612960569288898
b: 165.13225465251435


<font color = green >

## Polynomial + Ridge

</font>

In [24]:
poly_ridge = Ridge()
poly_ridge.fit(X_train_poly, y_train)

print ('Polynomial + Ridge')
print ('R2 train score =', poly_ridge.score(X_train_poly, y_train))
print ('R2 test score =', poly_ridge.score(X_test_poly, y_test))

Polynomial + Ridge
R2 train score = 0.423626095551196
R2 test score = 0.4344256260511967


<font color = green >

## Polynomial + Lasso

</font>

In [25]:
poly_ridge = Lasso()
poly_ridge.fit(X_train_poly, y_train)

print ('Polynomial + Ridge')
print ('R2 train score =', poly_ridge.score(X_train_poly, y_train))
print ('R2 test score =', poly_ridge.score(X_test_poly, y_test))

Polynomial + Ridge
R2 train score = 0.366020102437113
R2 test score = 0.3392074106660581
