In [16]:
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

import pandas as pd

from datasets import diabetes_data
from tools import polynomial_features
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

In [17]:
original_X, original_y, train_X, train_y, test_X, test_y = diabetes_data()

## Degree 2

In [18]:
original_X_2 = polynomial_features(original_X, 2)

In [19]:
original_X_2 = pd.DataFrame(StandardScaler().fit_transform(original_X_2), columns=original_X_2.columns)

In [20]:
original_X_2.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,age^2,sex^2,bmi^2,bp^2,s1^2,s2^2,s3^2,s4^2,s5^2,s6^2
0,0.8005,1.065488,1.297088,0.45984,-0.929746,-0.732065,-0.912451,-0.054499,0.418551,-0.370989,-0.312312,1.065488,0.473132,-0.652677,-0.091057,-0.288865,-0.09737,-0.640142,-0.605826,-0.578692
1,-0.039567,-0.938537,-1.08218,-0.553511,-0.177624,-0.402886,1.564414,-0.830301,-1.436551,-1.938479,-0.868105,-0.938537,0.118633,-0.574111,-0.650453,-0.521412,0.84172,-0.199421,0.78127,1.850556
2,1.793307,1.065488,0.934533,-0.119218,-0.958674,-0.718897,-0.680245,-0.054499,0.060207,-0.545154,1.926693,1.065488,-0.087804,-0.815932,-0.054366,-0.300757,-0.312444,-0.640142,-0.731837,-0.471619
3,-1.872441,-0.938537,-0.243771,-0.770658,0.256292,0.525397,-0.757647,0.721302,0.477072,-0.196823,2.178913,-0.938537,-0.652097,-0.336116,-0.627526,-0.450624,-0.247721,-0.308006,-0.567329,-0.645054
4,0.113172,-0.938537,-0.764944,0.45984,0.082726,0.32789,0.171178,-0.054499,-0.672582,-0.980568,-0.85833,-0.938537,-0.287621,-0.652677,-0.667047,-0.555525,-0.564503,-0.640142,-0.402237,-0.025826


### Estimate test error by 5-fold cross validation

In [23]:
model = LinearRegression()
cross_val_score(model, original_X_2, original_y).mean()

0.4890330444830031

### Fit Polynomial Regression Model

In [24]:
# add constant, since statsmodels does not add it by defaul
original_X_2_const = sm.add_constant(original_X_2)
model = sm.OLS(original_y, original_X_2_const)

# fit model
result = model.fit()

In [25]:
result.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.546
Model:,OLS,Adj. R-squared:,0.526
Method:,Least Squares,F-statistic:,26.73
Date:,"Wed, 16 Jun 2021",Prob (F-statistic):,1.41e-60
Time:,10:53:00,Log-Likelihood:,-2372.6
No. Observations:,442,AIC:,4785.0
Df Residuals:,422,BIC:,4867.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,152.1335,2.525,60.243,0.000,147.170,157.097
age,1.9922,2.912,0.684,0.494,-3.732,7.716
sex,-5.7646,1.460,-3.949,0.000,-8.634,-2.895
bmi,21.6111,3.553,6.082,0.000,14.627,28.596
bp,15.6161,3.164,4.936,0.000,9.397,21.835
s1,-228.4282,73.046,-3.127,0.002,-372.008,-84.848
s2,192.1655,63.850,3.010,0.003,66.662,317.669
s3,74.8495,28.971,2.584,0.010,17.905,131.794
s4,5.9565,12.186,0.489,0.625,-17.997,29.910

0,1,2,3
Omnibus:,1.909,Durbin-Watson:,1.991
Prob(Omnibus):,0.385,Jarque-Bera (JB):,1.995
Skew:,0.147,Prob(JB):,0.369
Kurtosis:,2.853,Cond. No.,6360000000000000.0


## Degree 3

In [45]:
original_X_3 = polynomial_features(original_X, 3)

In [46]:
original_X_3 = pd.DataFrame(StandardScaler().fit_transform(original_X_3), columns=original_X_3.columns)

In [47]:
original_X_3.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,...,age^3,sex^3,bmi^3,bp^3,s1^3,s2^3,s3^3,s4^3,s5^3,s6^3
0,0.8005,1.065488,1.297088,0.45984,-0.929746,-0.732065,-0.912451,-0.054499,0.418551,-0.370989,...,0.275032,1.065488,0.377399,-0.064821,-0.286006,-0.16187,-0.273916,-0.153371,-0.060035,-0.063947
1,-0.039567,-0.938537,-1.08218,-0.553511,-0.177624,-0.402886,1.564414,-0.830301,-1.436551,-1.938479,...,0.085271,-0.938537,-0.443382,-0.154693,-0.092651,-0.097913,0.533708,-0.273101,-0.898721,-1.854866
2,1.793307,1.065488,0.934533,-0.119218,-0.958674,-0.718897,-0.680245,-0.054499,0.060207,-0.545154,...,2.2185,1.065488,0.05236,-0.098143,-0.304752,-0.157803,-0.195606,-0.153371,-0.080217,-0.09142
3,-1.872441,-0.938537,-0.243771,-0.770658,0.256292,0.525397,-0.757647,0.721302,0.477072,-0.196823,...,-2.342956,-0.938537,-0.145283,-0.251742,-0.087215,-0.056747,-0.216753,-0.074819,-0.050301,-0.053193
4,0.113172,-0.938537,-0.764944,0.45984,0.082726,0.32789,0.171178,-0.054499,-0.672582,-0.980568,...,0.08583,-0.938537,-0.248335,-0.064821,-0.091156,-0.078224,-0.139319,-0.153371,-0.164274,-0.284748


## Polynomial Simple Linear Regression

In [57]:
model = LinearRegression()

In [58]:
subset_X = pd.DataFrame(original_X_3[['bmi', 'bmi^2', 'bmi^3']], columns=['bmi', 'bmi^2', 'bmi^3'])

In [59]:
model = model.fit(subset_X, original_y)

In [60]:
predicted_y = model.predict(subset_X)

In [61]:
data = subset_X.copy()
data['diabetes risk'] = original_y

prediction = subset_X.copy()
prediction['diabetes risk'] = predicted_y
prediction = prediction.sort_values(['diabetes risk'])

fig = px.scatter(data, x='bmi', y='diabetes risk')
fig.add_trace(px.line(prediction, x='bmi', y='diabetes risk').data[0])
fig.update_traces(line_color='red')
fig.show()

### Estimate test error by 5-fold cross validation

In [29]:
model = LinearRegression()
cross_val_score(model, original_X_3, original_y).mean()

0.47460105616499604

### Fit Polynomial Regression Model

In [30]:
# add constant, since statsmodels does not add it by defaul
original_X_3_const = sm.add_constant(original_X_3)
model = sm.OLS(original_y, original_X_3_const)

# fit model
result = model.fit()

In [31]:
result.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.559
Model:,OLS,Adj. R-squared:,0.529
Method:,Least Squares,F-statistic:,18.69
Date:,"Wed, 16 Jun 2021",Prob (F-statistic):,6.74e-57
Time:,10:53:00,Log-Likelihood:,-2366.3
No. Observations:,442,AIC:,4791.0
Df Residuals:,413,BIC:,4909.0
Df Model:,28,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,152.1335,2.517,60.449,0.000,147.186,157.081
age,11.6248,5.311,2.189,0.029,1.185,22.064
sex,-4.0250,0.985,-4.084,0.000,-5.962,-2.088
bmi,21.5376,4.569,4.714,0.000,12.556,30.519
bp,12.8157,5.166,2.481,0.013,2.662,22.970
s1,501.9499,410.838,1.222,0.222,-305.644,1309.544
s2,-450.5106,360.604,-1.249,0.212,-1159.358,258.337
s3,-202.3068,154.041,-1.313,0.190,-505.109,100.495
s4,5.8613,13.074,0.448,0.654,-19.839,31.561

0,1,2,3
Omnibus:,2.228,Durbin-Watson:,2.012
Prob(Omnibus):,0.328,Jarque-Bera (JB):,2.309
Skew:,0.163,Prob(JB):,0.315
Kurtosis:,2.86,Cond. No.,1.41e+16
