In [1]:
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

from datasets import diabetes_data
from tools import polynomial_features

In [2]:
# Load the data set. diabetes_data() yields six objects; going by their names
# these are the full feature matrix/target plus a train/test split of each
# (TODO confirm against datasets.diabetes_data).
(
    original_X,
    original_y,
    train_X,
    train_y,
    test_X,
    test_y,
) = diabetes_data()

## Degree 2

In [4]:
# Build the degree-2 design matrix from the full data set. Judging by the
# preview that follows, polynomial_features appends one power column per
# feature (age^2 ... s6^2) and no interaction terms -- TODO confirm in tools.
original_X_2 = polynomial_features(original_X, 2)

In [5]:
# Preview the first rows to sanity-check the added squared columns.
original_X_2.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,age^2,sex^2,bmi^2,bp^2,s1^2,s2^2,s3^2,s4^2,s5^2,s6^2
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,0.00145,0.002568,0.003806,0.000478,0.001956,0.001212,0.001884,7e-06,0.000396,0.000311
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,4e-06,0.001993,0.00265,0.000693,7.1e-05,0.000367,0.005537,0.00156,0.004669,0.008502
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,0.007276,0.002568,0.001976,3.2e-05,0.002079,0.001169,0.001047,7e-06,8e-06,0.000672
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,0.007932,0.001993,0.000134,0.001344,0.000149,0.000625,0.001299,0.001177,0.000515,8.8e-05
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,2.9e-05,0.001993,0.001324,0.000478,1.5e-05,0.000243,6.6e-05,7e-06,0.001023,0.002175


### Estimate out-of-sample performance (mean $R^2$) by 5-fold cross-validation

In [6]:
# Mean cross-validated score of a plain linear regression on the degree-2
# features.
# - cv=5 is passed explicitly so the fold count does not depend on the
#   installed scikit-learn version's default (3 before 0.22, 5 since).
# - scoring="r2" spells out what LinearRegression.score already returns, so
#   the displayed value is a mean R^2 (higher is better), not an error.
model = LinearRegression()
cross_val_score(model, original_X_2, original_y, cv=5, scoring="r2").mean()

0.48942715735968073

### Fit Polynomial Regression Model

In [7]:
# Add a constant (intercept) column, since statsmodels' OLS does not add one by default.
original_X_2_const = sm.add_constant(original_X_2)
# This rebinds `model` from the scikit-learn estimator used for cross-validation
# to a statsmodels OLS model of the target on the degree-2 design matrix.
# NOTE(review): the summary of this fit reports Df Model = 19 for 20 supplied
# regressors and Cond. No. ~1.45e+18, i.e. the design matrix looks rank-deficient.
# `sex` shows only two distinct values in the preview, which would make sex^2 an
# exact linear combination of const and sex -- confirm, and consider dropping it.
model = sm.OLS(original_y, original_X_2_const)

# Fit by ordinary least squares on the full data set (no held-out split here).
result = model.fit()

In [8]:
# Display the regression table (coefficients, standard errors, fit statistics)
# for the degree-2 fit.
result.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.546
Model:,OLS,Adj. R-squared:,0.526
Method:,Least Squares,F-statistic:,26.73
Date:,"Tue, 15 Jun 2021",Prob (F-statistic):,1.41e-60
Time:,15:25:41,Log-Likelihood:,-2372.6
No. Observations:,442,AIC:,4785.0
Df Residuals:,422,BIC:,4867.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,117.3834,9.360,12.541,0.000,98.985,135.782
age,41.8841,61.220,0.684,0.494,-78.451,162.219
sex,-242.3814,61.384,-3.949,0.000,-363.039,-121.724
bmi,454.3481,74.705,6.082,0.000,307.509,601.187
bp,328.3099,66.519,4.936,0.000,197.560,459.060
s1,-4802.4272,1535.710,-3.127,0.002,-7821.021,-1783.833
s2,4040.0481,1342.369,3.010,0.003,1401.486,6678.611
s3,1573.6215,609.074,2.584,0.010,376.425,2770.818
s4,125.2278,256.206,0.489,0.625,-378.372,628.827

0,1,2,3
Omnibus:,1.909,Durbin-Watson:,1.991
Prob(Omnibus):,0.385,Jarque-Bera (JB):,1.995
Skew:,0.147,Prob(JB):,0.369
Kurtosis:,2.853,Cond. No.,1.45e+18


## Degree 3

In [11]:
# Build the degree-3 design matrix from the full data set. The preview that
# follows shows per-feature cube columns (age^3 ... s6^3) after the originals;
# the elided middle columns are presumably the squares -- TODO confirm in tools.
original_X_3 = polynomial_features(original_X, 3)

In [12]:
# Preview the first rows to sanity-check the added power columns.
original_X_3.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,...,age^3,sex^3,bmi^3,bp^3,s1^3,s2^3,s3^3,s4^3,s5^3,s6^3
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,...,5.520148e-05,0.00013,0.000235,1.046373e-05,-8.648868e-05,-4.2e-05,-8.175128e-05,-1.741954e-08,7.890607e-06,-5.494752e-06
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,...,-6.666077e-09,-8.9e-05,-0.000136,-1.824927e-05,-6.030779e-07,-7e-06,0.0004120228,-6.159891e-05,-0.0003190284,-0.0007838807
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,...,0.0006206266,0.00013,8.8e-05,-1.823432e-07,-9.481539e-05,-4e-05,-3.387363e-05,-1.741954e-08,2.34863e-08,-1.743511e-05
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,...,-0.0007064657,-8.9e-05,-2e-06,-4.925509e-05,1.81164e-06,1.6e-05,-4.680222e-05,4.038488e-05,1.168476e-05,-8.205283e-07
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,...,1.559868e-07,-8.9e-05,-4.8e-05,1.046373e-05,6.092353e-08,4e-06,5.397674e-07,-1.741954e-08,-3.274173e-05,-0.0001014612


### Estimate out-of-sample performance (mean $R^2$) by 5-fold cross-validation

In [13]:
# Mean cross-validated score of a plain linear regression on the degree-3
# features.
# - cv=5 is passed explicitly so the fold count does not depend on the
#   installed scikit-learn version's default (3 before 0.22, 5 since).
# - scoring="r2" spells out what LinearRegression.score already returns, so
#   the displayed value is a mean R^2 (higher is better), not an error.
model = LinearRegression()
cross_val_score(model, original_X_3, original_y, cv=5, scoring="r2").mean()

0.47493785417180695

### Fit Polynomial Regression Model

In [15]:
# Add a constant (intercept) column, since statsmodels' OLS does not add one by default.
original_X_3_const = sm.add_constant(original_X_3)
# This rebinds `model` from the scikit-learn estimator used for cross-validation
# to a statsmodels OLS model of the target on the degree-3 design matrix.
# NOTE(review): the summary of this fit reports Df Model = 28 although
# polynomial_features(original_X, 3) presumably supplies 30 regressors, and
# Cond. No. ~1e+16, i.e. the design matrix looks rank-deficient. `sex` shows only
# two distinct values in the preview, which would make its higher powers exact
# linear combinations of const and sex -- confirm, and consider dropping them.
model = sm.OLS(original_y, original_X_3_const)

# Fit by ordinary least squares on the full data set (no held-out split here).
result = model.fit()

In [17]:
# Display the regression table (coefficients, standard errors, fit statistics)
# for the degree-3 fit.
result.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.559
Model:,OLS,Adj. R-squared:,0.529
Method:,Least Squares,F-statistic:,18.69
Date:,"Tue, 15 Jun 2021",Prob (F-statistic):,6.74e-57
Time:,15:27:01,Log-Likelihood:,-2366.3
No. Observations:,442,AIC:,4791.0
Df Residuals:,413,BIC:,4909.0
Df Model:,28,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,187.6968,39.393,4.765,0.000,110.261,265.132
age,244.3978,111.653,2.189,0.029,24.919,463.876
sex,-253.8569,62.152,-4.084,0.000,-376.030,-131.684
bmi,452.8019,96.063,4.714,0.000,263.968,641.635
bp,269.4348,108.599,2.481,0.013,55.958,482.911
s1,1.055e+04,8637.370,1.222,0.222,-6425.799,2.75e+04
s2,-9471.4426,7581.255,-1.249,0.212,-2.44e+04,5431.216
s3,-4253.2564,3238.525,-1.313,0.190,-1.06e+04,2112.793
s4,123.2270,274.865,0.448,0.654,-417.081,663.535

0,1,2,3
Omnibus:,2.228,Durbin-Watson:,2.012
Prob(Omnibus):,0.328,Jarque-Bera (JB):,2.309
Skew:,0.163,Prob(JB):,0.315
Kurtosis:,2.86,Cond. No.,1e+16
