In [1]:
from datasets import diabetes_data

import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

import pandas as pd

import plotly.express as px
from plotly.offline import init_notebook_mode
# Switch plotly into offline notebook mode so the figures in this notebook render inline.
# NOTE(review): connected=True presumably makes plotly.js load from a CDN instead of being
# embedded in the notebook file — confirm against the plotly.offline documentation.
init_notebook_mode(connected=True)

In [2]:
# Load the data through the `datasets.diabetes_data` helper, which is unpacked into six objects:
# the full feature/target pair plus separate train and test feature/target pairs.
# NOTE(review): the names suggest a train/test split of the full data — verify in the helper.
original_X, original_y, train_X, train_y, test_X, test_y = diabetes_data()

## Simple Linear Regression

In [3]:
# Unfitted scikit-learn linear regression estimator; it is fitted on the BMI column only further down.
model = LinearRegression()

In [4]:
# Single-feature design matrix: just the 'bmi' column, kept as a one-column DataFrame.
# Selecting with a list keeps original_X's row index (rebuilding the frame from `.values`
# would silently reset it to 0..n-1), so later index-aligned assignments of original_y
# stay matched row for row. `.copy()` detaches the result from original_X.
subset_X = original_X[['bmi']].copy()

In [5]:
# Fit the regression of the target on BMI alone and re-bind the result to `model`.
# NOTE(review): scikit-learn's fit() is documented to return the estimator itself, so the
# re-assignment is presumably redundant — confirm before simplifying.
model = model.fit(subset_X, original_y)

In [6]:
# In-sample predictions: evaluated on the same BMI values the model was fitted on.
# They are used below to draw the fitted regression line.
predicted_y = model.predict(subset_X)

In [10]:
# Observed points: BMI against the target value.
data = subset_X.assign(**{'diabetes risk': original_y})

# Fitted values of the single-feature model at the same BMI values.
prediction = subset_X.assign(**{'diabetes risk': predicted_y})

fig = px.scatter(data, x='bmi', y='diabetes risk')

# Overlay the fit as a red line.
# - The rows are sorted by BMI for the line only, so it is drawn once from left to right
#   instead of retracing itself through the rows in dataset order.
# - The colour is set on the line figure itself; a selector-less
#   fig.update_traces(line_color=...) would also rewrite the scatter trace's line colour.
regression_line = px.line(
    prediction.sort_values('bmi'),
    x='bmi',
    y='diabetes risk',
    color_discrete_sequence=['red'],
).data[0]
fig.add_trace(regression_line)
fig.show()

## Estimate test performance (mean R² score) by 5-fold cross-validation

In [11]:
# Estimate out-of-sample performance of a linear model that uses all features.
# cv=5 is passed explicitly so the fold count is 5 on every scikit-learn version
# (the default was 3 folds before release 0.22). scoring='r2' spells out that the
# value shown is the mean R^2 across folds (higher is better), not an error measure.
model = LinearRegression()
cv_r2_scores = cross_val_score(model, original_X, original_y, cv=5, scoring='r2')
cv_r2_scores.mean()

0.48231812211149394

## Fit a multiple linear regression model on all features (statsmodels OLS)

In [12]:
# statsmodels' OLS does not include an intercept on its own,
# so a constant column is added to the feature matrix first.
original_X_const = sm.add_constant(original_X)

# Specify the regression: the target is the dependent variable (endog),
# the features plus the constant are the regressors (exog).
model = sm.OLS(endog=original_y, exog=original_X_const)

# Estimate the coefficients.
result = model.fit()

In [13]:
# Render the fitted model's summary tables: fit statistics, per-coefficient estimates with
# standard errors, t-values, p-values and 95% intervals, and residual diagnostics.
result.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.518
Model:,OLS,Adj. R-squared:,0.507
Method:,Least Squares,F-statistic:,46.27
Date:,"Tue, 15 Jun 2021",Prob (F-statistic):,3.83e-62
Time:,17:02:08,Log-Likelihood:,-2386.0
No. Observations:,442,AIC:,4794.0
Df Residuals:,431,BIC:,4839.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,152.1335,2.576,59.061,0.000,147.071,157.196
age,-10.0122,59.749,-0.168,0.867,-127.448,107.424
sex,-239.8191,61.222,-3.917,0.000,-360.151,-119.488
bmi,519.8398,66.534,7.813,0.000,389.069,650.610
bp,324.3904,65.422,4.958,0.000,195.805,452.976
s1,-792.1842,416.684,-1.901,0.058,-1611.169,26.801
s2,476.7458,339.035,1.406,0.160,-189.621,1143.113
s3,101.0446,212.533,0.475,0.635,-316.685,518.774
s4,177.0642,161.476,1.097,0.273,-140.313,494.442

0,1,2,3
Omnibus:,1.506,Durbin-Watson:,2.029
Prob(Omnibus):,0.471,Jarque-Bera (JB):,1.404
Skew:,0.017,Prob(JB):,0.496
Kurtosis:,2.726,Cond. No.,227.0
