In [74]:
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
import numpy as np

In [78]:
# Data source: https://datahub.io/machine-learning/diabetes
# Read the CSV (expected next to this notebook) and preview the first five rows.
diabetes_df = pd.read_csv(filepath_or_buffer="diabetes_csv.csv")
diabetes_df.head(n=5)

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,label
0,6,148,72,35,0,33.6,0.627,50,tested_positive
1,1,85,66,29,0,26.6,0.351,31,tested_negative
2,8,183,64,0,0,23.3,0.672,32,tested_positive
3,1,89,66,23,94,28.1,0.167,21,tested_negative
4,0,137,40,35,168,43.1,2.288,33,tested_positive


In [80]:
# Encode the outcome as integers: tested_positive -> 1, tested_negative -> 0.
# The cast is assigned back explicitly: the original `astype(...)` call discarded
# its result, so the integer dtype relied on `replace` silently downcasting the
# object column (behaviour that newer pandas versions deprecate).
# `replace` leaves already-encoded 0/1 values untouched, so re-running this cell is safe;
# any unexpected label string makes the int64 cast raise instead of passing silently.
label_codes = {"tested_positive": 1, "tested_negative": 0}
diabetes_df['label'] = diabetes_df['label'].replace(label_codes).astype("int64")

diabetes_df.dtypes

preg       int64
plas       int64
pres       int64
skin       int64
insu       int64
mass     float64
pedi     float64
age        int64
label      int64
dtype: object

# Simple Logistic Regression
## dependent variable : label

## label ~ preg

In [84]:
# Show the 0/1 outcome column, then fit a binomial GLM (logit link) of label on preg.
print(diabetes_df["label"])
model = sm.GLM.from_formula(
    formula="label ~ preg",
    data=diabetes_df,
    family=sm.families.Binomial(),
)
res = model.fit()
res.summary()

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: label, Length: 768, dtype: int64


0,1,2,3
Dep. Variable:,label,No. Observations:,768.0
Model:,GLM,Df Residuals:,766.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-478.1
Date:,"Thu, 10 Nov 2022",Deviance:,956.21
Time:,14:52:05,Pearson chi2:,770.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.1767,0.123,-9.558,0.000,-1.418,-0.935
preg,0.1372,0.023,5.986,0.000,0.092,0.182


In [85]:
# Odds-ratio table for the fitted model: exponentiated coefficients, their p-values,
# and the exponentiated bounds of the confidence interval.
model_stats = pd.concat(
    [np.exp(res.params), res.pvalues, np.exp(res.conf_int())],
    axis=1,
).set_axis(["OddsRatio", "P >|z|", "2.5%", "97.5%"], axis=1)
model_stats

Unnamed: 0,OddsRatio,P >|z|,2.5%,97.5%
Intercept,0.308279,1.205003e-21,0.242183,0.392415
preg,1.147008,2.147445e-09,1.096639,1.199691


log-odds = -1.1767 + 0.1372 × preg (equivalently, odds = 0.308279 × 1.147008^preg)
The p-value of preg is 2.147445e-09, which is less than 0.05, so we reject the null hypothesis that its coefficient is zero. Also, the 95% confidence interval of the OR, [1.096639, 1.199691], does not contain 1. preg has a significant effect on the label (whether the patient tests positive).

# label ~ plas

In [86]:
# Binomial GLM (logit link) of label on a single predictor: plas.
model = sm.GLM.from_formula(
    formula="label ~ plas",
    data=diabetes_df,
    family=sm.families.Binomial(),
)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,768.0
Model:,GLM,Df Residuals:,766.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-404.36
Date:,"Thu, 10 Nov 2022",Deviance:,808.72
Time:,15:21:03,Pearson chi2:,1140.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-5.3501,0.421,-12.713,0.000,-6.175,-4.525
plas,0.0379,0.003,11.647,0.000,0.031,0.044


In [87]:
# Odds-ratio table for the fitted model: exponentiated coefficients, their p-values,
# and the exponentiated bounds of the confidence interval.
model_stats = pd.concat(
    [np.exp(res.params), res.pvalues, np.exp(res.conf_int())],
    axis=1,
).set_axis(["OddsRatio", "P >|z|", "2.5%", "97.5%"], axis=1)
model_stats

Unnamed: 0,OddsRatio,P >|z|,2.5%,97.5%
Intercept,0.004748,4.998114e-37,0.002081,0.010832
plas,1.038599,2.380722e-31,1.032001,1.04524


log-odds = -5.3501 + 0.0379 × plas (equivalently, odds = 0.004748 × 1.038599^plas)
The p-value of plas is 2.380722e-31, which is less than 0.05, so we reject the null hypothesis that its coefficient is zero. Also, the 95% confidence interval of the OR, [1.032001, 1.045240], does not contain 1. plas has a significant effect on the label (whether the patient tests positive).

# label ~ pres

In [89]:
# Binomial GLM (logit link) of label on a single predictor: pres.
model = sm.GLM.from_formula(
    formula="label ~ pres",
    data=diabetes_df,
    family=sm.families.Binomial(),
)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,768.0
Model:,GLM,Df Residuals:,766.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-495.06
Date:,"Thu, 10 Nov 2022",Deviance:,990.13
Time:,15:21:36,Pearson chi2:,770.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.1401,0.300,-3.803,0.000,-1.728,-0.552
pres,0.0074,0.004,1.793,0.073,-0.001,0.016


In [90]:
# Odds-ratio table for the fitted model: exponentiated coefficients, their p-values,
# and the exponentiated bounds of the confidence interval.
model_stats = pd.concat(
    [np.exp(res.params), res.pvalues, np.exp(res.conf_int())],
    axis=1,
).set_axis(["OddsRatio", "P >|z|", "2.5%", "97.5%"], axis=1)
model_stats

Unnamed: 0,OddsRatio,P >|z|,2.5%,97.5%
Intercept,0.31979,0.000143,0.177687,0.575537
pres,1.007452,0.072994,0.999308,1.015663


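log-odds = -1.1401 + 0.0074 × pres (equivalently, odds = 0.319790 × 1.007452^pres)
The p-value of pres is 0.072994, which is greater than 0.05, so we do not reject the null hypothesis that its coefficient is zero. Also, the 95% confidence interval of the OR, [0.999308, 1.015663], contains 1. pres does not have a significant effect on the label (whether the patient tests positive) at the 5% level.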

# label ~ skin

In [91]:
# Binomial GLM (logit link) of label on a single predictor: skin.
model = sm.GLM.from_formula(
    formula="label ~ skin",
    data=diabetes_df,
    family=sm.families.Binomial(),
)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,768.0
Model:,GLM,Df Residuals:,766.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-494.59
Date:,"Thu, 10 Nov 2022",Deviance:,989.19
Time:,15:22:30,Pearson chi2:,769.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.8299,0.127,-6.544,0.000,-1.078,-0.581
skin,0.0099,0.005,2.066,0.039,0.001,0.019


In [92]:
# Odds-ratio table for the fitted model: exponentiated coefficients, their p-values,
# and the exponentiated bounds of the confidence interval.
model_stats = pd.concat(
    [np.exp(res.params), res.pvalues, np.exp(res.conf_int())],
    axis=1,
).set_axis(["OddsRatio", "P >|z|", "2.5%", "97.5%"], axis=1)
model_stats

Unnamed: 0,OddsRatio,P >|z|,2.5%,97.5%
Intercept,0.436113,5.999004e-11,0.340137,0.559172
skin,1.009911,0.03881576,1.000507,1.019404


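log-odds = -0.8299 + 0.0099 × skin (equivalently, odds = 0.436113 × 1.009911^skin)
The p-value of skin is 3.881576e-02, which is less than 0.05, so we reject the null hypothesis that its coefficient is zero. Also, the 95% confidence interval of the OR, [1.000507, 1.019404], does not contain 1 (although its lower bound is very close to 1). skin has a significant effect on the label (whether the patient tests positive) at the 5% level.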

# label ~ insu

In [101]:
# Binomial GLM (logit link) of label on a single predictor: insu.
model = sm.GLM.from_formula(
    formula="label ~ insu",
    data=diabetes_df,
    family=sm.families.Binomial(),
)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,768.0
Model:,GLM,Df Residuals:,766.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-490.41
Date:,"Thu, 10 Nov 2022",Deviance:,980.81
Time:,15:30:04,Pearson chi2:,769.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.8145,0.094,-8.632,0.000,-0.999,-0.630
insu,0.0023,0.001,3.518,0.000,0.001,0.004


In [102]:
# Odds-ratio table for the fitted model: exponentiated coefficients, their p-values,
# and the exponentiated bounds of the confidence interval.
model_stats = pd.concat(
    [np.exp(res.params), res.pvalues, np.exp(res.conf_int())],
    axis=1,
).set_axis(["OddsRatio", "P >|z|", "2.5%", "97.5%"], axis=1)
model_stats

Unnamed: 0,OddsRatio,P >|z|,2.5%,97.5%
Intercept,0.442856,6.024111e-18,0.368082,0.53282
insu,1.002301,0.0004353455,1.001019,1.003586


log-odds = -0.8145 + 0.0023 × insu (equivalently, odds = 0.442856 × 1.002301^insu)
The p-value of insu is 4.353455e-04, which is less than 0.05, so we reject the null hypothesis that its coefficient is zero. Also, the 95% confidence interval of the OR, [1.001019, 1.003586], does not contain 1. insu has a significant effect on the label (whether the patient tests positive).

# label ~ mass

In [93]:
# Binomial GLM (logit link) of label on a single predictor: mass.
model = sm.GLM.from_formula(
    formula="label ~ mass",
    data=diabetes_df,
    family=sm.families.Binomial(),
)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,768.0
Model:,GLM,Df Residuals:,766.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-460.36
Date:,"Thu, 10 Nov 2022",Deviance:,920.71
Time:,15:23:03,Pearson chi2:,805.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.6864,0.409,-9.014,0.000,-4.488,-2.885
mass,0.0935,0.012,7.761,0.000,0.070,0.117


In [94]:
# Odds-ratio table for the fitted model: exponentiated coefficients, their p-values,
# and the exponentiated bounds of the confidence interval.
model_stats = pd.concat(
    [np.exp(res.params), res.pvalues, np.exp(res.conf_int())],
    axis=1,
).set_axis(["OddsRatio", "P >|z|", "2.5%", "97.5%"], axis=1)
model_stats

Unnamed: 0,OddsRatio,P >|z|,2.5%,97.5%
Intercept,0.025062,1.9839349999999999e-19,0.011244,0.055862
mass,1.098044,8.449577e-15,1.072411,1.12429


log-odds = -3.6864 + 0.0935 × mass (equivalently, odds = 0.025062 × 1.098044^mass)
The p-value of mass is 8.449577e-15, which is less than 0.05, so we reject the null hypothesis that its coefficient is zero. Also, the 95% confidence interval of the OR, [1.072411, 1.124290], does not contain 1. mass has a significant effect on the label (whether the patient tests positive).

# label ~ pedi

In [95]:
# Binomial GLM (logit link) of label on a single predictor: pedi.
model = sm.GLM.from_formula(
    formula="label ~ pedi",
    data=diabetes_df,
    family=sm.families.Binomial(),
)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,768.0
Model:,GLM,Df Residuals:,766.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-485.43
Date:,"Thu, 10 Nov 2022",Deviance:,970.86
Time:,15:23:37,Pearson chi2:,767.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.1473,0.138,-8.319,0.000,-1.418,-0.877
pedi,1.0828,0.234,4.627,0.000,0.624,1.541


In [96]:
# Odds-ratio table for the fitted model: exponentiated coefficients, their p-values,
# and the exponentiated bounds of the confidence interval.
model_stats = pd.concat(
    [np.exp(res.params), res.pvalues, np.exp(res.conf_int())],
    axis=1,
).set_axis(["OddsRatio", "P >|z|", "2.5%", "97.5%"], axis=1)
model_stats

Unnamed: 0,OddsRatio,P >|z|,2.5%,97.5%
Intercept,0.317478,8.893610000000001e-17,0.242277,0.416021
pedi,2.953073,3.702926e-06,1.866751,4.67156


log-odds = -1.1473 + 1.0828 × pedi (equivalently, odds = 0.317478 × 2.953073^pedi)
The p-value of pedi is 3.702926e-06, which is less than 0.05, so we reject the null hypothesis that its coefficient is zero. Also, the 95% confidence interval of the OR, [1.866751, 4.671560], does not contain 1. pedi has a significant effect on the label (whether the patient tests positive).

# label ~ age

In [98]:
# Binomial GLM (logit link) of label on a single predictor: age.
model = sm.GLM.from_formula(
    formula="label ~ age",
    data=diabetes_df,
    family=sm.families.Binomial(),
)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,768.0
Model:,GLM,Df Residuals:,766.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-475.36
Date:,"Thu, 10 Nov 2022",Deviance:,950.72
Time:,15:24:09,Pearson chi2:,761.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.0475,0.239,-8.572,0.000,-2.516,-1.579
age,0.0420,0.007,6.380,0.000,0.029,0.055


In [99]:
# Odds-ratio table for the fitted model: exponentiated coefficients, their p-values,
# and the exponentiated bounds of the confidence interval.
model_stats = pd.concat(
    [np.exp(res.params), res.pvalues, np.exp(res.conf_int())],
    axis=1,
).set_axis(["OddsRatio", "P >|z|", "2.5%", "97.5%"], axis=1)
model_stats

Unnamed: 0,OddsRatio,P >|z|,2.5%,97.5%
Intercept,0.129056,1.0129080000000002e-17,0.080811,0.206103
age,1.042922,1.773155e-10,1.029543,1.056475


log-odds = -2.0475 + 0.0420 × age (equivalently, odds = 0.129056 × 1.042922^age)
The p-value of age is 1.773155e-10, which is less than 0.05, so we reject the null hypothesis that its coefficient is zero. Also, the 95% confidence interval of the OR, [1.029543, 1.056475], does not contain 1. age has a significant effect on the label (whether the patient tests positive).

# Multiple Logistic Regression

In [100]:
# Re-display the column dtypes before fitting the multi-predictor models.
diabetes_df.dtypes

preg       int64
plas       int64
pres       int64
skin       int64
insu       int64
mass     float64
pedi     float64
age        int64
label      int64
dtype: object

## Model 1

In [123]:
# Binomial GLM (logit link) of label on preg, plas and pres.
model = sm.GLM.from_formula(
    formula="label ~ preg + plas + pres",
    data=diabetes_df,
    family=sm.families.Binomial(),
)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,768.0
Model:,GLM,Df Residuals:,764.0
Model Family:,Binomial,Df Model:,3.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-392.03
Date:,"Thu, 10 Nov 2022",Deviance:,784.06
Time:,16:12:45,Pearson chi2:,1060.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-5.5245,0.499,-11.077,0.000,-6.502,-4.547
preg,0.1263,0.026,4.888,0.000,0.076,0.177
plas,0.0375,0.003,11.308,0.000,0.031,0.044
pres,-0.0042,0.004,-0.946,0.344,-0.013,0.005


In [124]:
# Odds-ratio table for the fitted model: exponentiated coefficients, their p-values,
# and the exponentiated bounds of the confidence interval.
model_stats = pd.concat(
    [np.exp(res.params), res.pvalues, np.exp(res.conf_int())],
    axis=1,
).set_axis(["OddsRatio", "P >|z|", "2.5%", "97.5%"], axis=1)
model_stats

Unnamed: 0,OddsRatio,P >|z|,2.5%,97.5%
Intercept,0.003988,1.6299920000000001e-28,0.0015,0.0106
preg,1.134615,1.018466e-06,1.078589,1.193552
plas,1.038228,1.200639e-29,1.031499,1.045001
pres,0.995774,0.3442929,0.987073,1.004552


In [125]:
# AIC (Akaike information criterion) of the model fitted above; lower is better when comparing candidate models.
res.aic

792.062065104257

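After fitting different combinations of features to the multiple logistic regression model, Model 1 with features preg + plas + pres has an AIC of 792.062065104257. preg and plas have p-values below 0.05, but pres has a p-value bigger than 0.05 (0.344), so pres is not significant once preg and plas are in the model.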

## Model 2

In [126]:
# Binomial GLM (logit link) of label on skin, insu and mass.
model = sm.GLM.from_formula(
    formula="label ~ skin + insu + mass",
    data=diabetes_df,
    family=sm.families.Binomial(),
)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,768.0
Model:,GLM,Df Residuals:,764.0
Model Family:,Binomial,Df Model:,3.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-455.39
Date:,"Thu, 10 Nov 2022",Deviance:,910.78
Time:,16:12:50,Pearson chi2:,820.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.7843,0.415,-9.109,0.000,-4.599,-2.970
skin,-0.0141,0.006,-2.382,0.017,-0.026,-0.002
insu,0.0021,0.001,2.808,0.005,0.001,0.004
mass,0.1002,0.013,7.643,0.000,0.074,0.126


In [127]:
# Odds-ratio table for the fitted model: exponentiated coefficients, their p-values,
# and the exponentiated bounds of the confidence interval.
model_stats = pd.concat(
    [np.exp(res.params), res.pvalues, np.exp(res.conf_int())],
    axis=1,
).set_axis(["OddsRatio", "P >|z|", "2.5%", "97.5%"], axis=1)
model_stats

Unnamed: 0,OddsRatio,P >|z|,2.5%,97.5%
Intercept,0.022725,8.334434e-20,0.010066,0.051302
skin,0.985993,0.01721398,0.974615,0.997503
insu,1.00212,0.004989216,1.00064,1.003603
mass,1.105338,2.117723e-14,1.077313,1.134093


In [128]:
# AIC (Akaike information criterion) of the model fitted above; lower is better when comparing candidate models.
res.aic

918.777966581488

After fitting different combinations of features to the multiple logistic regression model, Model 2 with features skin + insu + mass has an AIC of 918.777966581488. None of the variables has a p-value bigger than 0.05.

## Model 3

In [129]:
# Binomial GLM (logit link) of label on pedi and age.
model = sm.GLM.from_formula(
    formula="label ~ pedi + age",
    data=diabetes_df,
    family=sm.families.Binomial(),
)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,768.0
Model:,GLM,Df Residuals:,765.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-464.55
Date:,"Thu, 10 Nov 2022",Deviance:,929.1
Time:,16:12:57,Pearson chi2:,758.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.5717,0.272,-9.465,0.000,-3.104,-2.039
pedi,1.0756,0.237,4.543,0.000,0.612,1.540
age,0.0421,0.007,6.308,0.000,0.029,0.055


In [130]:
# Odds-ratio table for the fitted model: exponentiated coefficients, their p-values,
# and the exponentiated bounds of the confidence interval.
model_stats = pd.concat(
    [np.exp(res.params), res.pvalues, np.exp(res.conf_int())],
    axis=1,
).set_axis(["OddsRatio", "P >|z|", "2.5%", "97.5%"], axis=1)
model_stats

Unnamed: 0,OddsRatio,P >|z|,2.5%,97.5%
Intercept,0.076408,2.93138e-21,0.044861,0.130139
pedi,2.931847,5.532865e-06,1.843431,4.662895
age,1.042988,2.828964e-10,1.029437,1.056718


In [131]:
# AIC (Akaike information criterion) of the model fitted above; lower is better when comparing candidate models.
res.aic

935.103660096041

After fitting different combinations of features to the multiple logistic regression model, Model 3 with features pedi + age has an AIC of 935.103660096041. None of the variables has a p-value bigger than 0.05.

## Model 4

In [132]:
# Binomial GLM (logit link) of label on preg, plas, pres, skin and insu.
model = sm.GLM.from_formula(
    formula="label ~ preg + plas + pres + skin + insu",
    data=diabetes_df,
    family=sm.families.Binomial(),
)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,768.0
Model:,GLM,Df Residuals:,762.0
Model Family:,Binomial,Df Model:,5.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-389.05
Date:,"Thu, 10 Nov 2022",Deviance:,778.09
Time:,16:13:02,Pearson chi2:,1000.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-5.7753,0.520,-11.108,0.000,-6.794,-4.756
preg,0.1308,0.026,4.945,0.000,0.079,0.183
plas,0.0390,0.004,10.947,0.000,0.032,0.046
pres,-0.0066,0.005,-1.426,0.154,-0.016,0.002
skin,0.0153,0.006,2.397,0.017,0.003,0.028
insu,-0.0012,0.001,-1.406,0.160,-0.003,0.000


In [133]:
# Odds-ratio table for the fitted model: exponentiated coefficients, their p-values,
# and the exponentiated bounds of the confidence interval.
model_stats = pd.concat(
    [np.exp(res.params), res.pvalues, np.exp(res.conf_int())],
    axis=1,
).set_axis(["OddsRatio", "P >|z|", "2.5%", "97.5%"], axis=1)
model_stats

Unnamed: 0,OddsRatio,P >|z|,2.5%,97.5%
Intercept,0.003103,1.149423e-28,0.00112,0.008597
preg,1.139741,7.59594e-07,1.082164,1.200382
plas,1.039797,6.852876000000001e-28,1.032558,1.047088
pres,0.993378,0.1538144,0.984349,1.00249
skin,1.015372,0.01654819,1.002783,1.028119
insu,0.998792,0.1596727,0.997112,1.000476


In [134]:
# AIC (Akaike information criterion) of the model fitted above; lower is better when comparing candidate models.
res.aic

790.090700166048

After fitting different combinations of features to the multiple logistic regression model, Model 4 with features preg + plas + pres + skin + insu has an AIC of 790.090700166048. preg, plas and skin have p-values below 0.05, but pres (0.154) and insu (0.160) have p-values bigger than 0.05.

## Model 5

In [141]:
# Binomial GLM (logit link) of label on preg, skin and insu.
model = sm.GLM.from_formula(
    formula="label ~ preg + skin + insu",
    data=diabetes_df,
    family=sm.families.Binomial(),
)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,768.0
Model:,GLM,Df Residuals:,764.0
Model Family:,Binomial,Df Model:,3.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-469.23
Date:,"Thu, 10 Nov 2022",Deviance:,938.45
Time:,16:48:01,Pearson chi2:,772.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.5303,0.172,-8.910,0.000,-1.867,-1.194
preg,0.1483,0.023,6.318,0.000,0.102,0.194
skin,0.0052,0.006,0.943,0.346,-0.006,0.016
insu,0.0024,0.001,3.256,0.001,0.001,0.004


In [142]:
# Odds-ratio table for the fitted model: exponentiated coefficients, their p-values,
# and the exponentiated bounds of the confidence interval.
model_stats = pd.concat(
    [np.exp(res.params), res.pvalues, np.exp(res.conf_int())],
    axis=1,
).set_axis(["OddsRatio", "P >|z|", "2.5%", "97.5%"], axis=1)
model_stats

Unnamed: 0,OddsRatio,P >|z|,2.5%,97.5%
Intercept,0.216481,5.108384e-19,0.154607,0.303118
preg,1.159847,2.643039e-10,1.107703,1.214444
skin,1.0052,0.3457878,0.99442,1.016095
insu,1.002421,0.00113161,1.000963,1.003881


In [143]:
# AIC (Akaike information criterion) of the model fitted above; lower is better when comparing candidate models.
res.aic

946.4545164725346

After fitting different combinations of features to the multiple logistic regression model, Model 5 with features preg + skin + insu has an AIC of 946.4545164725346. skin has a p-value bigger than 0.05 (0.346).

However, we choose Model 4 as the best of the five models, since it has the lowest AIC: 790.090700166048. Note that pres and insu are not individually significant in Model 4 (p > 0.05).