In [13]:
import pandas as pd
import statsmodels.formula.api as smf
from math import exp

# Load the heart-disease dataset and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was saved into the CSV -- consider index_col=0; confirm first.
heart_csv_path = 'Heart.csv'
heart = pd.read_csv(heart_csv_path)
heart.head()

Unnamed: 0,HeartDisease,Age,Male,ChestPainType,BloodPressure,Cholesterol,BloodSugar,EEG,MaxHR,Angina,OldPeak,PeakST,Flourosopy,Thal
0,1,70,1,4,130,322,0,2,109,0,2.4,2,3,3
1,0,67,0,3,115,564,0,2,160,0,1.6,2,0,7
2,1,57,1,2,124,261,0,0,141,0,0.3,1,0,7
3,0,64,1,4,128,263,0,0,105,1,0.2,2,1,7
4,0,74,0,2,120,269,0,2,121,1,0.2,1,1,3


In [7]:
# Model 1: simple logistic regression predicting HeartDisease from Male only.
# Step 1 -- specification: describe the model (formula + data); nothing is
# estimated yet.
model_1_formula = 'HeartDisease ~ Male'
heart_model_1 = smf.logit(formula=model_1_formula, data=heart)

# Step 2 -- training: fitting the specification to the data is what actually
# estimates the coefficients.
model_1_results = heart_model_1.fit()

Optimization terminated successfully.
         Current function value: 0.640593
         Iterations 5


In [8]:
# Present the findings for model 1.
# Reading the summary header:
#   - LL-Null        : log-likelihood of the baseline (intercept-only) model
#   - Log-Likelihood : log-likelihood of our model; closer to zero than LL-Null
#                      means it fits the data better than the baseline
#   - LLR p-value    : tests whether that improvement over the baseline is
#                      statistically significant
# This is a simple logistic regression: it uses only one feature
# (independent variable).
model_1_summary = model_1_results.summary()
print(model_1_summary)

                           Logit Regression Results                           
Dep. Variable:           HeartDisease   No. Observations:                  270
Model:                          Logit   Df Residuals:                      268
Method:                           MLE   Df Model:                            1
Date:                Sun, 14 Jul 2024   Pseudo R-squ.:                 0.06750
Time:                        22:42:05   Log-Likelihood:                -172.96
converged:                       True   LL-Null:                       -185.48
Covariance Type:            nonrobust   LLR p-value:                 5.618e-07
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -1.2090      0.255     -4.745      0.000      -1.708      -0.710
Male           1.3953      0.295      4.731      0.000       0.817       1.973


In [9]:
# Model 2: logistic regression predicting HeartDisease from Male and Age.
# Step 1 -- specification: describe the model (formula + data).
model_2_formula = 'HeartDisease ~ Male + Age'
heart_model_2 = smf.logit(formula=model_2_formula, data=heart)

# Step 2 -- training: fitting the specification to the data is what actually
# estimates the coefficients.
model_2_results = heart_model_2.fit()

Optimization terminated successfully.
         Current function value: 0.607039
         Iterations 5


In [10]:
# Present the findings for model 2.
#   - Log-Likelihood is closer to zero than both LL-Null (baseline) and the
#     value reported for model 1, i.e. this model fits the data better.
#     Caveat: adding a predictor never makes the in-sample log-likelihood
#     worse, so also check the new coefficient's p-value before concluding
#     the extra feature is worthwhile.
#   - Df Model = 2 because the model uses 2 features (Male and Age).
model_2_summary = model_2_results.summary()
print(model_2_summary)

                           Logit Regression Results                           
Dep. Variable:           HeartDisease   No. Observations:                  270
Model:                          Logit   Df Residuals:                      267
Method:                           MLE   Df Model:                            2
Date:                Sun, 14 Jul 2024   Pseudo R-squ.:                  0.1163
Time:                        22:55:46   Log-Likelihood:                -163.90
converged:                       True   LL-Null:                       -185.48
Covariance Type:            nonrobust   LLR p-value:                 4.249e-10
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -4.8637      0.959     -5.071      0.000      -6.744      -2.984
Male           1.6222      0.315      5.156      0.000       1.006       2.239
Age            0.0639      0.016      4.060      0.0

In [11]:
# Model 3: logistic regression predicting HeartDisease from Male, Age and Angina.
# Step 1 -- specification: describe the model (formula + data).
model_3_formula = 'HeartDisease ~ Male + Age + Angina'
heart_model_3 = smf.logit(formula=model_3_formula, data=heart)

# Step 2 -- training: fitting the specification to the data is what actually
# estimates the coefficients.
model_3_results = heart_model_3.fit()

Optimization terminated successfully.
         Current function value: 0.538839
         Iterations 6


In [12]:
# Present the findings for model 3.
# Log-Likelihood is closer to zero than LL-Null (baseline) and than the values
# reported for models 1 and 2, i.e. this is the best in-sample fit of the three.
# Caveat: adding a predictor never makes the in-sample log-likelihood worse,
# so also check the new coefficient's p-value.
model_3_summary = model_3_results.summary()
print(model_3_summary)

                           Logit Regression Results                           
Dep. Variable:           HeartDisease   No. Observations:                  270
Model:                          Logit   Df Residuals:                      266
Method:                           MLE   Df Model:                            3
Date:                Sun, 14 Jul 2024   Pseudo R-squ.:                  0.2156
Time:                        23:00:01   Log-Likelihood:                -145.49
converged:                       True   LL-Null:                       -185.48
Covariance Type:            nonrobust   LLR p-value:                 3.090e-17
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -5.2011      1.030     -5.051      0.000      -7.219      -3.183
Male           1.4648      0.333      4.404      0.000       0.813       2.117
Age            0.0614      0.017      3.643      0.0

In [14]:
# Interpreting coefficients: logit coefficients are on the log-odds scale,
# so exponentiating a coefficient converts it to an odds ratio.
male_coef = model_3_results.params['Male']
male_odds_ratio = exp(male_coef)
print(male_odds_ratio)

4.326528261966835


In [16]:
# Odds ratio for the Age coefficient.
# A value of about 1.063 means each additional year of age multiplies the odds
# of heart disease by ~1.063, i.e. a ~6.3% increase in the odds (not in the
# probability), holding Male and Angina constant.
age_coef = model_3_results.params['Age']
age_odds_ratio = exp(age_coef)
print(age_odds_ratio)

1.0633171823751426


In [15]:
# Odds ratio for the Angina coefficient: exponentiate the log-odds coefficient.
angina_coef = model_3_results.params['Angina']
angina_odds_ratio = exp(angina_coef)
print(angina_odds_ratio)

6.020938842080951


## Interpreting 3rd model

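The model estimates the following odds ratios (each holding the other two predictors constant):
    > Being male multiplies the odds of heart disease by about 4.33
    > Each additional year of age raises the odds of heart disease by about 6.3%
    > Having angina multiplies the odds of heart disease by about 6.02

Note that these are odds ratios, not relative risks: an odds ratio of 4.33 does not mean heart disease is 4.33 times as likely.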