In [20]:
# Import dependencies

import pandas as pd
import numpy as np
from scipy.stats import linregress
from datetime import datetime as dt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
import statsmodels.api as sm
from scipy import stats


In [2]:
# Read the data into a Pandas DataFrame

heart_2020_df = pd.read_csv('Resources/heart_2020_cleaned.csv')
heart_2020_df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [3]:
heart_2020_df = heart_2020_df.rename(columns={"PhysicalHealth": "PhysicalHealthDays", "MentalHealth": "MentalHealthDays", "DiffWalking": "DifficultyWalking", "GenHealth": "GeneralHealth", "SleepTime": "HoursOfSleep", "PhysicalActivity": "PhysicalActivities"}
                                 )
heart_2020_df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealthDays,MentalHealthDays,DifficultyWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivities,GeneralHealth,HoursOfSleep,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [4]:
heart_2020_cleaned = heart_2020_df[heart_2020_df["Diabetic"].str.contains("No, borderline diabetes|Yes, during pregnancy") == False]
heart_2020_cleaned

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealthDays,MentalHealthDays,DifficultyWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivities,GeneralHealth,HoursOfSleep,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [5]:
heart_2020_cleaned.to_csv("Resources/2020_cleaned.csv", encoding='utf8', index=False)

In [6]:
dummies = pd.get_dummies(heart_2020_cleaned)
dummies.head()

Unnamed: 0,BMI,PhysicalHealthDays,MentalHealthDays,HoursOfSleep,HeartDisease_No,HeartDisease_Yes,Smoking_No,Smoking_Yes,AlcoholDrinking_No,AlcoholDrinking_Yes,...,GeneralHealth_Fair,GeneralHealth_Good,GeneralHealth_Poor,GeneralHealth_Very good,Asthma_No,Asthma_Yes,KidneyDisease_No,KidneyDisease_Yes,SkinCancer_No,SkinCancer_Yes
0,16.6,3.0,30.0,5.0,1,0,0,1,1,0,...,0,0,0,1,0,1,1,0,0,1
1,20.34,0.0,0.0,7.0,1,0,1,0,1,0,...,0,0,0,1,1,0,1,0,1,0
2,26.58,20.0,30.0,8.0,1,0,0,1,1,0,...,1,0,0,0,0,1,1,0,1,0
3,24.21,0.0,0.0,6.0,1,0,1,0,1,0,...,0,1,0,0,1,0,1,0,0,1
4,23.71,28.0,0.0,8.0,1,0,1,0,1,0,...,0,0,0,1,1,0,1,0,1,0


In [7]:
x=dummies["HoursOfSleep"]
x = [[item] for item in x]
y=dummies["HeartDisease_Yes"]

x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=123, stratify=y)


logreg = LogisticRegression().fit(x_train,y_train)
logreg

print("Training set score: {:.3f}".format(logreg.score(x_train,y_train)))
print("Test set score: {:.3f}".format(logreg.score(x_test,y_test)))

logit_model=sm.Logit(y,x)
result=logit_model.fit()
print(result.summary())


Training set score: 0.915
Test set score: 0.915


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


Optimization terminated successfully.
         Current function value: inf
         Iterations 6




                           Logit Regression Results                           
Dep. Variable:       HeartDisease_Yes   No. Observations:               310455
Model:                          Logit   Df Residuals:                   310454
Method:                           MLE   Df Model:                            0
Date:                Wed, 13 Dec 2023   Pseudo R-squ.:                     inf
Time:                        16:54:30   Log-Likelihood:                   -inf
converged:                       True   LL-Null:                        0.0000
Covariance Type:            nonrobust   LLR p-value:                       nan
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -0.3319      0.001   -359.697      0.000      -0.334      -0.330




In [25]:
x = dummies["MentalHealthDays"]
x = [[item] for item in x]
y = dummies['HeartDisease_Yes']
 
# with sklearn
regr = linear_model.LinearRegression()
regr.fit(x, y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# with statsmodels
x = sm.add_constant(x) # adding a constant
 
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
 
print_model = model.summary()
print(print_model)

Intercept: 
 0.08142757870477409
Coefficients: 
 [0.00099537]
                            OLS Regression Results                            
Dep. Variable:       HeartDisease_Yes   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     247.7
Date:                Wed, 13 Dec 2023   Prob (F-statistic):           8.69e-56
Time:                        17:18:41   Log-Likelihood:                -44417.
No. Observations:              310455   AIC:                         8.884e+04
Df Residuals:                  310453   BIC:                         8.886e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------