In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
from sklearn.metrics import roc_curve
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
warnings.filterwarnings('ignore')
plt.style.use('seaborn')
plt.rcParams['figure.figsize']=(6,3)
plt.rcParams['figure.dpi']=200

In [2]:
df=pd.read_csv('southafricanheart.csv')
df=df.drop("Unnamed: 0", axis =1)

In [3]:
df

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,160,12.00,5.73,23.11,Present,49,25.30,97.20,52,1
1,144,0.01,4.41,28.61,Absent,55,28.87,2.06,63,1
2,118,0.08,3.48,32.28,Present,52,29.14,3.81,46,0
3,170,7.50,6.41,38.03,Present,51,31.99,24.26,58,1
4,134,13.60,3.50,27.78,Present,60,25.99,57.34,49,1
5,132,6.20,6.47,36.21,Present,62,30.77,14.14,45,0
6,142,4.05,3.38,16.20,Absent,59,20.81,2.62,38,0
7,114,4.08,4.59,14.60,Present,62,23.11,6.72,58,1
8,114,0.00,3.83,19.40,Present,49,24.86,2.49,29,0
9,132,0.00,5.80,30.96,Present,69,30.11,0.00,53,1


In [4]:
df['famhist'].value_counts()


Absent     270
Present    192
Name: famhist, dtype: int64

In [5]:
df['famhist']=df['famhist'].replace(['Present','Absent'],[1,0])

In [6]:
m1_logit = smf.logit('chd ~ famhist', df).fit()

Optimization terminated successfully.
         Current function value: 0.608111
         Iterations 5


In [7]:
print(m1_logit.summary2())

                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.057     
Dependent Variable: chd              AIC:              565.8944  
Date:               2018-12-14 16:05 BIC:              574.1655  
No. Observations:   462              Log-Likelihood:   -280.95   
Df Model:           1                LL-Null:          -298.05   
Df Residuals:       460              LLR p-value:      4.9371e-09
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     5.0000                                       
------------------------------------------------------------------
               Coef.   Std.Err.     z     P>|z|    [0.025   0.975]
------------------------------------------------------------------
Intercept     -1.1690    0.1431  -8.1687  0.0000  -1.4495  -0.8885
famhist        1.1690    0.2033   5.7514  0.0000   0.7706   1.5674



In [8]:
estimate_y1=m1_logit.params['Intercept']+(m1_logit.params['famhist']*1)
estimate_y0=m1_logit.params['Intercept']+(m1_logit.params['famhist']*0)
print("El log odds estimado es de ", round(estimate_y1, 2))
print("El log odds estimado es de ", round(estimate_y0, 2))

El log odds estimado es de  -0.0
El log odds estimado es de  -1.17


In [9]:
def invlogit(x):
    return 1 / (1+np.exp(-x))



print(invlogit(estimate_y1)-invlogit(estimate_y0))

0.2629629629629631


In [10]:
#La probabilidad de un individuo con antecedentes familiares de tener una enfermedad coronaria es de 0,5
#La probabilidad de un individuo sin antecedentes familiares de tener una enfermedad coronaria es de 0,24 
#La diferencia en la probabilidad entre un individuo con antecedentes y otro sin antecedentes es de 0,26

In [11]:
m1_ols = smf.ols('chd ~ famhist', df).fit()

In [12]:
def concise_summary(mod,print_fit=True):
    fit = pd.DataFrame({'Statistics': mod.summary2().tables[0][2][2:],
                       'Value': mod.summary2().tables[0][3][28:]})
    estimates = pd.DataFrame(mod.summary2().tables[1].loc[:, 'Coef.'])
    if print_fit is True:
        print("\nGoodness of Fit statistics\n", fit)
        print("\nPoint Estimates\n\n", estimates)

        

In [13]:
concise_summary(m1_ols)


Goodness of Fit statistics
             Statistics Value
2                 BIC:   NaN
3      Log-Likelihood:   NaN
4         F-statistic:   NaN
5  Prob (F-statistic):   NaN
6               Scale:   NaN

Point Estimates

               Coef.
Intercept  0.237037
famhist    0.262963


In [14]:
m2_logit = smf.logit('chd ~ sbp + tobacco + ldl + adiposity + typea + obesity + alcohol + age + famhist', df).fit()

Optimization terminated successfully.
         Current function value: 0.510974
         Iterations 6


In [15]:
m2_logit.summary2()

0,1,2,3
Model:,Logit,Pseudo R-squared:,0.208
Dependent Variable:,chd,AIC:,492.14
Date:,2018-12-14 16:05,BIC:,533.4957
No. Observations:,462,Log-Likelihood:,-236.07
Df Model:,9,LL-Null:,-298.05
Df Residuals:,452,LLR p-value:,2.0548e-22
Converged:,1.0000,Scale:,1.0
No. Iterations:,6.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-6.1507,1.3083,-4.7015,0.0000,-8.7149,-3.5866
sbp,0.0065,0.0057,1.1350,0.2564,-0.0047,0.0177
tobacco,0.0794,0.0266,2.9838,0.0028,0.0272,0.1315
ldl,0.1739,0.0597,2.9152,0.0036,0.0570,0.2909
adiposity,0.0186,0.0293,0.6346,0.5257,-0.0388,0.0760
typea,0.0396,0.0123,3.2138,0.0013,0.0154,0.0637
obesity,-0.0629,0.0442,-1.4218,0.1551,-0.1496,0.0238
alcohol,0.0001,0.0045,0.0271,0.9784,-0.0087,0.0089
age,0.0452,0.0121,3.7285,0.0002,0.0215,0.0690


In [16]:
tmp=df
tmp=tmp.drop(['sbp','adiposity','obesity','alcohol'],1)
tmp

Unnamed: 0,tobacco,ldl,famhist,typea,age,chd
0,12.00,5.73,1,49,52,1
1,0.01,4.41,0,55,63,1
2,0.08,3.48,1,52,46,0
3,7.50,6.41,1,51,58,1
4,13.60,3.50,1,60,49,1
5,6.20,6.47,1,62,45,0
6,4.05,3.38,0,59,38,0
7,4.08,4.59,1,62,58,1
8,0.00,3.83,1,49,29,0
9,0.00,5.80,1,69,53,1


In [17]:
m3_logit = smf.logit('chd ~ tobacco+ldl+famhist+typea+age', tmp).fit()

Optimization terminated successfully.
         Current function value: 0.514811
         Iterations 6


In [18]:
m3_logit.summary2()

0,1,2,3
Model:,Logit,Pseudo R-squared:,0.202
Dependent Variable:,chd,AIC:,487.6856
Date:,2018-12-14 16:05,BIC:,512.499
No. Observations:,462,Log-Likelihood:,-237.84
Df Model:,5,LL-Null:,-298.05
Df Residuals:,456,LLR p-value:,2.5537000000000002e-24
Converged:,1.0000,Scale:,1.0
No. Iterations:,6.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-6.4464,0.9209,-7.0004,0.0000,-8.2513,-4.6416
tobacco,0.0804,0.0259,3.1057,0.0019,0.0297,0.1311
ldl,0.1620,0.0550,2.9470,0.0032,0.0543,0.2697
famhist,0.9082,0.2258,4.0228,0.0001,0.4657,1.3507
typea,0.0371,0.0122,3.0505,0.0023,0.0133,0.0610
age,0.0505,0.0102,4.9442,0.0000,0.0305,0.0705


Las variables determinantes son: tobacco, ldl, typea, age,famhist'

In [54]:
ldl_alto=0
ldl_bajo=0
for i in df['ldl']:
    if i >=6:
        ldl_alto+=1
    else:
        ldl_bajo+=1
        

In [55]:
ldl_alto

101

In [56]:
ldl_bajo

361

In [57]:
estimat_y=m3_logit.params['Intercept']+(m3_logit.params['ldl']*ldl_alto)

In [58]:
invlogit(estimat_y)

0.9999505604642188

In [59]:
estimat_y1=m3_logit.params['Intercept']+(m3_logit.params['ldl']*ldl_bajo)

In [53]:
invlogit(estimat_y1)

1.0

Las probabilidades de tener una enfemerdad coronaria para un individuo con bajos niveles de lipoproteína de baja densidad, manteniendo todas las demás características constantes, es practicamente la misma que uno que tiene los niveles altos.