# Logistic Regression

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import numpy as np

In [2]:
# Load the NHANES 2015-2016 survey data
da = pd.read_csv("nhanes_2015_2016.csv")

# Keep only the columns used in this exercise:
#   BPXSY1: systolic blood pressure, RIDAGEYR: age, RIAGENDR: gender (1: male, 2: female)
#   DMDEDUC2: education level
#   BMXBMI: presumably body mass index (not described in the original notes; confirm against the NHANES codebook)
#   SMQ020: smoking status (1: smoker, 2: non-smoker, 7: don't know, 9: refused)
# The list is named `analysis_columns` rather than `vars` so that the built-in vars()
# is not shadowed for every later cell running in this kernel.
analysis_columns = ["BPXSY1", "RIDAGEYR", "RIAGENDR", "DMDEDUC2", "BMXBMI", "SMQ020"]
# Drop every row that has a missing value in any of the selected columns
da = da[analysis_columns].dropna()
da.head()

Unnamed: 0,BPXSY1,RIDAGEYR,RIAGENDR,DMDEDUC2,BMXBMI,SMQ020
0,128.0,62,1,5.0,27.8,1
1,146.0,53,1,3.0,30.8,1
2,138.0,78,1,3.0,28.8,1
3,132.0,56,2,5.0,42.4,2
4,100.0,42,2,4.0,20.3,2


## logistic regression을 통해   
## 독립변수들(나이, 성별, 교육 수준 등)이 종속변수(흡연 유무)에 미치는 영향 확인

In [3]:
# Labelled version of the gender code (1 -> "Male", 2 -> "Female"),
# stored in a new column so the numeric RIAGENDR column is left as-is.
da["RIAGENDRx"] = da["RIAGENDR"].replace(
    {1: "Male", 2: "Female"},
)

In [4]:
# SMQ020 codes 7 (don't know) and 9 (refused) are treated as missing values;
# code 2 (non-smoker) is recoded to 0 and code 1 (smoker) is left unchanged.
da["smq"] = da["SMQ020"].replace(
    {2: 0, 7: np.nan, 9: np.nan},
)

# smq 1: smoker, 0: non-smoker

## 1) odds and log odds
   
Linear regression과 달리 logistic regression은 종속변수가 binary value이다.   
따라서 y가 1이 될 확률을 p, 0이 될 확률을 1-p라고 할 때 odds는 다음과 같이 정의한다.

odds = p/(1-p)

In [5]:
# Proportion of non-smokers (smq = 0) and smokers (smq = 1) within each gender.
# normalize="index" makes crosstab divide every row by its own total, replacing
# the hand-written row-wise apply(lambda x: x / x.sum(), axis=1).
c = pd.crosstab(da["RIAGENDRx"], da["smq"], normalize="index")
# Odds of smoking within each gender: P(smq = 1) / P(smq = 0)
c["odds"] = c.loc[:, 1] / c.loc[:, 0]
c

smq,0.0,1.0,odds
RIAGENDRx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,0.680197,0.319803,0.470162
Male,0.467453,0.532547,1.139252


In [6]:
# Natural log of the odds column, appended to the same table
c["logodds"] = np.log(c["odds"])
c

smq,0.0,1.0,odds,logodds
RIAGENDRx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,0.680197,0.319803,0.470162,-0.754679
Male,0.467453,0.532547,1.139252,0.130371


## 2) logistic regression

'RIAGENDR' : 성별(1:남성, 2:여성) 변수   
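성별 변수는 categorical 변수이다. 모형에는 이를 "Male"/"Female" 문자열로 바꾼 `RIAGENDRx`를 사용하며, 결과 표의 `RIAGENDRx[T.Male]`에서 보듯 Female이 기준 범주(reference level)가 된다.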

In [7]:
# Binomial GLM of smoking status (smq) on gender only.
# No link is passed to Binomial(), so the family's default is used
# (reported as "logit" in the summary below).
model = sm.GLM.from_formula(
    "smq ~ RIAGENDRx",
    data=da,
    family=sm.families.Binomial(),
)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,smq,No. Observations:,5094.0
Model:,GLM,Df Residuals:,5092.0
Model Family:,Binomial,Df Model:,1.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-3350.6
Date:,"Wed, 15 Dec 2021",Deviance:,6701.2
Time:,20:26:33,Pearson chi2:,5090.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.7547,0.042,-18.071,0.000,-0.837,-0.673
RIAGENDRx[T.Male],0.8851,0.058,15.227,0.000,0.771,0.999


In [8]:
# Male minus female log odds of smoking, taken from the crosstab table
c.loc["Male", "logodds"] - c.loc["Female", "logodds"]

0.8850500036644218

## 해석

DV(종속변수): smq    /    IV(독립변수) : RIAGENDRx   
여성일 때보다 남성일 때 흡연율의 로그오즈가 0.8851 높다.   
즉, 여성 대비 남성의 흡연 로그 오즈비(log odds ratio)가 0.8851이며, 오즈비(odds ratio)로 환산하면 exp(0.8851) ≈ 2.42이다.   
p값은 거의 0이기 때문에 유의수준 0.05에서 유의하다.   


# 

## 3) Adding covariates   
   
RIDAGEYR : 나이 변수 추가

In [9]:
# NOTE(review): this repeats the gender recoding already done in cell In [3]
# with the same mapping; it looks redundant - confirm before removing.
da["RIAGENDRx"] = da["RIAGENDR"].replace(
    {1: "Male", 2: "Female"},
)

In [10]:
# Binomial GLM of smoking status (smq) on age (RIDAGEYR) and gender,
# i.e. the gender-only model with age added as a covariate.
# `model` and `result` are rebound here, replacing the earlier fit.
model = sm.GLM.from_formula(
    "smq ~ RIDAGEYR + RIAGENDRx",
    data=da,
    family=sm.families.Binomial(),
)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,smq,No. Observations:,5094.0
Model:,GLM,Df Residuals:,5091.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-3296.6
Date:,"Wed, 15 Dec 2021",Deviance:,6593.2
Time:,20:26:33,Pearson chi2:,5100.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.6166,0.095,-16.985,0.000,-1.803,-1.430
RIAGENDRx[T.Male],0.8920,0.059,15.170,0.000,0.777,1.007
RIDAGEYR,0.0172,0.002,10.289,0.000,0.014,0.021


## 해석

1. 같은 성별일 때, age가 1 높으면 흡연율의 로그오즈가 평균적으로 0.0172 더 높다.
2. 같은 나이일 때, 남성일 때 흡연율의 로그오즈가 평균적으로 0.8920 더 높다.
    
   
하나의 회귀 계수를 해석할 때, 다른 변수는 모두 동일하다.   
age 회귀 계수를 해석할 때, 같은 성별 기준으로 해석한다.   
성별 회귀 계수를 해석할 때, 같은 나이 기준으로 해석한다.


# 