In [1]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import plotly
import plotly.graph_objects as go


In [2]:
df = pd.read_csv("today-data/data.csv")
df

# LC: Liver Class, 간암 유무
# HBV: It is to indicate the hepatitis B virus. Hepatitis B is a vaccine-preventable liver infection caused by the hepatitis B virus (HBV). B형 간염
# HCV: A viral infection that causes inflammation of liver that leads to liver inflammation. C형 간염
# drk_st: 음주 습관 유무
# drk_freq: 음주 습관과 상관 없는 음주 횟수 (음주 회식 포함)
# exercise: 1주일간 운동 날짜 횟수
# sex: 남자:1, 여자:2


Unnamed: 0,ID,year,sex,age,drk_st,drk_freq,HBV,HCV,exercise,LC
0,1,2018,1,80,1,2,0,0,0,0
1,2,2018,1,78,1,5,0,0,0,0
2,3,2018,2,75,0,1,0,1,0,0
3,4,2018,2,43,1,4,0,0,3,0
4,5,2018,2,80,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
5973,5974,2018,1,41,1,5,0,0,0,0
5974,5975,2018,2,35,1,4,0,0,0,0
5975,5976,2018,2,66,1,3,0,0,0,0
5976,5977,2018,2,73,0,1,0,0,0,0


In [3]:
scaler = StandardScaler()
df[['age', 'drk_freq']] = scaler.fit_transform(df[['age', 'drk_freq']])
df[['sex','drk_st', 'HBV', 'HCV', 'exercise', 'LC']] = df[['sex','drk_st', 'HBV', 'HCV', 'exercise', 'LC']].astype('category')

df

Unnamed: 0,ID,year,sex,age,drk_st,drk_freq,HBV,HCV,exercise,LC
0,1,2018,1,1.686948,1,-0.609323,0,0,0,0
1,2,2018,1,1.568510,1,1.198385,0,0,0,0
2,3,2018,2,1.390854,0,-1.211892,0,1,0,0
3,4,2018,2,-0.504154,1,0.595816,0,0,3,0
4,5,2018,2,1.686948,0,-1.211892,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
5973,5974,2018,1,-0.622592,1,1.198385,0,0,0,0
5974,5975,2018,2,-0.977906,1,0.595816,0,0,0,0
5975,5976,2018,2,0.857883,1,-0.006753,0,0,0,0
5976,5977,2018,2,1.272416,0,-1.211892,0,0,0,0


In [4]:
df[['sex','drk_st', 'HBV', 'HCV', 'exercise']] = df[['sex','drk_st', 'HBV', 'HCV', 'exercise']]


In [5]:
x = sm.add_constant(df[['sex', 'age', 'drk_st', 'drk_freq', 'HBV', 'HCV', 'exercise']])
y = df[['LC']]
reg_log = sm.Logit(y,x)

In [6]:
# Broyden–Fletcher–Goldfarb–Shanno algorithm
results = reg_log.fit(method='bfgs')

         Current function value: 0.008004
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36




In [7]:
results.summary()

0,1,2,3
Dep. Variable:,LC,No. Observations:,5978.0
Model:,Logit,Df Residuals:,5970.0
Method:,MLE,Df Model:,7.0
Date:,"Thu, 19 Oct 2023",Pseudo R-squ.:,0.2146
Time:,16:11:06,Log-Likelihood:,-47.851
converged:,False,LL-Null:,-60.926
Covariance Type:,nonrobust,LLR p-value:,0.0004735

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.0493,2.253,-1.353,0.176,-7.465,1.366
sex,-3.4641,1.559,-2.222,0.026,-6.520,-0.409
age,2.3418,0.791,2.962,0.003,0.792,3.891
drk_st,-1.7660,1.051,-1.681,0.093,-3.825,0.293
drk_freq,0.3337,0.342,0.977,0.329,-0.336,1.003
HBV,3.6454,0.824,4.422,0.000,2.030,5.261
HCV,-0.0184,9.904,-0.002,0.999,-19.431,19.394
exercise,-1.0668,3.872,-0.276,0.783,-8.656,6.523


### Support Vectors are Sex, Age, and HBV. We can use these to match our patients
## But really?

In [8]:
x = sm.add_constant(df[["sex",	"age",	"HBV"]])
y = df[['LC']]
reg_log = sm.Logit(y,x)

results = reg_log.fit(method='bfgs')
results.summary()

         Current function value: 0.007216
         Iterations: 35
         Function evaluations: 38
         Gradient evaluations: 38




0,1,2,3
Dep. Variable:,LC,No. Observations:,5978.0
Model:,Logit,Df Residuals:,5974.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 19 Oct 2023",Pseudo R-squ.:,0.292
Time:,16:12:12,Log-Likelihood:,-43.137
converged:,False,LL-Null:,-60.926
Covariance Type:,nonrobust,LLR p-value:,9.199e-08

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.9037,2.209,-1.767,0.077,-8.234,0.426
sex,-3.2306,2.006,-1.611,0.107,-7.162,0.701
age,0.6336,0.502,1.262,0.207,-0.350,1.618
HBV,4.8256,0.796,6.059,0.000,3.265,6.387


### HBV. We can use this to match our patients..

In [9]:
x = sm.add_constant(df[["HBV"]])
y = df[['LC']]
reg_log = sm.Logit(y,x)

results = reg_log.fit(method='bfgs')
results.summary()

Optimization terminated successfully.
         Current function value: 0.007203
         Iterations: 31
         Function evaluations: 34
         Gradient evaluations: 34


0,1,2,3
Dep. Variable:,LC,No. Observations:,5978.0
Model:,Logit,Df Residuals:,5976.0
Method:,MLE,Df Model:,1.0
Date:,"Thu, 19 Oct 2023",Pseudo R-squ.:,0.2933
Time:,16:13:06,Log-Likelihood:,-43.057
converged:,True,LL-Null:,-60.926
Covariance Type:,nonrobust,LLR p-value:,2.257e-09

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-7.5904,0.579,-13.118,0.000,-8.724,-6.456
HBV,5.1804,0.743,6.975,0.000,3.725,6.636


### Are they sharing a lot in this dataset? 

In [10]:
df[df.HBV == 1].shape[0] / df[df.HBV == 0].shape[0]

0.010309278350515464