In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [5]:
# 1) 데이터 불러오기
df = pd.read_csv('diabetes_dataset.csv')

# Filter for Asian data
asian_df = df[df['ethnicity'] == 'Asian']
non_asian_df = df[df['ethnicity'] != 'Asian']

# 2) diabetes_stage → 0~4로 세분화
mapping = {
    'No Diabetes': 0,
    'Pre-Diabetes': 1,
    'Type 1': 2,
    'Type 2': 3,
    'Gestational': 4
}
df['diabetes_stage_class'] = df['diabetes_stage'].map(mapping)

# 3) 필요한 컬럼만 선택
cols = [
    'smoking_status',
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'diet_score',
    'sleep_hours_per_day',
    'screen_time_hours_per_day',
    'age',
    'gender',
    'bmi',
    'diabetes_stage_class'
]

df = df[cols].dropna()

# 4) X, y 분리
X = df.drop('diabetes_stage_class', axis=1)
y = df['diabetes_stage_class']

# 5) 문자형 변수 자동 인코딩
cat_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# 6) 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# 7) Statsmodels 다항 로지스틱 회귀 MNLogit (p-value, OR 포함)
X_sm = sm.add_constant(X_scaled)
model = sm.MNLogit(y, X_sm).fit()

print("\n===== MNLogit 결과 (p-value 포함) =====")
print(model.summary())

         Current function value: 0.870660
         Iterations: 35

===== MNLogit 결과 (p-value 포함) =====




                           MNLogit Regression Results                           
Dep. Variable:     diabetes_stage_class   No. Observations:               100000
Model:                          MNLogit   Df Residuals:                    99952
Method:                             MLE   Df Model:                           44
Date:                  Tue, 25 Nov 2025   Pseudo R-squ.:                 0.03078
Time:                          11:27:52   Log-Likelihood:                -87066.
converged:                        False   LL-Null:                       -89831.
Covariance Type:              nonrobust   LLR p-value:                     0.000
diabetes_stage_class=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      1.5162      0.014    108.912      0.000       1.489       1.543
x1                         0.0073      0.013      0.581      0.561      -0.017 

In [7]:
# OR 및 CI 계산
params = model.params
conf = model.conf_int()
OR = np.exp(params)

# CI 계산
CI_lower = np.exp(conf.iloc[:, 0])
CI_upper = np.exp(conf.iloc[:, 1])

# DataFrame 생성
result_df = pd.DataFrame({
    'Coefficient': [params],
    'OR': [OR],
    '95% CI Lower': [CI_lower],
    '95% CI Upper': [CI_upper]
})

print("\n===== Odds Ratio (OR) & 95% CI =====")
print(result_df)


===== Odds Ratio (OR) & 95% CI =====
                                         Coefficient  \
0                0         1         2          3...   

                                                  OR  \
0                0         1         2           ...   

                                        95% CI Lower  \
0  diabetes_stage_class       
1                 ...   

                                        95% CI Upper  
0  diabetes_stage_class       
1                 ...  


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
# 8) Train/Test Split → sklearn 평가
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

clf = LogisticRegression(multi_class='multinomial', max_iter=500)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)



In [9]:
# 9) 모델 성능 평가
print("\n===== Confusion Matrix =====")
print(confusion_matrix(y_test, y_pred))

print("\n===== Accuracy =====")
print(accuracy_score(y_test, y_pred))

print("\n===== Classification Report (Precision/Recall/F1 per class) =====")
print(classification_report(y_test, y_pred))


===== Confusion Matrix =====
[[    1   116     0  1444     0]
 [    1   294     0  6184     0]
 [    0     2     0    22     0]
 [    3   227     0 11649     0]
 [    0     6     0    51     0]]

===== Accuracy =====
0.5972

===== Classification Report (Precision/Recall/F1 per class) =====
              precision    recall  f1-score   support

           0       0.20      0.00      0.00      1561
           1       0.46      0.05      0.08      6479
           2       0.00      0.00      0.00        24
           3       0.60      0.98      0.75     11879
           4       0.00      0.00      0.00        57

    accuracy                           0.60     20000
   macro avg       0.25      0.21      0.17     20000
weighted avg       0.52      0.60      0.47     20000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
