In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.metrics import r2_score, accuracy_score, classification_report, confusion_matrix

# 1) Load the data
df = pd.read_csv('diabetes_dataset.csv')

# 2) Encode the diabetes stage label as an integer class code (0-4)
mapping = {
    'No Diabetes': 0,
    'Pre-Diabetes': 1,
    'Type 1': 2,
    'Type 2': 3,
    'Gestational': 4
}
# Series.map turns every label that is missing from `mapping` into NaN, and the
# dropna() in step 3 would then silently discard those rows. Fail loudly
# instead, so a typo or an unexpected category cannot quietly remove a class.
# Rows whose label is genuinely missing are still dropped in step 3.
stage_labels = df['diabetes_stage']
unmapped = stage_labels[stage_labels.notna() & ~stage_labels.isin(list(mapping))]
if not unmapped.empty:
    raise ValueError(
        f"Unmapped diabetes_stage labels: {unmapped.unique().tolist()}"
    )
df['diabetes_stage_class'] = stage_labels.map(mapping)

# 3) Keep only the modelling columns (the original string `diabetes_stage`
#    column is left out) and drop rows that have any missing value
cols = [
    'smoking_status',
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'diet_score',
    'sleep_hours_per_day',
    'screen_time_hours_per_day',
    'age',
    'gender',
    'bmi',
    'diabetes_stage_class'
]
df = df[cols].dropna()

# 4) Split into features X and target y
X = df.drop(columns='diabetes_stage_class')
y = df['diabetes_stage_class']

# 5) One-hot encode the object-dtype (categorical) features; drop_first removes
#    one dummy column per feature
cat_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# 6) Train / test split (80 / 20), stratified on the class label
train_input, test_input, train_target, test_target = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 7) Standardize the features. The scaler is fit on the training split only,
#    so no test-set statistics leak into the transformation.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_input)
test_scaled = scaler.transform(test_input)

In [None]:
# --- Ridge regression ---
# NOTE(review): the target here is the integer class code (0-4) produced by the
# label mapping, treated as a continuous value — confirm that regressing on
# these codes is intended.
# alpha is the regularization strength; it can be tuned later.
ridge = Ridge(alpha=1.0).fit(train_scaled, train_target)

# Ridge.score returns the coefficient of determination (R²) on the given split.
train_r2 = ridge.score(train_scaled, train_target)
test_r2 = ridge.score(test_scaled, test_target)

print("Ridge train R² :", train_r2)
print("Ridge test  R² :", test_r2)

Ridge train R² : 0.04138095629038985
Ridge test  R² : 0.040821595393797594


In [None]:
# --- Multiclass logistic regression ---
# `multi_class` is intentionally not passed: the parameter is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a target with more
# than two classes is already fit as a multinomial model by default.
log_reg = LogisticRegression(
    solver='lbfgs',              # suitable for small / medium-sized data
    max_iter=1000
)
log_reg.fit(train_scaled, train_target)

# Accuracy on both splits
print("Logistic train accuracy :", log_reg.score(train_scaled, train_target))
print("Logistic test  accuracy :", log_reg.score(test_scaled, test_target))

# Predicted classes for the test split
test_pred = log_reg.predict(test_scaled)

# Confusion matrix and detailed per-class report
# NOTE(review): the recorded output shows almost every sample predicted as
# class 3; class_weight='balanced' may be worth evaluating — TODO confirm.
print("\nConfusion matrix")
print(confusion_matrix(test_target, test_pred))

print("\nClassification report")
# zero_division=0 still reports 0.0 for classes that are never predicted, but
# without emitting an undefined-metric warning for each of them.
print(classification_report(test_target, test_pred, zero_division=0))



Logistic train accuracy : 0.59895
Logistic test  accuracy : 0.5978

Confusion matrix
[[    1   128     0  1467     0]
 [    6   222     0  6141     0]
 [    0     1     0    23     0]
 [    1   221     0 11733     0]
 [    0     3     0    53     0]]

Classification report
              precision    recall  f1-score   support

           0       0.12      0.00      0.00      1596
           1       0.39      0.03      0.06      6369
           2       0.00      0.00      0.00        24
           3       0.60      0.98      0.75     11955
           4       0.00      0.00      0.00        56

    accuracy                           0.60     20000
   macro avg       0.22      0.20      0.16     20000
weighted avg       0.49      0.60      0.47     20000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
