In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, r2_score

# 1) 데이터 불러오기
df = pd.read_csv('diabetes_dataset.csv')

# Filter for Asian data
asian_df = df[df['ethnicity'] == 'Asian']
non_asian_df = df[df['ethnicity'] != 'Asian']

# 2) diabetes_stage → 0~4로 세분화
mapping = {
    'No Diabetes': 0,
    'Pre-Diabetes': 1,
    'Type 1': 2,
    'Type 2': 3,
    'Gestational': 4
}
df['diabetes_stage_class'] = df['diabetes_stage'].map(mapping)

# 3) 필요한 컬럼만 선택
cols = [
    'smoking_status',
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'diet_score',
    'sleep_hours_per_day',
    'screen_time_hours_per_day',
    'age',
    'gender',
    'bmi',
    'diabetes_stage_class'
]

# 4) X, y 분리
X = df.drop('diabetes_stage_class', axis=1)
y = df['diabetes_stage_class']

# 5) 문자형 변수 자동 인코딩
cat_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# 1) 훈련/테스트 분리
train_input, test_input, train_target, test_target = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 2) 스케일링 (train으로만 fit!)
scaler = StandardScaler()
scaler.fit(train_input)
train_scaled = scaler.transform(train_input)
test_scaled = scaler.transform(test_input)

# 3) Ridge 모델
ridge = Ridge(alpha=1.0)
ridge.fit(train_scaled, train_target)

print("Ridge train R2 =", ridge.score(train_scaled, train_target))
print("Ridge test R2 =", ridge.score(test_scaled, test_target))

# 4) Lasso 모델
lasso = Lasso(alpha=0.01, max_iter=10000)
lasso.fit(train_scaled, train_target)

print("Lasso train R2 =", lasso.score(train_scaled, train_target))
print("Lasso test R2 =", lasso.score(test_scaled, test_target))

Ridge train R2 = 0.9999999100037358
Ridge test R2 = 0.9999999025114826
Lasso train R2 = 0.9943756408208055
Lasso test R2 = 0.9939827223911731
