# In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the feature table (tab-separated, UTF-8).
df = pd.read_csv(r'./features.csv', sep='\t', encoding='utf-8')

# Drop every row containing a missing value, discard the 'Unnamed: 0' column,
# and renumber the remaining rows from 0.
df = df.dropna().drop(columns=['Unnamed: 0']).reset_index(drop=True)

# Encode the categorical columns as integers.
# NOTE(review): Series.map yields NaN for any value absent from the mapping, so
# a 'Biological Sex' value other than FEMALE/MALE becomes NaN here and is left
# as-is -- confirm the column only ever holds these two values.
df['Biological Sex'] = df['Biological Sex'].map({'FEMALE': 0, 'MALE': 1})

label_codes = df['label'].map({'PersLow': 0, 'PersNorm': 1, 'PersHigh': 2})
# An unmapped label would become a NaN target; fail right away with the
# offending values instead of passing NaN class labels further down the script.
unmapped_mask = label_codes.isna()
if unmapped_mask.any():
    unexpected_labels = sorted(df.loc[unmapped_mask, 'label'].astype(str).unique())
    raise ValueError(f'Unexpected values in label column: {unexpected_labels}')
df['label'] = label_codes

# Split into features and target. 'glucose' is deliberately excluded from the
# features, along with the 'datetime' column and the target itself.
X = df.drop(columns=['datetime', 'label', 'glucose'])
y = df['label']

# LightGBM training parameters: 3-class classification evaluated with
# multiclass log loss.
params = dict(
    objective='multiclass',
    num_class=3,
    metric='multi_logloss',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)

# Repeated stratified K-fold cross-validation: 10 splits repeated 3 times,
# with a fixed random_state for the splitter.
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

# Per-fold scores, stored as percentages.
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []

# Number of boosting rounds. It is the same for every fold, so it is assigned
# once ahead of the loop; code further down the script that reuses num_round
# therefore does not depend on the loop body having executed.
num_round = 100

for train_index, test_index in rskf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Build the LightGBM datasets; the held-out fold references the training
    # dataset.
    train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
    test_data = lgb.Dataset(X_test, label=y_test, reference=train_data, free_raw_data=False)

    # Train on the training fold. The held-out fold is passed as a validation
    # set, but no early-stopping option is given in this call or in params.
    bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])

    # Predict class probabilities and take the most probable class.
    # NOTE(review): without early stopping, best_iteration is presumably not
    # set by training, which should make predict use all rounds -- confirm
    # against the LightGBM documentation.
    y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
    y_pred_max = np.argmax(y_pred, axis=1)

    # Score the fold; precision/recall/F1 are averaged weighted by class support.
    accuracy = accuracy_score(y_test, y_pred_max) * 100
    precision = precision_score(y_test, y_pred_max, average='weighted') * 100
    recall = recall_score(y_test, y_pred_max, average='weighted') * 100
    f1 = f1_score(y_test, y_pred_max, average='weighted') * 100

    accuracy_list.append(accuracy)
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

# Report the mean and standard deviation of each metric across all folds.
for metric_label, fold_scores in (
    ('Accuracy', accuracy_list),
    ('Precision', precision_list),
    ('Recall', recall_list),
    ('F1 Score', f1_list),
):
    print(f'Average {metric_label}: {np.mean(fold_scores):.2f}% ± {np.std(fold_scores):.2f}%')

# Fit the final model on the full dataset (glucose is not among the features).
# num_round is assigned earlier in the script.
train_data = lgb.Dataset(X, label=y, free_raw_data=False)
bst = lgb.train(params, train_data, num_round)

# Predict on X, the same rows the model was just fitted on, so the figures
# below describe the fit to the training data rather than held-out performance.
y_pred = bst.predict(X, num_iteration=bst.best_iteration)
y_pred_max = np.argmax(y_pred, axis=1)

# Scores as percentages; precision/recall/F1 are averaged weighted by class support.
accuracy = 100 * accuracy_score(y, y_pred_max)
precision, recall, f1 = (
    100 * scorer(y, y_pred_max, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y, y_pred_max)

for metric_label, metric_value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'Final Model {metric_label}: {metric_value:.2f}%')
print('Confusion Matrix:')
print(cm)
