# In [None]:  (Jupyter cell marker left over from the notebook export)
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the tab-separated feature table.
df = pd.read_csv(r'./features.csv', sep='\t', encoding='utf-8')

# Drop incomplete rows, remove the 'Unnamed: 0' column (presumably an index
# saved by an earlier to_csv call -- confirm), and renumber the rows.
df = df.dropna().drop(columns=['Unnamed: 0']).reset_index(drop=True)

# Encode the categorical columns as integers.
# NOTE(review): Series.map yields NaN for any value missing from the mapping,
# so an unexpected 'Biological Sex' value would reach the model as NaN.
df['Biological Sex'] = df['Biological Sex'].map({'FEMALE': 0, 'MALE': 1})

# The target must be fully encodable: an unmapped label would become NaN and
# only fail later, inside the stratified split, with an unrelated message.
label_codes = {'PersLow': 0, 'PersNorm': 1, 'PersHigh': 2}
unknown_labels = (
    df.loc[~df['label'].isin(list(label_codes)), 'label'].unique().tolist()
)
if unknown_labels:
    raise ValueError(
        f"Unexpected values in 'label' column: {unknown_labels}; "
        f"expected one of {list(label_codes)}"
    )
df['label'] = df['label'].map(label_codes)

# Separate features from the target and make a stratified 70/30 split.
X = df.drop(columns=['datetime', 'label'])
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Build the LightGBM datasets; the evaluation set is created with the training
# set as its reference, as LightGBM requires for validation data.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# LightGBM hyper-parameters for the 3-class problem.
params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train the booster.
# Early stopping is passed as a callback: the `early_stopping_rounds` keyword
# of lgb.train was removed in LightGBM 4.0, while the callback form works on
# both older and newer releases.
# NOTE(review): early stopping monitors the same held-out set that is later
# used for the final metrics, so the reported scores may be optimistic; a
# separate validation split would avoid this.
num_round = 100
bst = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    valid_sets=[test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# Predict class probabilities with the best iteration, then take the most
# probable class for every row.
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
y_pred_max = y_pred.argmax(axis=1)

# Score the predictions; precision, recall and F1 are support-weighted averages.
accuracy = accuracy_score(y_test, y_pred_max)
precision, recall, f1 = (
    scorer(y_test, y_pred_max, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

for title, value in (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
):
    print(f'{title}: {value:.4f}')

# Confusion matrix (as returned by sklearn's confusion_matrix).
cm = confusion_matrix(y_test, y_pred_max)
print('Confusion Matrix:', cm, sep='\n')

# Persist the trained booster to disk.
bst.save_model('lightgbm_model.txt')

# Prepare "new" data for which the HbA1c and glucose measurements are not
# available. The booster was trained on every column of X_train, so the input
# must keep the same feature layout: dropping the columns makes predict() fail
# LightGBM's feature-count check. The columns are masked with NaN instead.
# NOTE(review): rows with NaN were removed before training, so the model never
# learned from missing values in these features. The scores below show how the
# existing model degrades without them; to evaluate a model that truly does not
# use these measurements, retrain on the reduced feature set.
unavailable_cols = ['HbA1c', 'glucose']
absent_cols = [col for col in unavailable_cols if col not in X_test.columns]
if absent_cols:
    # Same exception type the original drop(columns=...) raised.
    raise KeyError(f'{absent_cols} not found in X_test columns')
X_new = X_test.assign(**{col: np.nan for col in unavailable_cols})

# Reload the saved model from disk.
bst = lgb.Booster(model_file='lightgbm_model.txt')

# Predict class probabilities and pick the most probable class per row.
y_pred_new = bst.predict(X_new)
y_pred_new_max = np.argmax(y_pred_new, axis=1)

# Show the true and predicted labels side by side for the first rows.
results = pd.DataFrame(
    {
        'True Label': y_test,
        'Predicted Label': y_pred_new_max,
    }
)
print(results.head())

# Score the predictions made on the new data (support-weighted averages for
# precision, recall and F1).
accuracy_new = accuracy_score(y_test, y_pred_new_max)
precision_new, recall_new, f1_new = (
    metric(y_test, y_pred_new_max, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

new_data_scores = {
    'Accuracy': accuracy_new,
    'Precision': precision_new,
    'Recall': recall_new,
    'F1 Score': f1_new,
}
for metric_name, score in new_data_scores.items():
    print(f'New Data {metric_name}: {score:.4f}')
