In [None]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from lightgbm import LGBMClassifier
from pycaret.classification import * #3.3.0
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import TfidfVectorizer #1.4.2
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
from imblearn.over_sampling import SMOTE

# Baseline: train an XGBoost binary classifier, leaving every tunable
# hyperparameter at the library default.
# NOTE: `use_label_encoder` is intentionally not passed. XGBoost deprecated it
# in 1.6 (the built-in label encoder was removed), and recent releases only
# report it as an unused parameter.
xgb_clf = XGBClassifier(objective='binary:logistic', eval_metric='logloss')
xgb_clf.fit(X_train, y_train)

# Predict on the test data.
y_pred = xgb_clf.predict(X_test)

# Evaluate: accuracy, macro-averaged F1, and the per-class report.
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, average='macro')
print(f"Test Accuracy: {test_accuracy:.4f}, Test F1 Score: {test_f1:.4f}")
print(classification_report(y_test, y_pred))

# Retrain with a different number of trees, tree depth and learning rate.
# The defaults quoted below are XGBoost's own (max_depth=6, learning_rate=0.3),
# not scikit-learn GradientBoosting's (3 and 0.1): this configuration is
# therefore slightly shallower and much slower-learning than the baseline,
# compensated by three times as many trees.
# `use_label_encoder` is intentionally not passed: it was deprecated in
# XGBoost 1.6 and recent releases only report it as an unused parameter.
xgb_clf = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=300,      # number of trees (XGBoost default: 100)
    max_depth=5,           # tree depth (XGBoost default: 6)
    learning_rate=0.05,    # shrinkage / eta (XGBoost default: 0.3)
    subsample=1.0,         # use all rows for every tree (default)
    colsample_bytree=1.0,  # use all features for every tree (default)
)

xgb_clf.fit(X_train, y_train)

# Predict on both the training and the test data so that the gap between the
# two sets of scores can be compared.
y_train_pred = xgb_clf.predict(X_train)
y_test_pred = xgb_clf.predict(X_test)

# Score both splits: accuracy and macro-averaged F1.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred, average='macro')
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average='macro')

print("max_depth 튜닝 결과")
print(f"Train Accuracy= {train_accuracy:.4f}, Train F1 Score: {train_f1:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}, Test F1 Score: {test_f1:.4f}")
# Per-class precision, recall and F1 for the tuned model's test predictions.
print(classification_report(y_test, y_test_pred))

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import learning_curve

# Compute a learning curve for the current xgb_clf. Per the original note,
# X and y are the TF-IDF-transformed features and the target built earlier.
train_sizes, train_scores, valid_scores = learning_curve(
    estimator=xgb_clf,
    X=X,
    y=y,
    train_sizes=[0.1, 0.33, 0.55, 0.78, 1.0],
    cv=5,
    scoring='accuracy',
)

# Collapse the per-fold scores into one mean accuracy per training-set size.
train_mean = train_scores.mean(axis=1)
valid_mean = valid_scores.mean(axis=1)

# Draw both curves on the current axes, then label and show the figure.
axes = plt.gca()
axes.plot(train_sizes, train_mean, label='Training Accuracy', marker='o')
axes.plot(train_sizes, valid_mean, label='Validation Accuracy', marker='o')
axes.set_xlabel('Training Set Size')
axes.set_ylabel('Accuracy')
axes.set_title('Learning Curve')
axes.legend()
axes.grid(True)
plt.show()

