In [1]:
pip install pandas scikit-learn lightgbm xgboost


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   --- ------------------------------------ 0.1/1.5 MB 3.6 MB/s eta 0:00:01
   ----------- ---------------------------- 0.4/1.5 MB 5.3 MB/s eta 0:00:01
   -------------------- ------------------- 0.7/1.5 MB 5.9 MB/s eta 0:00:01
   ------------------------------ --------- 1.1/1.5 MB 6.4 MB/s eta 0:00:01
   -------------------------------------- - 1.4/1.5 MB 6.4 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 6.2 MB/s eta 0:00:00
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/150.0 MB 2.6 MB/s eta 0:00:57
   --------------------------

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb

# CSV 파일 불러오기
df = pd.read_csv("comb_Z_score.csv")

# Feature와 Target 분리
X = df.drop(columns=["fake"])
y = df["fake"]

# 학습용/테스트용 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 사용할 모델 정의
models = {
    "LightGBM": lgb.LGBMClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    "Random Forest": RandomForestClassifier(),
}

# 결과 저장용 딕셔너리
results = {}

# 각 모델 학습 및 평가
for name, model in models.items():
    print(f"\n===== {name} =====")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # 성능 지표 출력
    accuracy = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds, target_names=["Real", "Fake"])

    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:\n", report)

    # 결과 저장
    results[name] = {
        "Accuracy": accuracy,
        "Classification Report": report
    }

# 원하면 이 `results` 딕셔너리를 파일로 저장하거나 시각화할 수 있음



===== LightGBM =====
[LightGBM] [Info] Number of positive: 281, number of negative: 275
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000124 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 595
[LightGBM] [Info] Number of data points in the train set: 556, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505396 -> initscore=0.021584
[LightGBM] [Info] Start training from score 0.021584
Accuracy: 0.9071
Classification Report:
               precision    recall  f1-score   support

        Real       0.89      0.93      0.91        73
        Fake       0.92      0.88      0.90        67

    accuracy                           0.91       140
   macro avg       0.91      0.91      0.91       140
weighted avg       0.91      0.91      0.91       140


===== Gradient Boosting =====
Accuracy: 0.9071
Classificatio

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.9286
Classification Report:
               precision    recall  f1-score   support

        Real       0.91      0.96      0.93        73
        Fake       0.95      0.90      0.92        67

    accuracy                           0.93       140
   macro avg       0.93      0.93      0.93       140
weighted avg       0.93      0.93      0.93       140



In [29]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# 데이터 불러오기
df = pd.read_csv("comb_Z_score.csv")
X = df.drop(columns=["fake"])
y = df["fake"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 개별 모델 정의
model1 = GradientBoostingClassifier()
model2 = RandomForestClassifier()
model3 = lgb.LGBMClassifier()
model4 = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# 소프트 보팅 앙상블
voting_clf = VotingClassifier(
    estimators=[
        ('gb', model1),
        ('rf', model2),
        ('lgbm', model3),
        ('xgb', model4)
    ],
    voting='soft'
)

# 학습 및 평가
voting_clf.fit(X_train, y_train)
preds = voting_clf.predict(X_test)

# 결과 출력
print("Voting Classifier Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds, target_names=["Real", "Fake"]))


[LightGBM] [Info] Number of positive: 281, number of negative: 275
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 595
[LightGBM] [Info] Number of data points in the train set: 556, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505396 -> initscore=0.021584
[LightGBM] [Info] Start training from score 0.021584
Voting Classifier Accuracy: 0.9071428571428571
              precision    recall  f1-score   support

        Real       0.88      0.95      0.91        73
        Fake       0.94      0.87      0.90        67

    accuracy                           0.91       140
   macro avg       0.91      0.91      0.91       140
weighted avg       0.91      0.91      0.91       140



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [33]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

estimators = [
    ('rf', model2),
    ('lgbm', model3),
    ('xgb', model4),
    ('gb', model1)
]

stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5
)

stacking_clf.fit(X_train, y_train)
print("Stacking Accuracy:", stacking_clf.score(X_test, y_test))


[LightGBM] [Info] Number of positive: 281, number of negative: 275
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 595
[LightGBM] [Info] Number of data points in the train set: 556, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505396 -> initscore=0.021584
[LightGBM] [Info] Start training from score 0.021584


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 224, number of negative: 220
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000232 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 485
[LightGBM] [Info] Number of data points in the train set: 444, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.504505 -> initscore=0.018019
[LightGBM] [Info] Start training from score 0.018019
[LightGBM] [Info] Number of positive: 225, number of negative: 220
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 491
[LightGBM] [Info] Number of data points in the train set: 445, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505618 -> initscore=0.022473
[LightGBM] [Info] S

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Stacking Accuracy: 0.9071428571428571
