In [1]:
import pandas as pd

In [3]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [5]:
train_data_set = pd.read_csv('retrain_z_score.csv')
test_data_set = pd.read_csv('model_b_prime_test.csv')

In [7]:
selected_features = [
    "description length",
    "#posts",
    "nums/length username",
    "#followers",
    "#follows",
    "fullname words",
    "private",
    "nums/length fullname",
    "external URL",
    # "name==username",
    # "profile pic"/

] #x_data_set에서 p-value를 검증을 했을 때 결과와 유의미하다고 판단된 특성들 만을 활용해서 학습

In [11]:
df_X_selected = train_data_set[selected_features + ['fake']]
df_insta_selected = test_data_set[selected_features + ['fake']]

In [13]:
X = df_X_selected.drop(columns=["fake"])
y = df_X_selected["fake"]

In [15]:
X_train, X_vali, y_train, y_vali = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# 개별 모델 정의
model1 = GradientBoostingClassifier(random_state=42)
model2 = RandomForestClassifier(random_state=42)
model3 = lgb.LGBMClassifier(random_state=42)
model4 = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [19]:
# 소프트 보팅 앙상블
voting_clf = VotingClassifier(
    estimators=[
        ('gb', model1),
        ('rf', model2),
        ('lgbm', model3),
        ('xgb', model4)
    ],
    voting='soft'
)

In [21]:
# 학습 및 평가
voting_clf.fit(X_train, y_train)
preds = voting_clf.predict(X_vali)

# 결과 출력
print("Voting Classifier Accuracy:", accuracy_score(y_vali, preds))
print(classification_report(y_vali, preds, target_names=["Real", "Fake"]))


[LightGBM] [Info] Number of positive: 1717, number of negative: 1610
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1025
[LightGBM] [Info] Number of data points in the train set: 3327, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.516081 -> initscore=0.064344
[LightGBM] [Info] Start training from score 0.064344


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Voting Classifier Accuracy: 0.7932692307692307
              precision    recall  f1-score   support

        Real       0.83      0.75      0.79       433
        Fake       0.76      0.84      0.80       399

    accuracy                           0.79       832
   macro avg       0.80      0.79      0.79       832
weighted avg       0.80      0.79      0.79       832



In [23]:
X_test = df_insta_selected.drop(columns=["fake"])
y_test = df_insta_selected["fake"]

In [25]:
preds = voting_clf.predict(X_test)

# 결과 출력
print("Voting Classifier Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds, target_names=["Real", "Fake"]))

Voting Classifier Accuracy: 0.6954022988505747
              precision    recall  f1-score   support

        Real       0.90      0.44      0.59       348
        Fake       0.63      0.95      0.76       348

    accuracy                           0.70       696
   macro avg       0.76      0.70      0.67       696
weighted avg       0.76      0.70      0.67       696

