In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import joblib

df = pd.read_pickle('data_with_embeddings_fix.pkl')

X_text_embeddings = np.stack(df['text_embeddings'].values)  # Vector 384 chiều
X_category_id = df['category_id'].values.reshape(-1, 1)
X_days_to_trending = df['days_to_trending'].values.reshape(-1, 1)
X_video_durations = df['video_durations'].values.reshape(-1, 1)  # Feature mới
y = df['Y'].values

scaler_category = StandardScaler()
scaler_days = StandardScaler()
scaler_duration = StandardScaler()
scaler_text = StandardScaler()

X_category_id = scaler_category.fit_transform(X_category_id)
X_days_to_trending = scaler_days.fit_transform(X_days_to_trending)
X_video_durations = scaler_duration.fit_transform(X_video_durations)
X_text_embeddings = scaler_text.fit_transform(X_text_embeddings)

joblib.dump(scaler_category, 'scaler_category_xgb.pkl')
joblib.dump(scaler_days, 'scaler_days_xgb.pkl')
joblib.dump(scaler_duration, 'scaler_duration_xgb.pkl')
joblib.dump(scaler_text, 'scaler_text_xgb.pkl')

X_combined = np.hstack([X_category_id, X_days_to_trending, X_video_durations, X_text_embeddings])

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_combined, y)

X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=4,
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print(f"\nĐộ chính xác (Accuracy): {accuracy:.4f}")
print(f"F1-score (Weighted): {f1:.4f}")
print(f"Precision (Weighted): {precision:.4f}")
print(f"Recall (Weighted): {recall:.4f}")

labels = {0: "Not popular", 1: "Controversy", 2: "Decent", 3: "Overwhelming positive"}
print(classification_report(y_test, y_pred, target_names=[labels[i] for i in range(4)]))

print(f"Phân bố lớp dự đoán: {np.bincount(y_pred)}")

# Lưu mô hình
joblib.dump(xgb_model, 'xgboost_model.pkl')


Độ chính xác (Accuracy): 0.9335
F1-score (Weighted): 0.9332
Precision (Weighted): 0.9333
Recall (Weighted): 0.9335
                       precision    recall  f1-score   support

          Not popular       0.89      0.92      0.91      4655
          Controversy       0.99      1.00      0.99      4675
               Decent       0.90      0.86      0.88      4585
Overwhelming positive       0.95      0.95      0.95      4565

             accuracy                           0.93     18480
            macro avg       0.93      0.93      0.93     18480
         weighted avg       0.93      0.93      0.93     18480

Phân bố lớp dự đoán: [4778 4723 4393 4586]


['xgboost_model.pkl']