In [1]:
import numpy as np
import pandas as pd
import ast

In [2]:
df = pd.read_csv("embedded_datasets/RoBERTa/RoBERTa_Embedded_Dataset.csv")
df["article_embedding"] = df["article_embedding"].apply(ast.literal_eval)
df = df[["article_embedding", "label"]]

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# XGBoost


In [4]:
from xgboost import XGBClassifier
import xgboost as xgb

In [None]:
X = np.array(df["article_embedding"].tolist())
y = df["label"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

model = xgb.XGBClassifier(
    device="cuda",
    objective="binary:logistic",
    eval_metric="logloss",
)

param_grid = {
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "n_estimators": [300, 500, 700],
    "subsample": [0.8, 0.9],
    "colsample_bytree": [0.8, 0.9],
    "gamma": [0, 0.1],
    "min_child_weight": [1, 3, 5],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_log_loss",
    cv=3,
    verbose=2,
    n_jobs=1,
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Log Loss Score:", -best_score)

In [10]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.90      0.91      6036
           1       0.93      0.93      0.93      7824

    accuracy                           0.92     13860
   macro avg       0.92      0.92      0.92     13860
weighted avg       0.92      0.92      0.92     13860

[[5459  577]
 [ 545 7279]]


In [18]:
import joblib

# Save the model to a file
joblib.dump(best_model, "RoBERTaModels/xgboost_model.joblib")
print("Model saved successfully!")

Model saved successfully!


# Random Forest


In [4]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
X = np.array(df["article_embedding"].tolist())
y = df["label"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distributions = {
    "n_estimators": randint(300, 700),
    "max_depth": randint(5, 20),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 20),
    "max_features": ["sqrt", "log2"],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy", "log_loss"],
}

random_search_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=param_distributions,
    n_iter=50,
    cv=3,
    scoring="accuracy",
    n_jobs=4,
)

random_search_rf.fit(X_train, y_train)

print("Best Parameters:", random_search_rf.best_params_)
print("Best Cross-Validated Accuracy:", random_search_rf.best_score_)

best_rf = random_search_rf.best_estimator_

Best Parameters: {'bootstrap': False, 'criterion': 'log_loss', 'max_depth': 13, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 496}
Best Cross-Validated Accuracy: 0.8911937535740231


In [7]:
import joblib

# Save the model to a file
joblib.dump(best_rf, "RoBERTaModels/random_forest_model2.joblib")
print("Model saved successfully!")

Model saved successfully!


In [8]:
y_pred = best_rf.predict(X_test)

In [9]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.86      0.87      6036
           1       0.90      0.91      0.91      7824

    accuracy                           0.89     13860
   macro avg       0.89      0.89      0.89     13860
weighted avg       0.89      0.89      0.89     13860

[[5218  818]
 [ 673 7151]]


# SVM


In [4]:
import pandas as pd
import ast
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.datasets import make_classification

In [None]:
X = np.array(df["article_embedding"].tolist())
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
svm_model = SVC()

param_grid = {
    "C": [1],
    "kernel": ["rbf"],
}


grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    verbose=2,
    n_jobs=4,
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Accuracy:", best_score)

test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Parameters: {'C': 1, 'kernel': 'rbf'}
Best Accuracy: 0.9048847607677901
Test Accuracy: 0.9093795093795094


In [47]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.88      0.89      6036
           1       0.91      0.93      0.92      7824

    accuracy                           0.91     13860
   macro avg       0.91      0.91      0.91     13860
weighted avg       0.91      0.91      0.91     13860

[[5321  715]
 [ 541 7283]]


In [48]:
import joblib

# Save the model to a file
joblib.dump(best_model, "RoBERTaModels/svm_model.joblib")
print("Model saved successfully!")

Model saved successfully!


# Generalization


In [10]:
df_generalization = pd.read_csv(
    "embedded_datasets/RoBERTa/RoBERTa_Embedded_Generalization_Dataset.csv"
)

In [11]:
df_generalization["article_embedding"] = df_generalization["article_embedding"].apply(
    ast.literal_eval
)

In [12]:
df_generalization.label.value_counts()

label
0    3038
1    2952
Name: count, dtype: int64

In [None]:
X = np.array(df_generalization["article_embedding"].tolist())
y = df_generalization["label"]

In [None]:
import joblib

model = joblib.load("RoBERTaModels/svm_model.joblib")

In [14]:
y_pred = best_rf.predict(X)

In [15]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y, y_pred))
print(confusion_matrix(y, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.87      0.82      3038
           1       0.84      0.74      0.79      2952

    accuracy                           0.81      5990
   macro avg       0.81      0.80      0.80      5990
weighted avg       0.81      0.81      0.80      5990

[[2628  410]
 [ 754 2198]]
