In [1]:
import numpy as np
import pandas as pd
import ast

In [2]:
df = pd.read_csv("embedded_datasets/GloVe/GloVe_Embedded_Dataset.csv")
df["article_embedding"] = df["article_embedding"].apply(ast.literal_eval)
df = df[["article_embedding", "label"]]

# Training models


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Random Forest


In [4]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
X = np.array(df["article_embedding"].tolist())
y = df["label"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distributions = {
    "n_estimators": randint(300, 700),
    "max_depth": randint(5, 20),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 20),
    "max_features": ["sqrt", "log2"],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy", "log_loss"],
}

random_search_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=param_distributions,
    n_iter=50,
    cv=3,
    scoring="accuracy",
    n_jobs=4,
)

random_search_rf.fit(X_train, y_train)

print("Best Parameters:", random_search_rf.best_params_)
print("Best Cross-Validated Accuracy:", random_search_rf.best_score_)

best_rf = random_search_rf.best_estimator_

Best Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 19, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 11, 'n_estimators': 336}
Best Cross-Validated Accuracy: 0.8491413712906204


In [None]:
import joblib

joblib.dump(best_rf, "GloVeModels/random_forest2_model.joblib")
print("Model saved successfully!")

Model saved successfully!


In [None]:
y_pred = best_rf.predict(X_test)

In [22]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.80      0.83      6169
           1       0.85      0.90      0.87      7691

    accuracy                           0.85     13860
   macro avg       0.86      0.85      0.85     13860
weighted avg       0.85      0.85      0.85     13860

[[4940 1229]
 [ 789 6902]]


# XGBoost


In [14]:
from xgboost import XGBClassifier
import xgboost as xgb

In [None]:
X = np.array(df["article_embedding"].tolist())
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

model = xgb.XGBClassifier(
    device="cuda",
    objective="binary:logistic",
    eval_metric="logloss",
)

param_grid = {
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "n_estimators": [300, 500, 700],
    "subsample": [0.8, 0.9],
    "colsample_bytree": [0.8, 0.9],
    "gamma": [0, 0.1],
    "min_child_weight": [1, 3, 5],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_log_loss",
    cv=3,
    verbose=2,
    n_jobs=3,
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Log Loss Score:", -best_score)  # Convert back to positive log loss

Fitting 3 folds for each of 648 candidates, totalling 1944 fits
Best Parameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 500, 'subsample': 0.9}
Best Log Loss Score: 0.2480327198419424


In [18]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.86      0.87      6169
           1       0.89      0.91      0.90      7691

    accuracy                           0.89     13860
   macro avg       0.89      0.89      0.89     13860
weighted avg       0.89      0.89      0.89     13860

[[5299  870]
 [ 680 7011]]


In [19]:
import joblib

# Save the model to a file
joblib.dump(best_model, "GloVeModels/xgboost_model.joblib")
print("Model saved successfully!")

Model saved successfully!


# SVM


In [3]:
import pandas as pd
import ast
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.datasets import make_classification

In [None]:
X = np.array(df["article_embedding"].tolist())
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
svm_model = SVC()

param_grid = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto", 0.1],
}

grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    verbose=2,
    n_jobs=4,
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Accuracy:", best_score)

test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Accuracy: 0.8884479831730584
Test Accuracy: 0.8947330447330447


In [7]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      6169
           1       0.89      0.92      0.91      7691

    accuracy                           0.89     13860
   macro avg       0.89      0.89      0.89     13860
weighted avg       0.89      0.89      0.89     13860

[[5338  831]
 [ 628 7063]]


In [8]:
import joblib

# Save the model to a file
joblib.dump(best_model, "GloVeModels/svm_model.joblib")
print("Model saved successfully!")

Model saved successfully!


# Generalization


In [9]:
df_generalization = pd.read_csv(
    "embedded_datasets/GloVe/GloVe_Embedded_Generalization_Dataset.csv"
)

In [10]:
df_generalization["article_embedding"] = df_generalization["article_embedding"].apply(
    ast.literal_eval
)

In [None]:
X = np.array(df_generalization["article_embedding"].tolist())
y = df_generalization["label"]

In [None]:
import joblib

model = joblib.load("GloVeModels/svm_model.joblib")

In [24]:
y_pred = rf_model.predict(X)

In [25]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y, y_pred))
print(confusion_matrix(y, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.78      0.71      3037
           1       0.72      0.59      0.65      2952

    accuracy                           0.68      5989
   macro avg       0.69      0.68      0.68      5989
weighted avg       0.69      0.68      0.68      5989

[[2357  680]
 [1215 1737]]
