In [1]:
import pandas as pd
import ast

In [2]:
df = pd.read_csv("embedded_datasets/Word2Vec/Word2Vec_Embedded_Dataset.csv")

df["article_embedding"] = df["article_embedding"].apply(ast.literal_eval)

df = df[["article_embedding", "label"]]

In [3]:
df_generalization = pd.read_csv(
    "embedded_datasets/Word2Vec/Word2Vec_Embedded_Generalization_Dataset.csv"
)
df_generalization["article_embedding"] = df_generalization["article_embedding"].apply(
    ast.literal_eval
)
df_generalization = df_generalization[["article_embedding", "label"]]

# Training models


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [5]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Random Forest


In [6]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [None]:
X = np.array(df["article_embedding"].tolist())
y = df["label"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define hyperparameter distributions
param_distributions = {
    "n_estimators": randint(300, 700),
    "max_depth": randint(5, 20),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 20),
    "max_features": ["sqrt", "log2"],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy", "log_loss"],
}

random_search_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=param_distributions,
    n_iter=50,
    cv=3,
    scoring="accuracy",
    n_jobs=4,
)

random_search_rf.fit(X_train, y_train)

print("Best Parameters:", random_search_rf.best_params_)
print("Best Cross-Validated Accuracy:", random_search_rf.best_score_)

best_rf = random_search_rf.best_estimator_

Best Parameters: {'bootstrap': False, 'criterion': 'log_loss', 'max_depth': 19, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 14, 'n_estimators': 541}
Best Cross-Validated Accuracy: 0.8360812633321756


In [None]:
import joblib

joblib.dump(best_rf, "Word2VecModels/random_forest_model2.joblib")
print("Model saved successfully!")

Model saved successfully!


In [65]:
y_pred = best_rf.predict(X_test)

In [66]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.77      0.81      6110
           1       0.83      0.89      0.86      7750

    accuracy                           0.84     13860
   macro avg       0.84      0.83      0.83     13860
weighted avg       0.84      0.84      0.84     13860

[[4702 1408]
 [ 849 6901]]


In [68]:
X_generalization = np.array(
    df_generalization["article_embedding"].tolist()
)  # Convert list of embeddings to a 2D array
y_generalization = df_generalization["label"]

In [69]:
y_pred = rf_model.predict(X_generalization)

In [70]:
print(classification_report(y_generalization, y_pred))
print(confusion_matrix(y_generalization, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.79      0.73      3037
           1       0.74      0.60      0.66      2952

    accuracy                           0.70      5989
   macro avg       0.71      0.70      0.69      5989
weighted avg       0.70      0.70      0.70      5989

[[2412  625]
 [1182 1770]]


# XGBoost


In [4]:
from xgboost import XGBClassifier
import xgboost as xgb

In [None]:
X = np.array(df["article_embedding"].tolist())
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

model = xgb.XGBClassifier(
    device="cuda",
    objective="binary:logistic",
    eval_metric="logloss",
)

param_grid = {
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "n_estimators": [300, 500],
    "subsample": [0.8, 0.9],
    "colsample_bytree": [0.8, 0.9],
    "gamma": [0, 0.1],
    "min_child_weight": [1, 3, 5],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_log_loss",
    cv=3,
    verbose=2,
    n_jobs=3,
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Log Loss Score:", -best_score)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 500, 'subsample': 0.9}
Best Log Loss Score: 0.2735121429205592


In [None]:
import joblib

joblib.dump(best_model, "xgboost_new_model.joblib")
print("Model saved successfully!")

Model saved successfully!


In [15]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.84      0.86      6169
           1       0.87      0.90      0.89      7691

    accuracy                           0.87     13860
   macro avg       0.87      0.87      0.87     13860
weighted avg       0.87      0.87      0.87     13860

[[5178  991]
 [ 755 6936]]


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




# SVM


In [3]:
import pandas as pd
import ast
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.datasets import make_classification

In [None]:
X = np.array(df["article_embedding"].tolist())
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
svm_model = SVC()

param_grid = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto", 0.1],
}

grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    verbose=2,
    n_jobs=4,
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Accuracy:", best_score)

test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Accuracy: 0.8819540198491341
Test Accuracy: 0.8859307359307359


In [6]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87      6169
           1       0.89      0.90      0.90      7691

    accuracy                           0.89     13860
   macro avg       0.89      0.88      0.88     13860
weighted avg       0.89      0.89      0.89     13860

[[5319  850]
 [ 731 6960]]


In [7]:
import joblib

# Save the model to a file
joblib.dump(best_model, "Word2VecModels/svm_model.joblib")
print("Model saved successfully!")

Model saved successfully!


# Generalization


In [13]:
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd


df = pd.read_csv(
    "embedded_datasets/Word2Vec/Word2Vec_Embedded_Generalization_Dataset.csv"
)

In [14]:
df["article_embedding"] = df["article_embedding"].apply(ast.literal_eval)

In [15]:
df

Unnamed: 0,label,article_embedding
0,0,"[0.04956003278493881, 0.03957546502351761, 0.0..."
1,0,"[0.05896731838583946, 0.042814381420612335, 0...."
2,1,"[0.0299783106893301, 0.05943034961819649, 0.06..."
3,0,"[0.05501393973827362, 0.025511030107736588, 0...."
4,1,"[0.04850117489695549, 0.05607370659708977, 0.0..."
...,...,...
5984,1,"[0.008780215866863728, 0.029638497158885002, 0..."
5985,0,"[0.023696385324001312, 0.042197830975055695, 0..."
5986,0,"[0.03499678894877434, 0.04463842883706093, 0.0..."
5987,1,"[0.02768431045114994, 0.07343021780252457, 0.0..."


In [None]:
import numpy as np

X_gen = np.array(df["article_embedding"].tolist())


y_gen = df["label"]

In [17]:
import joblib

# Load the model from the file
model = joblib.load("Word2VecModels/random_forest_model.joblib")

In [18]:
model.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=False, max_depth=24, min_samples_leaf=3,
                       n_estimators=912, random_state=42)>

In [19]:
y_pred = model.predict(X_gen)

In [20]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_gen, y_pred))
print(confusion_matrix(y_gen, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.79      0.73      3037
           1       0.74      0.61      0.67      2952

    accuracy                           0.70      5989
   macro avg       0.71      0.70      0.70      5989
weighted avg       0.71      0.70      0.70      5989

[[2413  624]
 [1146 1806]]


In [141]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_gen, y_pred))
print(confusion_matrix(y_gen, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.95      0.82      3127
           1       0.92      0.64      0.75      3171

    accuracy                           0.79      6298
   macro avg       0.82      0.79      0.79      6298
weighted avg       0.82      0.79      0.79      6298

[[2961  166]
 [1152 2019]]


# EVAL


In [17]:
import pandas as pd
import ast
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [10]:
df = pd.read_csv("embedded_datasets/Word2Vec/Word2Vec_Embedded_Dataset.csv")

df["article_embedding"] = df["article_embedding"].apply(ast.literal_eval)

df = df[["article_embedding", "label"]]

In [11]:
df_generalization = pd.read_csv(
    "embedded_datasets/Word2Vec/Word2Vec_Embedded_Generalization_Dataset.csv"
)
df_generalization["article_embedding"] = df_generalization["article_embedding"].apply(
    ast.literal_eval
)
df_generalization = df_generalization[["article_embedding", "label"]]

In [None]:
X = np.array(df["article_embedding"].tolist())
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
X_gen = np.array(df_generalization["article_embedding"].tolist())
y_gen = df_generalization["label"]

In [14]:
model = joblib.load("Word2VecModels/random_forest_model2.joblib")

In [None]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.8414
Precision: 0.8353
Recall: 0.8896
F1 Score: 0.8616


In [19]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.78      0.81      6169
           1       0.84      0.89      0.86      7691

    accuracy                           0.84     13860
   macro avg       0.84      0.84      0.84     13860
weighted avg       0.84      0.84      0.84     13860

[[4820 1349]
 [ 849 6842]]


In [None]:
y_pred_gen = model.predict(X_gen)

accuracy = accuracy_score(y_gen, y_pred_gen)
precision = precision_score(y_gen, y_pred_gen)
recall = recall_score(y_gen, y_pred_gen)
f1 = f1_score(y_gen, y_pred_gen)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.6978
Precision: 0.7361
Recall: 0.6030
F1 Score: 0.6629


In [21]:
print(classification_report(y_gen, y_pred_gen))
print(confusion_matrix(y_gen, y_pred_gen))

              precision    recall  f1-score   support

           0       0.67      0.79      0.73      3037
           1       0.74      0.60      0.66      2952

    accuracy                           0.70      5989
   macro avg       0.70      0.70      0.69      5989
weighted avg       0.70      0.70      0.69      5989

[[2399  638]
 [1172 1780]]
