In [1]:
import numpy as np
import pandas as pd
import ast

In [2]:
df = pd.read_csv("embedded_datasets/FastText/FastText_Embedded_Dataset.csv")
df["article_embedding"] = df["article_embedding"].apply(ast.literal_eval)
df = df[["article_embedding", "label"]]

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Random Forest


In [4]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
X = np.array(df["article_embedding"].tolist())
y = df["label"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distributions = {
    "n_estimators": randint(300, 700),
    "max_depth": randint(5, 20),
    "min_samples_split": randint(2, 20),
    "min_samples_leaf": randint(1, 20),
    "max_features": ["sqrt", "log2"],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy", "log_loss"],
}

random_search_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=param_distributions,
    n_iter=50,
    cv=3,
    scoring="accuracy",
    n_jobs=4,
)

random_search_rf.fit(X_train, y_train)

print("Best Parameters:", random_search_rf.best_params_)
print("Best Cross-Validated Accuracy:", random_search_rf.best_score_)

best_rf = random_search_rf.best_estimator_

Best Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 7, 'n_estimators': 690}
Best Cross-Validated Accuracy: 0.8657731215905814


In [7]:
print("Evaluating based on test split")
y_pred = best_rf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Evaluating based on test split
              precision    recall  f1-score   support

           0       0.89      0.81      0.84      6169
           1       0.86      0.92      0.89      7691

    accuracy                           0.87     13860
   macro avg       0.87      0.86      0.86     13860
weighted avg       0.87      0.87      0.87     13860

[[4974 1195]
 [ 637 7054]]


In [8]:
import joblib

# Save the model to a file
joblib.dump(best_rf, "FastTextModels/random_forest_model2.joblib")
print("Model saved successfully!")

Model saved successfully!


# XGBoost


In [4]:
from xgboost import XGBClassifier
import xgboost as xgb

In [None]:
X = np.array(df["article_embedding"].tolist())
y = df["label"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

model = xgb.XGBClassifier(
    device="cuda",
    objective="binary:logistic",
    eval_metric="logloss",
)

param_grid = {
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "n_estimators": [300, 500, 700],
    "subsample": [0.8, 0.9],
    "colsample_bytree": [0.8, 0.9],
    "gamma": [0, 0.1],
    "min_child_weight": [1, 3, 5],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_log_loss",
    cv=3,
    verbose=2,
    n_jobs=3,
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Log Loss Score:", -best_score)

Fitting 3 folds for each of 648 candidates, totalling 1944 fits
Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 500, 'subsample': 0.8}
Best Log Loss Score: 0.22503331308759436


In [8]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.87      0.88      6169
           1       0.90      0.92      0.91      7691

    accuracy                           0.90     13860
   macro avg       0.90      0.89      0.90     13860
weighted avg       0.90      0.90      0.90     13860

[[5358  811]
 [ 619 7072]]


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [18]:
import joblib

# Save the model to a file
joblib.dump(best_model, "FastTextModels/xgboost_model.joblib")
print("Model saved successfully!")

Model saved successfully!


# SVM


In [4]:
import pandas as pd
import ast
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.datasets import make_classification

In [None]:
X = np.array(df["article_embedding"].tolist())
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
svm_model = SVC()


param_grid = {
    "C": [0.1, 1, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto", 0.1],
}


grid_search = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    verbose=2,
    n_jobs=4,
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Accuracy:", best_score)

test_accuracy = best_model.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Accuracy: 0.8915506383337704
Test Accuracy: 0.8931457431457431


In [7]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88      6169
           1       0.89      0.92      0.91      7691

    accuracy                           0.89     13860
   macro avg       0.89      0.89      0.89     13860
weighted avg       0.89      0.89      0.89     13860

[[5293  876]
 [ 605 7086]]


In [9]:
import joblib

# Save the model to a file
joblib.dump(best_model, "FastTextModels/svm_model.joblib")
print("Model saved successfully!")

Model saved successfully!


# Generalization


In [9]:
df_generalization = pd.read_csv(
    "embedded_datasets/FastText/FastText_Embedded_Generalization_Dataset.csv"
)

In [10]:
df_generalization["article_embedding"] = df_generalization["article_embedding"].apply(
    ast.literal_eval
)

In [None]:
X = np.array(df_generalization["article_embedding"].tolist())
y = df_generalization["label"]

In [14]:
import joblib

# Load the model from the file
model = joblib.load("FastTextModels/random_forest_model.joblib")

In [15]:
y_pred = model.predict(X)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y, y_pred))
print(confusion_matrix(y, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.77      0.75      3037
           1       0.75      0.72      0.74      2952

    accuracy                           0.75      5989
   macro avg       0.75      0.74      0.75      5989
weighted avg       0.75      0.75      0.75      5989

[[2341  696]
 [ 829 2123]]


In [16]:
df_generalization

Unnamed: 0.1,Unnamed: 0,label,tokenized_text,article_embedding
0,0,0,"[['Daniel', 'Greenfield', ',', 'a', 'Shillman'...","[0.000995249138213694, -0.009249774739146233, ..."
1,1,0,"[['Google', 'Pinterest', 'Digg', 'Linkedin', '...","[0.006638066843152046, -0.027202058583498, 0.0..."
2,2,1,"[['U.S.', 'Secretary', 'of', 'State', 'John', ...","[0.0005832619499415159, -0.0407441109418869, 0..."
3,3,0,"[['—', 'Kaydee', 'King', '(', '@', 'KaydeeKing...","[0.005948025733232498, -0.03658789396286011, 0..."
4,4,1,"[['It', ""'s"", 'primary', 'day', 'in', 'New', '...","[-0.008395880460739136, -0.04971924051642418, ..."
...,...,...,...,...
5984,5985,1,"[['The', 'State', 'Department', 'told', 'the',...","[-0.009721430018544197, -0.033943723887205124,..."
5985,5986,0,"[['The', ""'P'"", 'in', 'PBS', 'Should', 'Stand'...","[-0.003399275941774249, -0.019133001565933228,..."
5986,5987,0,"[['Anti-Trump', 'Protesters', 'Are', 'Tools', ...","[-0.001935584587045014, -0.02292737364768982, ..."
5987,5988,1,"[['ADDIS', 'ABABA', ',', 'Ethiopia', '—Preside...","[-0.0008060219115577638, -0.024534158408641815..."
