In [1]:
# --- Standard library ---
import json
import logging
import sys
from pathlib import Path

# --- Third-party ---
import numpy as np
import pandas as pd

# Put the parent directory on the import path; the notebook imports the
# `fake_news` package from there in the next cell.
sys.path.append("..")

# Show INFO-level messages from the root logger in the cell output.
logging.getLogger().setLevel(logging.INFO)

In [2]:
from fake_news.classifier_evaluation import evaluate_classifiers
from fake_news.classifiers import (
    ConvolutionalNeuralNetworkClassifier,
    LogisticRegressionNewsClassifier,
    LSTMClassifier,
    MultinomialNaiveBayesClassifier,
    RandomForestClassifierClass,
    RecurrentNeuralNetworkClassifier,
    SupportVectorMachineClassifier
) 

In [3]:
# Registry of selectable models: short key -> (classifier class, family tag).
# The tag is "ml" for logistic / naive Bayes / random forest / SVM and "dl" for
# CNN / RNN / LSTM. NOTE(review): how evaluate_classifiers interprets the tag is
# not visible in this notebook - confirm against fake_news.classifier_evaluation.
CLASSIFIERS_DICT = dict(
    logistic=(LogisticRegressionNewsClassifier, "ml"),
    naive_bayes=(MultinomialNaiveBayesClassifier, "ml"),
    random_forest=(RandomForestClassifierClass, "ml"),
    svm=(SupportVectorMachineClassifier, "ml"),
    cnn=(ConvolutionalNeuralNetworkClassifier, "dl"),
    rnn=(RecurrentNeuralNetworkClassifier, "dl"),
    lstm=(LSTMClassifier, "dl"),
)

# Saved model file for every classifier key, relative to the notebook directory.
# The keys mirror those of CLASSIFIERS_DICT; the values are later passed to
# evaluate_classifiers as `orig_classifier_paths`.
ORIG_CLASSIFIER_PATHS_DICT = dict(
    logistic=R"../fake_news/classifiers/logisticregression.pkl",
    naive_bayes=R"../fake_news/classifiers/naivebayes.pkl",
    random_forest=R"../fake_news/classifiers/rf_model.pkl",
    svm=R"../fake_news/classifiers/svm_model.pkl",
    cnn=R"../fake_news/classifiers/cnn.keras",
    rnn=R"../fake_news/classifiers/rnn.keras",
    lstm=R"../fake_news/classifiers/lstm_model.keras",
)

In [4]:
# Folder holding the cleaned WELFake splits and the synthetic-article CSVs.
DATASET_DIR = Path("../data")

# Read the train split first, then the test split.
train_df, test_df = (
    pd.read_csv(DATASET_DIR / csv_name)
    for csv_name in ("WELFake_clean_train.csv", "WELFake_clean_test.csv")
)

# Pickled tokenizers, given as plain strings in the order (ml, dl).
TOKENIZERS_DIR = Path("../fake_news/classifiers/tokenizers")
orig_tokenizer_paths = tuple(
    str(TOKENIZERS_DIR / pickle_name)
    for pickle_name in ("ml_tokenizer.pickle", "dl_tokenizer.pickle")
)

# File names (inside DATASET_DIR) of the synthetic datasets to evaluate with.
synthetic_names = ["tinyllama_real_articles.csv"] ## CHANGE SYNTHETIC DATA HERE

In [5]:
# Keys of the models to run; each must be present in CLASSIFIERS_DICT and
# ORIG_CLASSIFIER_PATHS_DICT.
classifiers_to_evaluate = ["rnn"] ## CHANGE MODELS HERE

# Resolve every selected key to its registry entry and its saved-model path,
# keeping both lists in the same order as `classifiers_to_evaluate`.
classifiers = [CLASSIFIERS_DICT[key] for key in classifiers_to_evaluate]
orig_classifier_paths = [ORIG_CLASSIFIER_PATHS_DICT[key] for key in classifiers_to_evaluate]

# Collected output: synthetic file name -> classifier key -> {"orig", "combined"}.
total_results = {}

# NOTE(review): the recorded run of this cell ended with
# "ValueError: Found input variables with inconsistent numbers of samples"
# right after the "Calculating metrics" log line. That message is not emitted by
# this cell, so the cause is presumably inside evaluate_classifiers - confirm there.
for synthetic_name in synthetic_names:
    print("=== Started working on", synthetic_name)
    synth_df = pd.read_csv(DATASET_DIR / synthetic_name)

    # The original tokenizer paths are deliberately passed for both the
    # original and the combined tokenizer arguments.
    results = evaluate_classifiers(
        classifiers=classifiers,
        orig_classifier_paths=orig_classifier_paths,
        orig_tokenizer_paths=orig_tokenizer_paths,
        combined_tokenizer_paths=orig_tokenizer_paths,
        train_df=train_df,
        synth_df=synth_df,
        test_df=test_df,
        metrics=["acc", "auc", "f1"],
    )

    # Pair each classifier key with its result entry; element 0 of an entry is
    # stored under "orig" and element 1 under "combined".
    total_results[synthetic_name] = dict(
        zip(
            classifiers_to_evaluate,
            ({"orig": scores[0], "combined": scores[1]} for scores in results),
        )
    )

=== Started working on tinyllama_real_articles.csv


INFO:root:Loading tokenizers
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
INFO:root:Started evaluating <class 'fake_news.classifiers.RNN.RecurrentNeuralNetworkClassifier'>
INFO:root:Fitting and predicting on original data
  saveable.load_own_variables(weights_store.get(inner_path))


Model loaded successfully from: ../fake_news/classifiers/rnn.keras
[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 37ms/step


INFO:root:Fitting and predicting on combined data


Epoch 1/10
[1m733/733[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 160ms/step - accuracy: 0.7455 - loss: 0.4942 - val_accuracy: 0.8700 - val_loss: 0.3040
Epoch 2/10
[1m733/733[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 162ms/step - accuracy: 0.9169 - loss: 0.2132 - val_accuracy: 0.9267 - val_loss: 0.2053
Epoch 3/10
[1m733/733[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 151ms/step - accuracy: 0.9537 - loss: 0.1232 - val_accuracy: 0.6892 - val_loss: 0.5394
Epoch 4/10
[1m733/733[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 157ms/step - accuracy: 0.8664 - loss: 0.2961 - val_accuracy: 0.9038 - val_loss: 0.2551
Epoch 5/10
[1m733/733[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 163ms/step - accuracy: 0.9680 - loss: 0.0924 - val_accuracy: 0.8968 - val_loss: 0.2825
Epoch 6/10
[1m733/733[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 149ms/step - accuracy: 0.9826 - loss: 0.0562 - val_accuracy: 0.9096 - val_loss: 0.2990
Epoc

INFO:root:Calculating metrics


ValueError: Found input variables with inconsistent numbers of samples: [14159, 1]

In [6]:
# Persist the collected metrics as pretty-printed JSON in the working directory.
# Serialize BEFORE opening the file: opening with "w" truncates immediately, so
# if serialization raised afterwards (e.g. a value json cannot encode), an
# existing total_results.json from a previous run would be left empty or partial.
serialized_results = json.dumps(total_results, indent=4)
with open("total_results.json", "w", encoding="utf-8") as f:
    f.write(serialized_results)