In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.append("..")

from pathlib import Path
import json
import logging

logging.getLogger().setLevel(logging.INFO)

In [2]:
from fake_news.classifier_evaluation import evaluate_classifiers
from fake_news.classifiers import (
    ConvolutionalNeuralNetworkClassifier,
    LogisticRegressionNewsClassifier,
    LSTMClassifier,
    MultinomialNaiveBayesClassifier,
    RandomForestClassifierClass,
    RecurrentNeuralNetworkClassifier,
    SupportVectorMachineClassifier
) 

In [3]:
CLASSIFIERS_DICT = {
    "logistic": (LogisticRegressionNewsClassifier, "ml"),
    "naive_bayes": (MultinomialNaiveBayesClassifier, "ml"),
    "random_forest": (RandomForestClassifierClass, "ml"),
    "svm": (SupportVectorMachineClassifier, "ml"),
    "cnn": (ConvolutionalNeuralNetworkClassifier, "dl"),
    "rnn": (RecurrentNeuralNetworkClassifier, "dl"),
    "lstm": (LSTMClassifier, "dl"),
}

ORIG_CLASSIFIER_PATHS_DICT = {
    "logistic": R"../fake_news/classifiers/logisticregression.pkl",
    "naive_bayes": R"../fake_news/classifiers/naivebayes.pkl",
    "random_forest": R"../fake_news/classifiers/rf_model.pkl",
    "svm": R"../fake_news/classifiers/svm_model.pkl",
    "cnn": R"../fake_news/classifiers/cnn.keras",
    "rnn": R"../fake_news/classifiers/rnn.keras",
    "lstm": R"../fake_news/classifiers/lstm_model.keras",
}

In [7]:
DATASET_DIR = Path("../data")
train_df = pd.read_csv(DATASET_DIR / "WELFake_clean_train.csv")
test_df = pd.read_csv(DATASET_DIR / "WELFake_clean_test.csv")

TOKENIZERS_DIR = Path("../fake_news/classifiers/tokenizers")
orig_tokenizer_paths = (
    str(TOKENIZERS_DIR / "ml_tokenizer.pickle"),
    str(TOKENIZERS_DIR / "dl_tokenizer.pickle") 
)
synthetic_names = ["tinyllama_real_articles.csv"] ## CHANGE SYNTHETIC DATA HERE

In [16]:
classifiers_to_evaluate = ["logistic", "naive_bayes", "random_forest", "svm"] ## CHANGE MODELS HERE
classifiers = [CLASSIFIERS_DICT[name] for name in classifiers_to_evaluate]
orig_classifier_paths = [ORIG_CLASSIFIER_PATHS_DICT[name] for name in classifiers_to_evaluate]

total_results = {}

for synthetic_name in synthetic_names:
    print("=== Started working on", synthetic_name)
    synth_df = pd.read_csv(DATASET_DIR / synthetic_name)

    results = evaluate_classifiers(
        classifiers=classifiers, 
        train_df=train_df,
        synth_df=synth_df,
        test_df=test_df,
        metrics=["acc", "auc", "f1"],
        orig_tokenizer_paths=orig_tokenizer_paths,
        combined_tokenizer_paths=orig_tokenizer_paths,
        orig_classifier_paths=orig_classifier_paths
    )

    total_results[synthetic_name] = {
        classif_name: {
            "orig": classif_result[0],
            "combined": classif_result[1]
        } 
        for classif_name, classif_result in zip(
            classifiers_to_evaluate, results
        )
    }

INFO:root:Loading tokenizers


=== Started working on tinyllama_real_articles.csv


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
INFO:root:Started evaluating <class 'fake_news.classifiers.logisticRegression.LogisticRegressionNewsClassifier'>
INFO:root:Fitting and predicting on original data


Model loaded successfully from: ../fake_news/classifiers/logisticregression.pkl


INFO:root:Fitting and predicting on combined data
INFO:root:Calculating metrics
INFO:root:Started evaluating <class 'fake_news.classifiers.NaiveBayes.MultinomialNaiveBayesClassifier'>
INFO:root:Fitting and predicting on original data


Model loaded successfully from: ../fake_news/classifiers/naivebayes.pkl


INFO:root:Fitting and predicting on combined data
INFO:root:Calculating metrics
INFO:root:Started evaluating <class 'sklearn.ensemble._forest.RandomForestClassifier'>
INFO:root:Fitting and predicting on original data


AttributeError: 'RandomForestClassifier' object has no attribute 'load_model'

In [6]:
with open("total_results_1.json", "w") as f:
    json.dump(total_results, f, indent=4)