# Fake News Detection : Cross-dataset evaluation

In [18]:
import os
import pickle
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, make_scorer, accuracy_score, precision_score, recall_score, f1_score

## 1. Load data

### ISOT dataset

In [19]:
df_fake = pd.read_csv("data/Fake.csv")
df_true = pd.read_csv("data/True.csv")

df_fake["label"] = 0 # label 0 for fake news
df_true["label"] = 1 # label 1 for real news

# Combine the two datasets
df_isot = pd.concat([df_fake, df_true]).sample(frac=1, random_state=42).reset_index(drop=True)

# Split the data
X_isot = df_isot["text"]
y_isot = df_isot["label"]

### Fake or Real dataset

In [20]:
df_for = pd.read_csv("data/fake_or_real_news.csv")
# Drop first column
df_for = df_for.drop(df_for.columns[0], axis=1)
# Label 0 for fake news and label 1 for real news
df_for["label"] = df_for["label"].map({"FAKE": 0, "REAL": 1})
# Split the data
X_for = df_for["text"]
y_for = df_for["label"]

## 2. Load features

In [21]:
def load_feature_dict(folder_path):
    feature_dict = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".pkl"):
            feature_name = filename.replace("_features.pkl", "")
            with open(os.path.join(folder_path, filename), "rb") as f:
                feature_dict[feature_name] = pickle.load(f)
    return feature_dict

In [22]:
features_dict_isot = load_feature_dict("features/isot")
features_dict_for = load_feature_dict("features/FoR")

## 3. Cross-dataset evaluation

In [23]:
def cross_dataset_evaluation(features_dict_test, y_test, model_dir="models/isot"):
    results = []

    for feature_name, X_test in features_dict_test.items():
        for model_file in os.listdir(model_dir):
            if model_file.startswith(feature_name):
                model_path = os.path.join(model_dir, model_file)
                model = joblib.load(model_path)
                y_pred = model.predict(X_test)
                results.append({
                    "feature": feature_name,
                    "model": model_file.replace(f"{feature_name}_", "").replace(".joblib", ""),
                    "accuracy": accuracy_score(y_test, y_pred),
                    "precision": precision_score(y_test, y_pred, average="weighted", zero_division=0),
                    "recall": recall_score(y_test, y_pred, average="weighted", zero_division=0),
                    "f1_score": f1_score(y_test, y_pred, average="weighted", zero_division=0),
                })

    return pd.DataFrame(results)

### 3.1. Train on ISOT and test on Fake or Real

In [24]:
results_test_for = cross_dataset_evaluation(features_dict_for, y_for, "models/isot")

In [25]:
results_test_for

Unnamed: 0,feature,model,accuracy,precision,recall,f1_score
0,TF-IDF,GradientBoosting,0.518074,0.554217,0.518074,0.42323
1,TF-IDF,AdaBoost,0.498185,0.482712,0.498185,0.347195
2,TF-IDF,LogisticRegression,0.513496,0.598955,0.513496,0.380674
3,TF-IDF,RandomForest,0.521547,0.57038,0.521547,0.422378
4,TF-IDF,MLPClassifier,0.542068,0.562889,0.542068,0.501499
5,Linguistic,RandomForest,0.556275,0.591201,0.556275,0.509875
6,Linguistic,LogisticRegression,0.506709,0.507697,0.506709,0.493437
7,Linguistic,AdaBoost,0.552328,0.575824,0.552328,0.515334
8,Linguistic,MLPClassifier,0.581531,0.594831,0.581531,0.566612
9,Linguistic,GradientBoosting,0.562747,0.59549,0.562747,0.52226


In [16]:
results_test_for.to_csv("results/cross_results_isot_train_FoR_test.csv", index=False)

### 3.2. Train on Fake or Real and test on ISOT

In [26]:
results_test_isot = cross_dataset_evaluation(features_dict_isot, y_isot, "models/FoR")

In [27]:
results_test_isot

Unnamed: 0,feature,model,accuracy,precision,recall,f1_score
0,TF-IDF,GradientBoosting,0.557664,0.612199,0.557664,0.468052
1,TF-IDF,AdaBoost,0.553744,0.554142,0.553744,0.530743
2,TF-IDF,LogisticRegression,0.521449,0.487774,0.521449,0.370285
3,TF-IDF,RandomForest,0.542229,0.54469,0.542229,0.494898
4,TF-IDF,MLPClassifier,0.499399,0.473944,0.499399,0.437391
5,Linguistic,RandomForest,0.56272,0.613313,0.56272,0.48155
6,Linguistic,LogisticRegression,0.620829,0.667166,0.620829,0.58113
7,Linguistic,AdaBoost,0.596441,0.647563,0.596441,0.542458
8,Linguistic,MLPClassifier,0.651298,0.656752,0.651298,0.644034
9,Linguistic,GradientBoosting,0.560604,0.608013,0.560604,0.47903


In [17]:
results_test_isot.to_csv("results/cross_results_FoR_train_isot_test.csv", index=False)