# **NAIVE BAYES**

In [1]:
import os

# Index of the GPU this notebook is allowed to see: "0" or "1".
# NOTE(review): this is set before `import torch`, presumably so that CUDA only
# ever enumerates the selected device -- confirm that ordering is required here.
CUDA_DEVICE_INDEX = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_DEVICE_INDEX

In [2]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# NOTE(review): none of the visible cells pass `device` to the scikit-learn
# pipeline; it may only matter to cells outside this view.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Using device:", device)

Using device: cuda


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, confusion_matrix

In [4]:
# -------------
# Dataset paths
# -------------

# Every corpus lives in its own sub-folder of this (relative) root directory.
DATASETS_DIR = "datasets"

DATA_PATH_CELEBRITY = f"{DATASETS_DIR}/Celebrity/df_celebrity.csv"
DATA_PATH_CIDII = f"{DATASETS_DIR}/CIDII/df_cidii.csv"
DATA_PATH_FAKES = f"{DATASETS_DIR}/FaKES/df_fakes.csv"
DATA_PATH_FAKEVSATIRE = f"{DATASETS_DIR}/FakeVsSatire/df_fakevssatire.csv"
DATA_PATH_HORNE = f"{DATASETS_DIR}/Horne/df_horne_all.csv"
DATA_PATH_INFODEMIC = f"{DATASETS_DIR}/Infodemic/df_infodemic.csv"
DATA_PATH_ISOT = f"{DATASETS_DIR}/ISOT/df_isot.csv"
DATA_PATH_KAGGLE_CLEMENT = f"{DATASETS_DIR}/Kaggle_clement/df_kaggle_clement_def.csv"
DATA_PATH_KAGGLE_MEG = f"{DATASETS_DIR}/Kaggle_meg/fake.csv"
DATA_PATH_LIAR_PLUS = f"{DATASETS_DIR}/LIAR_PLUS/df_liar_plus_complete.csv"
DATA_PATH_POLITIFACT = f"{DATASETS_DIR}/Politifact/df_politifact.csv"
DATA_PATH_UNIPI_NDF = f"{DATASETS_DIR}/Unipi_NDF/df_ndf.csv"

In [5]:
# ----------------------------
# Load and preprocess datasets
# ----------------------------

def _keep_text_and_labels(frame):
    """Return only the `texts`/`labels` columns, dropping rows where either one is NaN."""
    return frame[["texts", "labels"]].dropna(subset=["texts", "labels"])


def _load_tsv_corpus(path):
    """Read a tab-separated UTF-8 corpus and reduce it to clean `texts`/`labels` columns."""
    return _keep_text_and_labels(pd.read_csv(path, sep="\t", encoding="utf-8"))


def _merge_title_and_text(frame):
    """Join `title` and `text` into one string per row; rows with neither become NaN.

    A bare `astype(str)` turns a missing value into the literal token "nan", which
    ends up in the text and makes the later `dropna` on `texts` a no-op. Missing
    parts are therefore replaced by an empty string before concatenation.
    """
    title = frame["title"].astype(str).where(frame["title"].notna(), "")
    body = frame["text"].astype(str).where(frame["text"].notna(), "")
    merged = title + " " + body
    # whitespace-only results mean both parts were missing -> mark as NaN so they get dropped
    return merged.where(merged.str.strip() != "")


# Celebrity
dfCelebrity = _load_tsv_corpus(DATA_PATH_CELEBRITY)
# CIDII
dfCidii = _load_tsv_corpus(DATA_PATH_CIDII)
# FaKES
dfFakes = _load_tsv_corpus(DATA_PATH_FAKES)
# FakeVsSatire
dfFakeVsSatire = _load_tsv_corpus(DATA_PATH_FAKEVSATIRE)
# Horne
dfHorne = _load_tsv_corpus(DATA_PATH_HORNE)
# Infodemic
dfInfodemic = _load_tsv_corpus(DATA_PATH_INFODEMIC)
# ISOT
dfIsot = _load_tsv_corpus(DATA_PATH_ISOT)
# Kaggle_clement (comma-separated; title and body are stored in separate columns)
dfKaggleClement = pd.read_csv(DATA_PATH_KAGGLE_CLEMENT, encoding="utf-8")
dfKaggleClement = dfKaggleClement.assign(texts=_merge_title_and_text(dfKaggleClement)) # merge title and text
dfKaggleClement = _keep_text_and_labels(dfKaggleClement)
# Kaggle_meg (comma-separated; no label column, so one is derived from spam_score)
dfKaggleMeg = pd.read_csv(DATA_PATH_KAGGLE_MEG, encoding="utf-8")
dfKaggleMeg = dfKaggleMeg.dropna(subset=["spam_score"]) # a missing score must not silently become label 0
dfKaggleMeg = dfKaggleMeg.assign(
    texts=_merge_title_and_text(dfKaggleMeg),              # merge title and text
    labels=(dfKaggleMeg["spam_score"] > 0.5).astype(int),  # binary labels: 1 if spam_score > 0.5 else 0
)
dfKaggleMeg = _keep_text_and_labels(dfKaggleMeg)
# LIAR_PLUS
dfLiarPlus = _load_tsv_corpus(DATA_PATH_LIAR_PLUS)
# Politifact
dfPolitifact = _load_tsv_corpus(DATA_PATH_POLITIFACT)
# Unipi_NDF
dfNDF = _load_tsv_corpus(DATA_PATH_UNIPI_NDF)

## VERSION 1: Fine-Tuning Only

In [6]:
# --------------------------
# Dataset splitting function
# --------------------------

def split_dataset(df, test_size=0.2, val_size=0.2, random_state=42):
    """
    Splits a dataset into stratified train / validation / test partitions.

    Args:
        df (pd.DataFrame): Frame with a "texts" column and a "labels" column.
        test_size (float): Share of the whole dataset reserved for the test split.
        val_size (float): Share of the whole dataset reserved for the validation split.
        random_state (int): Seed forwarded to every `train_test_split` call.

    Returns:
        dict: {"train": (X, y), "val": (X, y), "test": (X, y)}, where X holds the
        texts cast to `str` and y the matching labels.
    """
    X = df["texts"].astype(str)
    y = df["labels"]

    # first cut: training data vs. everything held out (validation + test)
    holdout_size = test_size + val_size
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=holdout_size, stratify=y, random_state=random_state
    )

    if val_size == 0:
        # nothing requested for validation: the whole hold-out is the test split
        X_val, y_val = X_temp.iloc[:0], y_temp.iloc[:0]
        X_test, y_test = X_temp, y_temp
    elif test_size == 0:
        # nothing requested for testing: the whole hold-out is the validation split
        X_val, y_val = X_temp, y_temp
        X_test, y_test = X_temp.iloc[:0], y_temp.iloc[:0]
    else:
        # second cut: divide the hold-out in the requested val:test proportion
        # (a fixed 0.5 would only be right when val_size == test_size)
        test_share = test_size / holdout_size
        X_val, X_test, y_val, y_test = train_test_split(
            X_temp, y_temp, test_size=test_share, stratify=y_temp, random_state=random_state
        )

    return {
        "train": (X_train, y_train),
        "val": (X_val, y_val),
        "test": (X_test, y_test)
    }

In [7]:
# -----------------------
# Model building function
# -----------------------

def build_model(alpha=1.0, fit_prior=False, max_features=5000, ngram_range=(1, 2), stop_words="english"):
    """
    Builds a scikit-learn Pipeline with TF-IDF vectorization and Multinomial Naive Bayes classifier.

    Args:
        alpha (float): Smoothing parameter for Multinomial Naive Bayes.
        fit_prior (bool): Whether to learn class prior probabilities. The default
            here is False (scikit-learn then uses a uniform prior).
        max_features (int | None): Maximum vocabulary size kept by the TF-IDF vectorizer.
        ngram_range (tuple[int, int]): (min_n, max_n) n-gram sizes to extract;
            the default (1, 2) means unigrams + bigrams.
        stop_words (str | list | None): Stop-word setting forwarded to the vectorizer.

    Returns:
        Pipeline: A scikit-learn Pipeline object (TF-IDF + Multinomial Naive Bayes).
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer(      # TF-IDF vectorization inside pipeline
            max_features=max_features,  # limit to the top `max_features` features
            ngram_range=ngram_range,    # n-gram sizes to extract
            stop_words=stop_words       # stop words to remove
        )),
        ('clf', MultinomialNB(          # Multinomial Naive Bayes classifier
            alpha=alpha,
            fit_prior=fit_prior
        ))
    ])

In [8]:
# --------------------------------
# Training and evaluation function
# --------------------------------

def train_and_evaluate(model, X_train, y_train, X_val, y_val):
    """
    Fits the model on the training split and scores it on the validation split.

    Args:
        model (Pipeline): Estimator exposing `fit` and `predict`.
        X_train (array-like): Training features.
        y_train (array-like): Training labels.
        X_val (array-like): Validation features.
        y_val (array-like): Validation labels.

    Returns:
        float: Weighted F1-score of the fitted model on the validation split.
    """
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)
    # "weighted" averages the per-class F1 by class support (accounts for label imbalance)
    return f1_score(y_val, val_predictions, average="weighted")


In [9]:
# --------------------------------
# Fine-tuning on multiple datasets
# --------------------------------

datasets_df = {
    "Celebrity": dfCelebrity,
    "CIDII": dfCidii,
    "FaKES": dfFakes,
    "FakeVsSatire": dfFakeVsSatire,
    "Horne": dfHorne,
    "Infodemic": dfInfodemic,
    "ISOT": dfIsot,
    "Kaggle_clement": dfKaggleClement,
    "Kaggle_meg": dfKaggleMeg,
    "LIAR_PLUS": dfLiarPlus,
    "Politifact": dfPolitifact,
    "Unipi_NDF": dfNDF
}

datasets = {name: split_dataset(df) for name, df in datasets_df.items()} # split all datasets

# results[trained_on][tested_on] -> weighted F1 on the test split of `tested_on`
results = {}

# sequential training
# NOTE: `Pipeline.fit` re-learns the TF-IDF vocabulary and the Naive Bayes statistics
# from scratch on every call, so nothing is carried over from one phase to the next:
# each phase is an independent model trained on a single dataset. A fresh pipeline is
# built per phase to make that explicit (the fitted result is the same as re-fitting
# one shared pipeline). Genuine incremental training would need `partial_fit` and a
# vectorizer with a fixed feature space.
for i, (name, data) in enumerate(datasets.items()):
    print(f"\n=== Phase {i+1}: Training/Fine-tuning on {name} ===")

    X_train, y_train = data["train"]
    X_val, y_val = data["val"]
    X_test, y_test = data["test"]

    model = build_model() # fresh model for this phase
    model.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val])) # train on train + val

    y_pred = model.predict(X_test)
    print(f"Classification Report after {name}:")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix after {name}:")
    print(confusion_matrix(y_test, y_pred))
    print(f"\nWeighted F1-score after {name}:", f1_score(y_test, y_pred, average="weighted"))


    # evaluation on all datasets
    print("\n--- Evaluation on all datasets ---")
    results[name] = {}
    for test_name, test_data in datasets.items(): # for each dataset
        X_te, y_te = test_data["test"]
        # the in-domain test split was already predicted above; reuse it
        preds = y_pred if test_name == name else model.predict(X_te)
        f1 = f1_score(y_te, preds, average="weighted")
        results[name][test_name] = f1
        print(f"Evaluation on {test_name}: Weighted F1 = {f1:.4f}")



=== Phase 1: Training/Fine-tuning on Celebrity ===
Classification Report after Celebrity:
              precision    recall  f1-score   support

           0       0.60      0.76      0.67        50
           1       0.68      0.50      0.57        50

    accuracy                           0.63       100
   macro avg       0.64      0.63      0.62       100
weighted avg       0.64      0.63      0.62       100

Confusion Matrix after Celebrity:
[[38 12]
 [25 25]]

Weighted F1-score after Celebrity: 0.6236395076797884

--- Evaluation on all datasets ---
Evaluation on Celebrity: Weighted F1 = 0.6236
Evaluation on CIDII: Weighted F1 = 0.5914
Evaluation on FaKES: Weighted F1 = 0.4058
Evaluation on FakeVsSatire: Weighted F1 = 0.3488
Evaluation on Horne: Weighted F1 = 0.5320
Evaluation on Infodemic: Weighted F1 = 0.5376
Evaluation on ISOT: Weighted F1 = 0.3772
Evaluation on Kaggle_clement: Weighted F1 = 0.3633
Evaluation on Kaggle_meg: Weighted F1 = 0.8957
Evaluation on LIAR_PLUS: Weighte

In [10]:
# ---------------
# Results summary
# ---------------

# Replay the collected score matrix: one section per training dataset,
# one line per dataset whose test split was evaluated.
print("\n=== Results Summary ===")
for trained_on, scores_by_dataset in results.items():
    print(f"\nResults after training on {trained_on}:")
    report_lines = [
        f"  Test on {evaluated_on}: Weighted F1 = {score:.4f}"
        for evaluated_on, score in scores_by_dataset.items()
    ]
    for line in report_lines:
        print(line)


=== Results Summary ===

Results after training on Celebrity:
  Test on Celebrity: Weighted F1 = 0.6236
  Test on CIDII: Weighted F1 = 0.5914
  Test on FaKES: Weighted F1 = 0.4058
  Test on FakeVsSatire: Weighted F1 = 0.3488
  Test on Horne: Weighted F1 = 0.5320
  Test on Infodemic: Weighted F1 = 0.5376
  Test on ISOT: Weighted F1 = 0.3772
  Test on Kaggle_clement: Weighted F1 = 0.3633
  Test on Kaggle_meg: Weighted F1 = 0.8957
  Test on LIAR_PLUS: Weighted F1 = 0.5222
  Test on Politifact: Weighted F1 = 0.6100
  Test on Unipi_NDF: Weighted F1 = 0.5009

Results after training on CIDII:
  Test on Celebrity: Weighted F1 = 0.5333
  Test on CIDII: Weighted F1 = 0.9517
  Test on FaKES: Weighted F1 = 0.4577
  Test on FakeVsSatire: Weighted F1 = 0.4998
  Test on Horne: Weighted F1 = 0.6197
  Test on Infodemic: Weighted F1 = 0.4743
  Test on ISOT: Weighted F1 = 0.6918
  Test on Kaggle_clement: Weighted F1 = 0.6745
  Test on Kaggle_meg: Weighted F1 = 0.3854
  Test on LIAR_PLUS: Weighted F1 = 0