# **NAIVE BAYES**

In [1]:
import os

# Restrict this process to a single GPU index. Set before torch is imported
# in the next cell so the restriction is in place when CUDA is queried.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})  # "0" or "1"

In [2]:
import torch

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Using device:", device)

Using device: cuda


In [3]:
from utils import *

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, confusion_matrix

In [4]:
# -----------------------
# Model building function
# -----------------------

def build_model(alpha=1.0, fit_prior=False):
    """
    Create the text-classification pipeline used by the experiments in this notebook.

    The pipeline chains a TF-IDF vectorizer (step name 'tfidf') with a
    Multinomial Naive Bayes classifier (step name 'clf').

    Args:
        alpha (float): Smoothing parameter forwarded to MultinomialNB.
        fit_prior (bool): Forwarded to MultinomialNB; whether class prior
            probabilities are learned from the training data.

    Returns:
        Pipeline: An unfitted scikit-learn Pipeline (TF-IDF -> Multinomial Naive Bayes).
    """
    # TF-IDF vectorization is kept inside the pipeline so it is fitted together with the classifier.
    vectorizer = TfidfVectorizer(
        max_features=5000,       # keep at most the top 5000 features
        ngram_range=(1, 2),      # unigrams + bigrams
        stop_words="english",    # drop English stop words
    )

    # Multinomial Naive Bayes classifier configured from the function arguments.
    classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior)

    steps = [('tfidf', vectorizer), ('clf', classifier)]
    return Pipeline(steps)

## VERSION 1: Dataset (Simple)

In [5]:
# Load all datasets via the utils helper; the result is iterated as a
# {dataset name: frame} mapping (presumably pandas DataFrames - confirm in utils).
dataset_df = data_loading()

# Report the size of every dataset.
for name in dataset_df:
    df = dataset_df[name]
    print(f"Dataset: {name}, Number of samples: {len(df)}")

Dataset: Celebrity, Number of samples: 500
Dataset: CIDII, Number of samples: 722
Dataset: FaKES, Number of samples: 842
Dataset: FakeVsSatire, Number of samples: 486
Dataset: Horne, Number of samples: 326
Dataset: Infodemic, Number of samples: 10559
Dataset: ISOT, Number of samples: 44271
Dataset: Kaggle_clement, Number of samples: 39105
Dataset: Kaggle_meg, Number of samples: 12845
Dataset: LIAR_PLUS, Number of samples: 12784
Dataset: Politifact, Number of samples: 504
Dataset: Unipi_NDF, Number of samples: 554


  dfKaggleMeg['date'] = pd.to_datetime(dfKaggleMeg['date'], errors='coerce') # convert date column to datetime, coerce errors to NaT


In [6]:
# --------------------------------
# Fine-tuning on multiple datasets
# --------------------------------
# NOTE: Pipeline.fit() re-learns the TF-IDF vocabulary and the Naive Bayes
# parameters from scratch on every call. Each "phase" below therefore yields a
# model trained on the current dataset only; nothing is carried over from the
# previous phases, so this is per-dataset training rather than true fine-tuning.

datasets = {name: split_dataset(df) for name, df in dataset_df.items()} # split all datasets in train/val/test
model = build_model() # initialize model (re-fitted from scratch in every phase)

results = {} # results[training dataset][evaluated dataset] -> weighted F1

# sequential training
for i, (name, data) in enumerate(datasets.items()):
    print(f"\n=== Phase {i+1}: Training/Fine-tuning on {name} ===")

    X_train, y_train = data["train"]
    X_val, y_val = data["val"]
    X_test, y_test = data["test"]

    # train and val are merged: the validation split is not used for model selection here
    model.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))

    # zero_division=0 leaves the scores unchanged (the default "warn" already
    # scores an undefined precision/recall/F1 as 0) but silences the
    # UndefinedMetricWarning emitted when a class is never predicted.
    y_pred = model.predict(X_test)
    print(f"Classification Report after {name}:")
    print(classification_report(y_test, y_pred, zero_division=0))
    print(f"Confusion Matrix after {name}:")
    print(confusion_matrix(y_test, y_pred))
    print(f"\nWeighted F1-score after {name}:", f1_score(y_test, y_pred, average="weighted", zero_division=0))


    # evaluation on all datasets (the one just trained on is included)
    print("\n--- Evaluation on all datasets ---")
    results[name] = {}
    for test_name, test_data in datasets.items(): # for each dataset
        X_te, y_te = test_data["test"]
        preds = model.predict(X_te)
        f1 = f1_score(y_te, preds, average="weighted", zero_division=0)
        results[name][test_name] = f1
        print(f"Evaluation on {test_name}: Weighted F1 = {f1:.4f}")



=== Phase 1: Training/Fine-tuning on Celebrity ===
Classification Report after Celebrity:
              precision    recall  f1-score   support

           0       0.60      0.76      0.67        50
           1       0.68      0.50      0.57        50

    accuracy                           0.63       100
   macro avg       0.64      0.63      0.62       100
weighted avg       0.64      0.63      0.62       100

Confusion Matrix after Celebrity:
[[38 12]
 [25 25]]

Weighted F1-score after Celebrity: 0.6236395076797884

--- Evaluation on all datasets ---
Evaluation on Celebrity: Weighted F1 = 0.6236
Evaluation on CIDII: Weighted F1 = 0.5914
Evaluation on FaKES: Weighted F1 = 0.3956
Evaluation on FakeVsSatire: Weighted F1 = 0.3488
Evaluation on Horne: Weighted F1 = 0.5320
Evaluation on Infodemic: Weighted F1 = 0.5376
Evaluation on ISOT: Weighted F1 = 0.3772
Evaluation on Kaggle_clement: Weighted F1 = 0.4287
Evaluation on Kaggle_meg: Weighted F1 = 0.9009
Evaluation on LIAR_PLUS: Weighte

In [7]:
# ---------------
# Results summary
# ---------------
# Replays the stored scores: outer key = dataset the model was trained on,
# inner key = dataset it was evaluated on.

print("\n=== Results Summary ===")
for name in results:
    print(f"\nResults after training on {name}:")
    for test_name, f1 in results[name].items():
        print(f"  Test on {test_name}: Weighted F1 = {f1:.4f}")


=== Results Summary ===

Results after training on Celebrity:
  Test on Celebrity: Weighted F1 = 0.6236
  Test on CIDII: Weighted F1 = 0.5914
  Test on FaKES: Weighted F1 = 0.3956
  Test on FakeVsSatire: Weighted F1 = 0.3488
  Test on Horne: Weighted F1 = 0.5320
  Test on Infodemic: Weighted F1 = 0.5376
  Test on ISOT: Weighted F1 = 0.3772
  Test on Kaggle_clement: Weighted F1 = 0.4287
  Test on Kaggle_meg: Weighted F1 = 0.9009
  Test on LIAR_PLUS: Weighted F1 = 0.5222
  Test on Politifact: Weighted F1 = 0.6100
  Test on Unipi_NDF: Weighted F1 = 0.5009

Results after training on CIDII:
  Test on Celebrity: Weighted F1 = 0.5333
  Test on CIDII: Weighted F1 = 0.9517
  Test on FaKES: Weighted F1 = 0.4818
  Test on FakeVsSatire: Weighted F1 = 0.4998
  Test on Horne: Weighted F1 = 0.6197
  Test on Infodemic: Weighted F1 = 0.4743
  Test on ISOT: Weighted F1 = 0.6918
  Test on Kaggle_clement: Weighted F1 = 0.6757
  Test on Kaggle_meg: Weighted F1 = 0.3757
  Test on LIAR_PLUS: Weighted F1 = 0

## VERSION 2: Dataset by Topic

In [8]:
# Load the data grouped by topic via the utils helper; the result is iterated as a
# {topic: frame} mapping (presumably pandas DataFrames - confirm in utils).
dataset_df = data_by_topic()

# Report the size of every topic group.
for topic in dataset_df:
    df = dataset_df[topic]
    print(f"Topic: {topic}, Number of samples: {len(df)}")

  dfKaggleMeg['date'] = pd.to_datetime(dfKaggleMeg['date'], errors='coerce') # convert date column to datetime, coerce errors to NaT


Topic: politics, Number of samples: 97476
Topic: general, Number of samples: 12845
Topic: covid, Number of samples: 10559
Topic: syria, Number of samples: 842
Topic: islam, Number of samples: 722
Topic: notredame, Number of samples: 554
Topic: gossip, Number of samples: 500


In [9]:
# -------------------------------
# Fine-tuning on Dataset by Topic
# -------------------------------
# NOTE: Pipeline.fit() re-learns the TF-IDF vocabulary and the Naive Bayes
# parameters from scratch on every call. Each "phase" below therefore yields a
# model trained on the current topic only; nothing is carried over from the
# previous phases, so this is per-topic training rather than true fine-tuning.

datasets = {topic: split_dataset(df) for topic, df in dataset_df.items()} # split all datasets in train/val/test
model = build_model() # initialize model (re-fitted from scratch in every phase)

results = {} # results[training topic][evaluated topic] -> weighted F1

# sequential training
for i, (topic, data) in enumerate(datasets.items()):
    print(f"\n=== Phase {i+1}: Training/Fine-tuning on topic: {topic} ===")

    X_train, y_train = data["train"]
    X_val, y_val = data["val"]
    X_test, y_test = data["test"]

    # train and val are merged: the validation split is not used for model selection here
    model.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))

    # zero_division=0 leaves the scores unchanged (the default "warn" already
    # scores an undefined precision/recall/F1 as 0) but silences the
    # UndefinedMetricWarning emitted when a class is never predicted.
    y_pred = model.predict(X_test)
    print(f"Classification Report after topic {topic}:")
    print(classification_report(y_test, y_pred, zero_division=0))
    print(f"Confusion Matrix after topic {topic}:")
    print(confusion_matrix(y_test, y_pred))
    print(f"\nWeighted F1-score after topic {topic}:", f1_score(y_test, y_pred, average="weighted", zero_division=0))


    # evaluation on all topics (the one just trained on is included)
    print("\n--- Evaluation on all topics ---")
    results[topic] = {}
    for test_topic, test_data in datasets.items(): # for each topic
        X_te, y_te = test_data["test"]
        preds = model.predict(X_te)
        f1 = f1_score(y_te, preds, average="weighted", zero_division=0)
        results[topic][test_topic] = f1
        print(f"Evaluation on topic {test_topic}: Weighted F1 = {f1:.4f}")


=== Phase 1: Training/Fine-tuning on topic: politics ===
Classification Report after topic politics:
              precision    recall  f1-score   support

         0.0       0.91      0.89      0.90     10094
         1.0       0.88      0.90      0.89      9402

    accuracy                           0.90     19496
   macro avg       0.90      0.90      0.90     19496
weighted avg       0.90      0.90      0.90     19496

Confusion Matrix after topic politics:
[[8993 1101]
 [ 939 8463]]

Weighted F1-score after topic politics: 0.8953868054693163

--- Evaluation on all topics ---
Evaluation on topic politics: Weighted F1 = 0.8954
Evaluation on topic general: Weighted F1 = 0.3275
Evaluation on topic covid: Weighted F1 = 0.4314
Evaluation on topic syria: Weighted F1 = 0.3606
Evaluation on topic islam: Weighted F1 = 0.3781
Evaluation on topic notredame: Weighted F1 = 0.3041
Evaluation on topic gossip: Weighted F1 = 0.3763

=== Phase 2: Training/Fine-tuning on topic: general ===
Classifi

In [10]:
# ---------------
# Results summary
# ---------------
# Replays the stored scores: outer key = topic the model was trained on,
# inner key = topic it was evaluated on.

print("\n=== Results Summary ===")
for topic in results:
    print(f"\nResults after training on topic {topic}:")
    for test_topic, f1 in results[topic].items():
        print(f"  Test on topic {test_topic}: Weighted F1 = {f1:.4f}")


=== Results Summary ===

Results after training on topic politics:
  Test on topic politics: Weighted F1 = 0.8954
  Test on topic general: Weighted F1 = 0.3275
  Test on topic covid: Weighted F1 = 0.4314
  Test on topic syria: Weighted F1 = 0.3606
  Test on topic islam: Weighted F1 = 0.3781
  Test on topic notredame: Weighted F1 = 0.3041
  Test on topic gossip: Weighted F1 = 0.3763

Results after training on topic general:
  Test on topic politics: Weighted F1 = 0.3831
  Test on topic general: Weighted F1 = 0.9429
  Test on topic covid: Weighted F1 = 0.5041
  Test on topic syria: Weighted F1 = 0.3736
  Test on topic islam: Weighted F1 = 0.4674
  Test on topic notredame: Weighted F1 = 0.4376
  Test on topic gossip: Weighted F1 = 0.4323

Results after training on topic covid:
  Test on topic politics: Weighted F1 = 0.3432
  Test on topic general: Weighted F1 = 0.0842
  Test on topic covid: Weighted F1 = 0.9219
  Test on topic syria: Weighted F1 = 0.3042
  Test on topic islam: Weighted F

## VERSION 3: Dataset by Date

In [11]:
# Load the data grouped by publication period via the utils helper; the result is
# iterated as a {date label: frame} mapping (presumably pandas DataFrames - confirm in utils).
dataset_df = data_by_date()

# Report the size of every date group.
for date in dataset_df:
    df = dataset_df[date]
    print(f"Date: {date}, Number of samples: {len(df)}")

  dfKaggleMeg['date'] = pd.to_datetime(dfKaggleMeg['date'], errors='coerce') # convert date column to datetime, coerce errors to NaT


Date: 2011-2013, Number of samples: 55
Date: 2014, Number of samples: 114
Date: 2015, Number of samples: 84
Date: 2016, Number of samples: 49687
Date: 2017, Number of samples: 16657
Date: 2020, Number of samples: 10559


In [12]:
# ------------------------------
# Fine-tuning on Dataset by Date
# ------------------------------
# NOTE: Pipeline.fit() re-learns the TF-IDF vocabulary and the Naive Bayes
# parameters from scratch on every call. Each "phase" below therefore yields a
# model trained on the current date group only; nothing is carried over from the
# previous phases, so this is per-period training rather than true fine-tuning.

# dataset_df is the by-date mapping loaded by data_by_date() in the preceding cell;
# it is reused here instead of being loaded a second time (same pattern as the
# per-dataset and per-topic experiments).
datasets = {date: split_dataset(df) for date, df in dataset_df.items()} # split all datasets in train/val/test
model = build_model() # initialize model (re-fitted from scratch in every phase)

results = {} # results[training date][evaluated date] -> weighted F1

# sequential training
for i, (date, data) in enumerate(datasets.items()):
    print(f"\n=== Phase {i+1}: Training/Fine-tuning on date: {date} ===")

    X_train, y_train = data["train"]
    X_val, y_val = data["val"]
    X_test, y_test = data["test"]

    # train and val are merged: the validation split is not used for model selection here
    model.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))

    # Some date groups are tiny, so a class may never be predicted on their test
    # split. zero_division=0 leaves the scores unchanged (the default "warn"
    # already scores an undefined precision/recall/F1 as 0) but silences the
    # resulting UndefinedMetricWarning.
    y_pred = model.predict(X_test)
    print(f"Classification Report after date {date}:")
    print(classification_report(y_test, y_pred, zero_division=0))
    print(f"Confusion Matrix after date {date}:")
    print(confusion_matrix(y_test, y_pred))
    print(f"\nWeighted F1-score after date {date}:", f1_score(y_test, y_pred, average="weighted", zero_division=0))


    # evaluation on all dates (the one just trained on is included)
    print("\n--- Evaluation on all dates ---")
    results[date] = {}
    for test_date, test_data in datasets.items(): # for each date
        X_te, y_te = test_data["test"]
        preds = model.predict(X_te)
        f1 = f1_score(y_te, preds, average="weighted", zero_division=0)
        results[date][test_date] = f1
        print(f"Evaluation on date {test_date}: Weighted F1 = {f1:.4f}")

  dfKaggleMeg['date'] = pd.to_datetime(dfKaggleMeg['date'], errors='coerce') # convert date column to datetime, coerce errors to NaT



=== Phase 1: Training/Fine-tuning on date: 2011-2013 ===
Classification Report after date 2011-2013:
              precision    recall  f1-score   support

         0.0       0.33      0.20      0.25         5
         1.0       0.50      0.67      0.57         6

    accuracy                           0.45        11
   macro avg       0.42      0.43      0.41        11
weighted avg       0.42      0.45      0.43        11

Confusion Matrix after date 2011-2013:
[[1 4]
 [2 4]]

Weighted F1-score after date 2011-2013: 0.4253246753246753

--- Evaluation on all dates ---
Evaluation on date 2011-2013: Weighted F1 = 0.4253
Evaluation on date 2014: Weighted F1 = 0.3843
Evaluation on date 2015: Weighted F1 = 0.6997
Evaluation on date 2016: Weighted F1 = 0.4553
Evaluation on date 2017: Weighted F1 = 0.5166
Evaluation on date 2020: Weighted F1 = 0.4124

=== Phase 2: Training/Fine-tuning on date: 2014 ===
Classification Report after date 2014:
              precision    recall  f1-score   suppo

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Evaluation on date 2016: Weighted F1 = 0.3672
Evaluation on date 2017: Weighted F1 = 0.7153
Evaluation on date 2020: Weighted F1 = 0.4709

=== Phase 4: Training/Fine-tuning on date: 2016 ===
Classification Report after date 2016:
              precision    recall  f1-score   support

         0.0       0.95      0.96      0.95      5306
         1.0       0.95      0.94      0.94      4632

    accuracy                           0.95      9938
   macro avg       0.95      0.95      0.95      9938
weighted avg       0.95      0.95      0.95      9938

Confusion Matrix after date 2016:
[[5094  212]
 [ 295 4337]]

Weighted F1-score after date 2016: 0.9489510542724255

--- Evaluation on all dates ---
Evaluation on date 2011-2013: Weighted F1 = 0.2841
Evaluation on date 2014: Weighted F1 = 0.3578
Evaluation on date 2015: Weighted F1 = 0.3012
Evaluation on date 2016: Weighted F1 = 0.9490
Evaluation on date 2017: Weighted F1 = 0.9825
Evaluation on date 2020: Weighted F1 = 0.3803

=== Phase 5:

In [13]:
# ---------------
# Results summary
# ---------------
# Replays the stored scores: outer key = date group the model was trained on,
# inner key = date group it was evaluated on.

print("\n=== Results Summary ===")
for date in results:
    print(f"\nResults after training on date {date}:")
    for test_date, f1 in results[date].items():
        print(f"  Test on date {test_date}: Weighted F1 = {f1:.4f}")


=== Results Summary ===

Results after training on date 2011-2013:
  Test on date 2011-2013: Weighted F1 = 0.4253
  Test on date 2014: Weighted F1 = 0.3843
  Test on date 2015: Weighted F1 = 0.6997
  Test on date 2016: Weighted F1 = 0.4553
  Test on date 2017: Weighted F1 = 0.5166
  Test on date 2020: Weighted F1 = 0.4124

Results after training on date 2014:
  Test on date 2011-2013: Weighted F1 = 0.6364
  Test on date 2014: Weighted F1 = 0.4472
  Test on date 2015: Weighted F1 = 0.5394
  Test on date 2016: Weighted F1 = 0.3605
  Test on date 2017: Weighted F1 = 0.4053
  Test on date 2020: Weighted F1 = 0.4528

Results after training on date 2015:
  Test on date 2011-2013: Weighted F1 = 0.3636
  Test on date 2014: Weighted F1 = 0.3473
  Test on date 2015: Weighted F1 = 0.3665
  Test on date 2016: Weighted F1 = 0.3672
  Test on date 2017: Weighted F1 = 0.7153
  Test on date 2020: Weighted F1 = 0.4709

Results after training on date 2016:
  Test on date 2011-2013: Weighted F1 = 0.2841
