# **BERT**

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # "0" o "1"

In [2]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [3]:
from utils import *

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from datasets import Dataset

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm
2025-10-28 17:09:58.971754: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-28 17:09:59.032039: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-28 17:10:00.294838: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [4]:
# ------------------------------
# Build model function
# ------------------------------

def build_model(learning_rate=2e-5, weight_decay=0.1):
    """
    Builds a BERT model (bert-base-cased) for sequence classification.

    Args:
        learning_rate (float): Learning rate for the optimizer.
        weight_decay (float): Weight decay for AdamW optimizer.

    Returns:
        model (BertForSequenceClassification): HuggingFace BERT model.
        tokenizer (BertTokenizer): Tokenizer associated with the model.
        train_args (TrainingArguments): Default training configuration.
    """
    
    model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

    train_args = TrainingArguments(
        output_dir="./bert_finetune_output",
        eval_strategy="epoch",
        save_strategy="no",
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        logging_dir="./logs",
        load_best_model_at_end=False,
        logging_steps=50,
        seed=42,
    )

    return model, tokenizer, train_args

In [None]:
# ----------------------
# Tokenization functions
# ----------------------

def tokenize_function(examples, tokenizer, max_len=128):
    """
    Tokenizes the input examples using the provided tokenizer.

    Args:
        examples (dict): A dictionary containing the text data to be tokenized.
        tokenizer (BertTokenizer): The tokenizer to use for tokenization.
        max_len (int): Maximum length for padding/truncation.

    Returns:
        dict: Tokenized inputs with padding and truncation applied.
    """

    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_len,
    )


def tokenize_datasets(datasets, tokenizer):
    """
    Tokenizes multiple datasets using the provided tokenizer.

    Args:
        datasets (dict): A dictionary where keys are dataset names and values are dictionaries with 'train', 'val', and 'test' splits.
        tokenizer (BertTokenizer): The tokenizer to use for tokenization.

    Returns:
        dict: A dictionary with the same keys as input datasets, but with tokenized datasets.
    """

    tokenized_datasets = {}
    for name, data in datasets.items():
        print(f"\n=== Tokenizing dataset: {name} ===")

        train_dataset = Dataset.from_dict({"text": data["train"][0], "label": data["train"][1].astype(int)})
        val_dataset = Dataset.from_dict({"text": data["val"][0], "label": data["val"][1].astype(int)})
        test_dataset = Dataset.from_dict({"text": data["test"][0], "label": data["test"][1].astype(int)})

        train_tokenized = train_dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)
        val_tokenized = val_dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)
        test_tokenized = test_dataset.map(lambda x: tokenize_function(x, tokenizer), batched=True)

        tokenized_datasets[name] = {
            "train": train_tokenized,
            "val": val_tokenized,
            "test": test_tokenized
        }
    return tokenized_datasets

In [6]:
dataset_df = data_loading() # load all datasets

for name, df in dataset_df.items():
    print(f"Name: {name}, Number of samples: {len(df)}")

#dataset_df = dict(list(dataset_df.items())[:3])

print("\nSplitting datasets into train/val/test...")
datasets = {name: split_dataset(df) for name, df in dataset_df.items()} # split all datasets in train/val/test

model, tokenizer, train_args = build_model()

print("\nComputing tokenized datasets...")
datasets =  tokenize_datasets(datasets, tokenizer) # tokenize all datasets


  dfKaggleMeg['date'] = pd.to_datetime(dfKaggleMeg['date'], errors='coerce') # convert date column to datetime, coerce errors to NaT


Name: Celebrity, Number of samples: 500
Name: CIDII, Number of samples: 722
Name: FaKES, Number of samples: 842
Name: FakeVsSatire, Number of samples: 486
Name: Horne, Number of samples: 326
Name: Infodemic, Number of samples: 10559
Name: ISOT, Number of samples: 44271
Name: Kaggle_clement, Number of samples: 39105
Name: Kaggle_meg, Number of samples: 12845
Name: LIAR_PLUS, Number of samples: 12784
Name: Politifact, Number of samples: 504
Name: Unipi_NDF, Number of samples: 554

Splitting datasets into train/val/test...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Computing tokenized datasets...


Map: 100%|██████████| 300/300 [00:01<00:00, 165.59 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 211.18 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 181.66 examples/s]
Map: 100%|██████████| 433/433 [00:00<00:00, 621.28 examples/s]
Map: 100%|██████████| 144/144 [00:00<00:00, 627.87 examples/s]
Map: 100%|██████████| 145/145 [00:00<00:00, 600.10 examples/s]
Map: 100%|██████████| 505/505 [00:02<00:00, 246.57 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 223.63 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 232.02 examples/s]
Map: 100%|██████████| 291/291 [00:01<00:00, 158.96 examples/s]
Map: 100%|██████████| 97/97 [00:00<00:00, 202.18 examples/s]
Map: 100%|██████████| 98/98 [00:00<00:00, 169.37 examples/s]
Map: 100%|██████████| 195/195 [00:01<00:00, 119.75 examples/s]
Map: 100%|██████████| 65/65 [00:00<00:00, 116.85 examples/s]
Map: 100%|██████████| 66/66 [00:00<00:00, 126.39 examples/s]
Map: 100%|██████████| 6335/6335 [00:03<00:00, 1790.29 examples/

In [7]:
# -------------------------------
# Fine-tuning on multiple datasets
# --------------------------------

model, tokenizer, train_args = build_model()

results = {}

for i, (name, data) in enumerate(datasets.items()):
    print(f"\n=== Phase {i+1}: Fine-tuning on {name} ===")

    X_train, y_train = data["train"]
    X_val, y_val = data["val"]
    X_test, y_test = data["test"]

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=X_train,
        eval_dataset=X_val,
    )

    # Fine-tune on train+val of current dataset
    trainer.train()

    # Evaluate on current dataset
    y_pred = trainer.predict(X_test)
    y_pred = np.argmax(y_pred.predictions, axis=1)

    print(f"\nClassification Report after {name}:")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix after {name}:")
    print(confusion_matrix(y_test, y_pred))
    print(f"Weighted F1-score after {name}: {f1_score(y_test, y_pred, average='weighted'):.4f}")

    # Evaluation on all datasets
    print("\n--- Evaluation on all datasets ---")
    results[name] = {}
    for test_name, test_data in datasets.items():
        X_te, y_te = test_data["test"]
        preds = trainer.predict(X_te)
        preds = np.argmax(preds.predictions, axis=1)
        f1 = f1_score(y_te, preds, average="weighted")
        results[name][test_name] = f1
        print(f"Evaluation on {test_name}: Weighted F1 = {f1:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Phase 1: Fine-tuning on Celebrity ===


ValueError: too many values to unpack (expected 2)