# **BERT**

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import optuna
import gc

  from .autonotebook import tqdm as notebook_tqdm
2025-10-22 15:48:37.435721: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-22 15:48:37.493556: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-22 15:48:38.771766: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [None]:
# ----------------
# Load dataset
# ----------------

path_file = "datasets/Unipi_NDF/df_ndf.csv"
df = pd.read_csv(path_file, sep="\t", encoding="utf-8")
df.head()

texts = df["texts"].astype(str).tolist()
labels = df["labels"].values

In [4]:
# ----------------
# Load dataset
# ----------------

path_file = "datasets/Unipi_NDF/df_ndf.csv"
df = pd.read_csv(path_file, sep="\t", encoding="utf-8")
df.head()

texts = df["texts"].astype(str).tolist()
labels = df["labels"].values

In [5]:
# ------------------------------
# Preprocessing and tokenization
# ------------------------------

# split train/validation/test (0.6 / 0.2 / 0.2): stratify to maintain label distribution
X_train, X_temp, y_train, y_temp = train_test_split(texts, labels, test_size=0.4, stratify=labels, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)
y_train = y_train.astype('int64')
y_val = y_val.astype('int64')
y_test = y_test.astype('int64')
print("Train labels:", set(y_train))
print("Val labels:", set(y_val))
print("Test labels:", set(y_test))


max_len = 128

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

def tokenize_texts(texts):
    """ Tokenize a list of texts using BERT tokenizer. """
    return tokenizer(
        texts, truncation=True, padding=True, max_length=max_len
    )

train_enc = tokenize_texts(X_train)
val_enc = tokenize_texts(X_val)
test_enc = tokenize_texts(X_test)

train_enc = Dataset.from_dict({
    'input_ids': train_enc['input_ids'],
    'attention_mask': train_enc['attention_mask'],
    'labels': y_train
})

val_enc = Dataset.from_dict({
    'input_ids': val_enc['input_ids'],
    'attention_mask': val_enc['attention_mask'],
    'labels': y_val
})

test_enc = Dataset.from_dict({
    'input_ids': test_enc['input_ids'],
    'attention_mask': test_enc['attention_mask'],
    'labels': y_test
})

Train labels: {0, 1}
Val labels: {0, 1}
Test labels: {0, 1}


In [6]:
# ----------------------------
# Evaluation metrics
# ----------------------------

def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    f1 = f1_score(pred.label_ids, preds)
    return {"f1": f1}

In [13]:
# ----------------------------
# Optuna objective function
# ----------------------------

def objectiveBERT(trial):
    learning_rate = trial.suggest_categorical("learning_rate", [4e-5, 2e-5, 3e-2])
    weight_decay = trial.suggest_categorical("weight_decay", [0.001, 0.01, 0.1, None])

    model = BertForSequenceClassification.from_pretrained(
            "bert-base-cased",
            num_labels=2
        )
    model.to(device)
    
    training_args = TrainingArguments(
        output_dir=f"./bert_fake_news_{trial.number}",
        eval_strategy="epoch", # Changed from evaluation_strategy
        save_strategy="epoch",
        learning_rate=learning_rate,
        weight_decay=weight_decay if weight_decay is not None else 0.0,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        logging_dir=f"./logs_bert_fake_news_{trial.number}",
        logging_strategy="epoch",
        save_total_limit=2,
        disable_tqdm=True,
        report_to="none",
        remove_unused_columns=False,
        fp16=False,                     # mixed precision training (meno memoria, più veloce)
        gradient_accumulation_steps=2, # riduce batch effettivo in GPU
        dataloader_num_workers=2,      # numero di worker per il caricamento dei dati
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_enc,
        eval_dataset=val_enc,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()
    metrics = trainer.evaluate()

    # Libera memoria GPU
    del trainer
    del model
    torch.cuda.empty_cache()
    gc.collect()

    return metrics["eval_f1"]

In [14]:
batch = {k: torch.tensor(v[:2]) for k,v in train_enc[:2].items()}
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
outputs = model(**batch)
print("Forward pass OK")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Forward pass OK


In [15]:
# ----------------------------
# Hyperparameter optimization
# ----------------------------

study = optuna.create_study(direction="maximize") # maximize F1-score
study.optimize(objectiveBERT, n_trials=5) # 5 trials for demonstration; increase for better results

print("Best parameters:", study.best_params)

[I 2025-10-22 15:49:41,903] A new study created in memory with name: no-name-ca733b52-b840-4860-8a5f-3e505e6b0e1a
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[W 2025-10-22 15:49:42,250] Trial 0 failed with parameters: {'learning_rate': 0.03, 'weight_decay': None} because of the following error: AcceleratorError('CUDA error: device-side assert triggered\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n').
Traceback (most recent call last):
  File "/home/n.emmolo/miniconda3/envs/env/lib/python3.10/site-packages/optuna/study/_optimize.py", line 201, in 

AcceleratorError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# ----------------------------
# Final training and evaluation
# ----------------------------

best_params = study.best_params
best_model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2
)
best_training_args = TrainingArguments(
    output_dir="./bert_fake_news_final",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=best_params["learning_rate"],
    weight_decay=best_params["weight_decay"] if best_params["weight_decay"] is not None else 0.0,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs_bert_fake_news_final",
    logging_strategy="epoch",
    save_total_limit=2,
    fp16=False,
    gradient_accumulation_steps=2,
    dataloader_num_workers=2,
)
best_trainer = Trainer(
    model=best_model,
    args=best_training_args,
    train_dataset=train_enc,
    eval_dataset=val_enc,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)
best_trainer.train()

y_pred = np.argmax(best_trainer.predict(test_enc).predictions, axis=1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Weighted F1-score:", f1_score(y_test, y_pred, average="weighted"))