# **BERT**

In [1]:
import os
# Select which GPU index is visible to this process. This assignment runs
# before `import torch` in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # "0" or "1"

In [2]:
import torch

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [3]:
from utils import *

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from datasets import Dataset

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm
2025-10-28 18:45:00.979102: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-28 18:45:01.041617: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-28 18:45:02.354319: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [4]:
# --------------------
# Build model function
# --------------------

def build_model(learning_rate=2e-5, weight_decay=0.1):
    """
    Create the model, tokenizer and training configuration used for fine-tuning.

    The classifier and the tokenizer are both loaded from the
    "bert-base-cased" checkpoint; the classifier is created with two labels.

    Args:
        learning_rate (float): Value forwarded to TrainingArguments.learning_rate.
        weight_decay (float): Value forwarded to TrainingArguments.weight_decay.

    Returns:
        model (BertForSequenceClassification): Pretrained BERT with a 2-label head.
        tokenizer (BertTokenizer): Tokenizer loaded from the same checkpoint.
        train_args (TrainingArguments): Training configuration (3 epochs,
            batch size 8, evaluation every epoch, no checkpoint saving, seed 42).
    """

    checkpoint = "bert-base-cased"

    # Load the classifier first, then the matching tokenizer.
    model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    tokenizer = BertTokenizer.from_pretrained(checkpoint)

    # Collect every training option in one place before building the config.
    training_options = dict(
        output_dir="./bert_finetune_output",
        eval_strategy="epoch",
        save_strategy="no",
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        logging_dir="./logs",
        load_best_model_at_end=False,
        logging_steps=50,
        seed=42,
    )
    train_args = TrainingArguments(**training_options)

    return model, tokenizer, train_args

In [5]:
# ----------------------
# Tokenization functions
# ----------------------

def tokenize_function(examples, tokenizer, max_len=128):
    """
    Run the tokenizer over the "text" entry of a batch of examples.

    Args:
        examples (dict): Batch holding the raw texts under the key "text".
        tokenizer (BertTokenizer): Callable tokenizer applied to the texts.
        max_len (int): Length every sequence is padded or truncated to.

    Returns:
        dict: Whatever the tokenizer returns for the texts, with padding to
        `max_len` and truncation enabled.
    """

    texts = examples["text"]

    # Fixed-length encoding: pad short texts and cut long ones at max_len.
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_len,
    }
    return tokenizer(texts, **encoding_options)


def tokenize_datasets(datasets, tokenizer, max_len=128):
    """
    Tokenizes the train/val/test splits of several datasets.

    Args:
        datasets (dict): Maps a dataset name to a dictionary with the keys
            'train', 'val' and 'test'. Each split is indexed as
            split[0] -> texts and split[1] -> labels (the labels must
            support `.astype(int)`).
        tokenizer (BertTokenizer): The tokenizer to use for tokenization.
        max_len (int): Maximum length for padding/truncation, forwarded to
            `tokenize_function`. Defaults to 128, the value that was
            previously applied implicitly.

    Returns:
        dict: Same keys as `datasets`. Each value maps 'train', 'val' and
        'test' to a tuple (tokenized Dataset, numpy array of integer labels).
    """

    tokenized_datasets = {}

    for name, data in datasets.items():
        print(f"\n=== Tokenizing dataset: {name} ===")

        tokenized_splits = {}

        # Same processing for every split, in the order train -> val -> test.
        for split in ("train", "val", "test"):
            raw_split = Dataset.from_dict(
                {"text": data[split][0], "label": data[split][1].astype(int)}
            )
            tokenized_split = raw_split.map(
                lambda batch: tokenize_function(batch, tokenizer, max_len=max_len),
                batched=True,
            )
            # Keep the labels next to the tokenized data as a plain array so
            # callers can feed them directly to the scikit-learn metrics.
            tokenized_splits[split] = (tokenized_split, np.array(raw_split["label"]))

        tokenized_datasets[name] = tokenized_splits

    return tokenized_datasets

## VERSION 1: Dataset (Simple)

In [6]:
# Load every dataset and report how many samples each one holds.
dataset_df = data_loading()
for dataset_name in dataset_df:
    print(f"Name: {dataset_name}, Number of samples: {len(dataset_df[dataset_name])}")

# Keep only the first five datasets (in loading order) for this experiment.
kept_names = list(dataset_df)[:5]
dataset_df = {dataset_name: dataset_df[dataset_name] for dataset_name in kept_names}

# Split each remaining dataset into train/val/test.
print("\nSplitting datasets into train/val/test...")
datasets = {}
for dataset_name, frame in dataset_df.items():
    datasets[dataset_name] = split_dataset(frame)

# Fresh pretrained model, tokenizer and training configuration for this version.
model, tokenizer, train_args = build_model()

# Replace the raw splits with their tokenized counterparts.
print("\nComputing tokenized datasets...")
datasets = tokenize_datasets(datasets, tokenizer)

  dfKaggleMeg['date'] = pd.to_datetime(dfKaggleMeg['date'], errors='coerce') # convert date column to datetime, coerce errors to NaT


Name: Celebrity, Number of samples: 500
Name: CIDII, Number of samples: 722
Name: FaKES, Number of samples: 842
Name: FakeVsSatire, Number of samples: 486
Name: Horne, Number of samples: 326
Name: Infodemic, Number of samples: 10559
Name: ISOT, Number of samples: 44271
Name: Kaggle_clement, Number of samples: 39105
Name: Kaggle_meg, Number of samples: 12845
Name: LIAR_PLUS, Number of samples: 12784
Name: Politifact, Number of samples: 504
Name: Unipi_NDF, Number of samples: 554

Splitting datasets into train/val/test...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Computing tokenized datasets...

=== Tokenizing dataset: Celebrity ===


Map: 100%|██████████| 300/300 [00:01<00:00, 157.34 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 200.41 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 171.79 examples/s]



=== Tokenizing dataset: CIDII ===


Map: 100%|██████████| 433/433 [00:00<00:00, 592.02 examples/s]
Map: 100%|██████████| 144/144 [00:00<00:00, 594.65 examples/s]
Map: 100%|██████████| 145/145 [00:00<00:00, 565.20 examples/s]



=== Tokenizing dataset: FaKES ===


Map: 100%|██████████| 505/505 [00:02<00:00, 234.25 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 212.87 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 221.28 examples/s]



=== Tokenizing dataset: FakeVsSatire ===


Map: 100%|██████████| 291/291 [00:01<00:00, 152.04 examples/s]
Map: 100%|██████████| 97/97 [00:00<00:00, 185.75 examples/s]
Map: 100%|██████████| 98/98 [00:00<00:00, 159.95 examples/s]



=== Tokenizing dataset: Horne ===


Map: 100%|██████████| 195/195 [00:01<00:00, 113.67 examples/s]
Map: 100%|██████████| 65/65 [00:00<00:00, 110.88 examples/s]
Map: 100%|██████████| 66/66 [00:00<00:00, 120.68 examples/s]


In [7]:
# --------------------------------
# Fine-tuning on multiple datasets
# --------------------------------

# results[trained_on][tested_on] -> weighted F1 on the test split of
# `tested_on`, measured right after the phase that fine-tuned on `trained_on`.
results = {}

# The same `model` object is handed to every Trainer, so each phase starts
# from the weights left by the previous phase (sequential fine-tuning).
for i, (name, data) in enumerate(datasets.items()):
    print(f"\n=== Phase {i+1}: Fine-tuning on {name} ===")

    # Each split is a (tokenized dataset, labels) pair; the labels of the
    # train and val splits are already stored inside the tokenized datasets,
    # so only the test labels are needed separately for the metrics.
    X_train = data["train"][0]
    X_val = data["val"][0]
    X_test, y_test = data["test"]

    # define trainer
    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=X_train,
        eval_dataset=X_val,
    )

    # fine-tune on the train split only; the val split is used for evaluation
    trainer.train()

    # evaluate on current dataset
    test_output = trainer.predict(X_test)
    y_pred = np.argmax(test_output.predictions, axis=1)
    # zero_division=0 keeps the scores scikit-learn would report anyway but
    # avoids the UndefinedMetricWarning raised when a class is never predicted.
    current_f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    print(f"\nClassification Report after {name}:")
    print(classification_report(y_test, y_pred, zero_division=0))
    print(f"Confusion Matrix after {name}:")
    print(confusion_matrix(y_test, y_pred))
    print(f"Weighted F1-score after {name}: {current_f1:.4f}")

    # evaluate on all datasets
    print("\n--- Evaluation on all datasets ---")
    results[name] = {}
    for test_name, test_data in datasets.items():
        X_te, y_te = test_data["test"]
        test_output = trainer.predict(X_te)
        preds = np.argmax(test_output.predictions, axis=1)
        f1 = f1_score(y_te, preds, average="weighted", zero_division=0)
        results[name][test_name] = f1
        print(f"Evaluation on {test_name}: Weighted F1 = {f1:.4f}")



=== Phase 1: Fine-tuning on Celebrity ===


Epoch,Training Loss,Validation Loss
1,No log,0.629855
2,0.666700,0.480792
3,0.394700,0.485651



Classification Report after Celebrity:
              precision    recall  f1-score   support

           0       0.80      0.70      0.74        50
           1       0.73      0.82      0.77        50

    accuracy                           0.76       100
   macro avg       0.76      0.76      0.76       100
weighted avg       0.76      0.76      0.76       100

Confusion Matrix after Celebrity:
[[35 15]
 [ 9 41]]
Weighted F1-score after Celebrity: 0.7591

--- Evaluation on all datasets ---


Evaluation on Celebrity: Weighted F1 = 0.7591


Evaluation on CIDII: Weighted F1 = 0.7642


Evaluation on FaKES: Weighted F1 = 0.3764


Evaluation on FakeVsSatire: Weighted F1 = 0.5944


Evaluation on Horne: Weighted F1 = 0.6111

=== Phase 2: Fine-tuning on CIDII ===


Epoch,Training Loss,Validation Loss
1,0.3355,0.176794
2,0.1054,0.150118
3,0.0168,0.15315



Classification Report after CIDII:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96        85
           1       0.95      0.93      0.94        60

    accuracy                           0.95       145
   macro avg       0.95      0.95      0.95       145
weighted avg       0.95      0.95      0.95       145

Confusion Matrix after CIDII:
[[82  3]
 [ 4 56]]
Weighted F1-score after CIDII: 0.9517

--- Evaluation on all datasets ---


Evaluation on Celebrity: Weighted F1 = 0.4415


Evaluation on CIDII: Weighted F1 = 0.9517


Evaluation on FaKES: Weighted F1 = 0.2990


Evaluation on FakeVsSatire: Weighted F1 = 0.5135


Evaluation on Horne: Weighted F1 = 0.3276

=== Phase 3: Fine-tuning on FaKES ===


Epoch,Training Loss,Validation Loss
1,0.8365,0.694867
2,0.7011,0.73085
3,0.6647,0.724165



Classification Report after FaKES:
              precision    recall  f1-score   support

           0       0.53      0.71      0.61        89
           1       0.49      0.31      0.38        80

    accuracy                           0.52       169
   macro avg       0.51      0.51      0.50       169
weighted avg       0.51      0.52      0.50       169

Confusion Matrix after FaKES:
[[63 26]
 [55 25]]
Weighted F1-score after FaKES: 0.5012

--- Evaluation on all datasets ---


Evaluation on Celebrity: Weighted F1 = 0.6578


Evaluation on CIDII: Weighted F1 = 0.7827


Evaluation on FaKES: Weighted F1 = 0.5012


Evaluation on FakeVsSatire: Weighted F1 = 0.6552


Evaluation on Horne: Weighted F1 = 0.5690

=== Phase 4: Fine-tuning on FakeVsSatire ===


Epoch,Training Loss,Validation Loss
1,No log,0.547658
2,0.632600,0.603779
3,0.249900,0.619124



Classification Report after FakeVsSatire:
              precision    recall  f1-score   support

           0       0.90      0.46      0.61        41
           1       0.71      0.96      0.82        57

    accuracy                           0.76        98
   macro avg       0.81      0.71      0.72        98
weighted avg       0.79      0.76      0.73        98

Confusion Matrix after FakeVsSatire:
[[19 22]
 [ 2 55]]
Weighted F1-score after FakeVsSatire: 0.7339

--- Evaluation on all datasets ---


Evaluation on Celebrity: Weighted F1 = 0.5685


Evaluation on CIDII: Weighted F1 = 0.9517


Evaluation on FaKES: Weighted F1 = 0.4075


Evaluation on FakeVsSatire: Weighted F1 = 0.7339


Evaluation on Horne: Weighted F1 = 0.5662

=== Phase 5: Fine-tuning on Horne ===


Epoch,Training Loss,Validation Loss
1,No log,0.524387
2,0.476000,0.532743
3,0.476000,0.53142



Classification Report after Horne:
              precision    recall  f1-score   support

           0       0.76      0.85      0.80        41
           1       0.70      0.56      0.62        25

    accuracy                           0.74        66
   macro avg       0.73      0.71      0.71        66
weighted avg       0.74      0.74      0.74        66

Confusion Matrix after Horne:
[[35  6]
 [11 14]]
Weighted F1-score after Horne: 0.7355

--- Evaluation on all datasets ---


Evaluation on Celebrity: Weighted F1 = 0.7106


Evaluation on CIDII: Weighted F1 = 0.9301


Evaluation on FaKES: Weighted F1 = 0.3795


Evaluation on FakeVsSatire: Weighted F1 = 0.8262


Evaluation on Horne: Weighted F1 = 0.7355


In [8]:
# ---------------
# Results summary
# ---------------

# Replay the collected scores: one section per training phase, one line per
# evaluated dataset.
print("\n=== Results Summary ===")
for trained_on in results:
    print(f"\nResults after training on {trained_on}:")
    for evaluated_on, score in results[trained_on].items():
        print(f"  Test on {evaluated_on}: Weighted F1 = {score:.4f}")


=== Results Summary ===

Results after training on Celebrity:
  Test on Celebrity: Weighted F1 = 0.7591
  Test on CIDII: Weighted F1 = 0.7642
  Test on FaKES: Weighted F1 = 0.3764
  Test on FakeVsSatire: Weighted F1 = 0.5944
  Test on Horne: Weighted F1 = 0.6111

Results after training on CIDII:
  Test on Celebrity: Weighted F1 = 0.4415
  Test on CIDII: Weighted F1 = 0.9517
  Test on FaKES: Weighted F1 = 0.2990
  Test on FakeVsSatire: Weighted F1 = 0.5135
  Test on Horne: Weighted F1 = 0.3276

Results after training on FaKES:
  Test on Celebrity: Weighted F1 = 0.6578
  Test on CIDII: Weighted F1 = 0.7827
  Test on FaKES: Weighted F1 = 0.5012
  Test on FakeVsSatire: Weighted F1 = 0.6552
  Test on Horne: Weighted F1 = 0.5690

Results after training on FakeVsSatire:
  Test on Celebrity: Weighted F1 = 0.5685
  Test on CIDII: Weighted F1 = 0.9517
  Test on FaKES: Weighted F1 = 0.4075
  Test on FakeVsSatire: Weighted F1 = 0.7339
  Test on Horne: Weighted F1 = 0.5662

Results after training 

## VERSION 2: Dataset by Topic

In [9]:
# Load the data grouped by topic and report the size of every group.
dataset_df = data_by_topic()
for topic_name in dataset_df:
    print(f"Topic: {topic_name}, Number of samples: {len(dataset_df[topic_name])}")

# Drop the first three topics to reduce computation time (in the recorded
# run these are the three largest groups).
remaining_topics = list(dataset_df)[3:]
dataset_df = {topic_name: dataset_df[topic_name] for topic_name in remaining_topics}

# Split each remaining topic into train/val/test.
print("\nSplitting datasets into train/val/test...")
datasets = {}
for topic_name, frame in dataset_df.items():
    datasets[topic_name] = split_dataset(frame)

# Fresh pretrained model, tokenizer and training configuration for this version.
model, tokenizer, train_args = build_model()

# Replace the raw splits with their tokenized counterparts.
print("\nComputing tokenized datasets...")
datasets = tokenize_datasets(datasets, tokenizer)

  dfKaggleMeg['date'] = pd.to_datetime(dfKaggleMeg['date'], errors='coerce') # convert date column to datetime, coerce errors to NaT


Topic: politics, Number of samples: 97476
Topic: general, Number of samples: 12845
Topic: covid, Number of samples: 10559
Topic: syria, Number of samples: 842
Topic: islam, Number of samples: 722
Topic: notredame, Number of samples: 554
Topic: gossip, Number of samples: 500

Splitting datasets into train/val/test...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Computing tokenized datasets...

=== Tokenizing dataset: syria ===


Map: 100%|██████████| 505/505 [00:02<00:00, 235.47 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 214.02 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 220.66 examples/s]



=== Tokenizing dataset: islam ===


Map: 100%|██████████| 433/433 [00:00<00:00, 608.58 examples/s]
Map: 100%|██████████| 144/144 [00:00<00:00, 602.81 examples/s]
Map: 100%|██████████| 145/145 [00:00<00:00, 574.59 examples/s]



=== Tokenizing dataset: notredame ===


Map: 100%|██████████| 332/332 [00:00<00:00, 872.07 examples/s]
Map: 100%|██████████| 111/111 [00:00<00:00, 893.67 examples/s]
Map: 100%|██████████| 111/111 [00:00<00:00, 832.93 examples/s]



=== Tokenizing dataset: gossip ===


Map: 100%|██████████| 300/300 [00:01<00:00, 161.69 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 210.31 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 178.26 examples/s]


In [10]:
# -------------------------------
# Fine-tuning on Dataset by Topic
# -------------------------------

# results[trained_on][tested_on] -> weighted F1 on the test split of topic
# `tested_on`, measured right after the phase that fine-tuned on `trained_on`.
results = {}

# sequential training: the same `model` object is handed to every Trainer, so
# each phase starts from the weights left by the previous phase
for i, (topic, data) in enumerate(datasets.items()):
    print(f"\n=== Phase {i+1}: Training/Fine-tuning on topic: {topic} ===")

    # Each split is a (tokenized dataset, labels) pair; the labels of the
    # train and val splits are already stored inside the tokenized datasets,
    # so only the test labels are needed separately for the metrics.
    X_train = data["train"][0]
    X_val = data["val"][0]
    X_test, y_test = data["test"]

    # define trainer
    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=X_train,
        eval_dataset=X_val,
    )

    # fine-tune on the train split only; the val split is used for evaluation
    trainer.train()

    # evaluate on current dataset
    test_output = trainer.predict(X_test)
    y_pred = np.argmax(test_output.predictions, axis=1)
    # zero_division=0 keeps the scores scikit-learn would report anyway but
    # avoids the UndefinedMetricWarning raised when a class is never predicted.
    current_f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    print(f"\nClassification Report after topic {topic}:")
    print(classification_report(y_test, y_pred, zero_division=0))
    print(f"Confusion Matrix after topic {topic}:")
    print(confusion_matrix(y_test, y_pred))
    # formatted to 4 decimals like every other F1 value printed in this notebook
    print(f"Weighted F1-score after topic {topic}: {current_f1:.4f}")

    # evaluate on all datasets
    print("\n--- Evaluation on all datasets ---")
    results[topic] = {}
    for test_topic, test_data in datasets.items(): # for each topic
        X_te, y_te = test_data["test"]
        test_output = trainer.predict(X_te)
        preds = np.argmax(test_output.predictions, axis=1)
        f1 = f1_score(y_te, preds, average="weighted", zero_division=0)
        results[topic][test_topic] = f1
        print(f"Evaluation on topic {test_topic}: Weighted F1 = {f1:.4f}")


=== Phase 1: Training/Fine-tuning on topic: syria ===


Epoch,Training Loss,Validation Loss
1,0.6977,0.691831
2,0.7067,0.69135
3,0.7069,0.690393



Classification Report after topic syria:
              precision    recall  f1-score   support

           0       0.52      0.97      0.68        89
           1       0.40      0.03      0.05        80

    accuracy                           0.52       169
   macro avg       0.46      0.50      0.36       169
weighted avg       0.47      0.52      0.38       169

Confusion Matrix after topic syria:
[[86  3]
 [78  2]]
Weighted F1-score after topic syria: 0.38029961382312355

--- Evaluation on all datasets ---


Evaluation on topic syria: Weighted F1 = 0.3803


Evaluation on topic islam: Weighted F1 = 0.5613


Evaluation on topic notredame: Weighted F1 = 0.5271


Evaluation on topic gossip: Weighted F1 = 0.3686

=== Phase 2: Training/Fine-tuning on topic: islam ===


Epoch,Training Loss,Validation Loss
1,0.521,0.259663
2,0.2258,0.123075
3,0.0763,0.195271



Classification Report after topic islam:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        85
           1       0.98      0.95      0.97        60

    accuracy                           0.97       145
   macro avg       0.97      0.97      0.97       145
weighted avg       0.97      0.97      0.97       145

Confusion Matrix after topic islam:
[[84  1]
 [ 3 57]]
Weighted F1-score after topic islam: 0.972340396612888

--- Evaluation on all datasets ---


Evaluation on topic syria: Weighted F1 = 0.3042


Evaluation on topic islam: Weighted F1 = 0.9723


Evaluation on topic notredame: Weighted F1 = 0.2163


Evaluation on topic gossip: Weighted F1 = 0.3552

=== Phase 3: Training/Fine-tuning on topic: notredame ===


Epoch,Training Loss,Validation Loss
1,No log,0.26901
2,0.538500,0.217354
3,0.220100,0.234352



Classification Report after topic notredame:
              precision    recall  f1-score   support

           0       0.89      0.97      0.93        68
           1       0.95      0.81      0.88        43

    accuracy                           0.91       111
   macro avg       0.92      0.89      0.90       111
weighted avg       0.91      0.91      0.91       111

Confusion Matrix after topic notredame:
[[66  2]
 [ 8 35]]
Weighted F1-score after topic notredame: 0.9084348432939982

--- Evaluation on all datasets ---


Evaluation on topic syria: Weighted F1 = 0.3016


Evaluation on topic islam: Weighted F1 = 0.9450


Evaluation on topic notredame: Weighted F1 = 0.9084


Evaluation on topic gossip: Weighted F1 = 0.5072

=== Phase 4: Training/Fine-tuning on topic: gossip ===


Epoch,Training Loss,Validation Loss
1,No log,0.525492
2,0.609900,0.466328
3,0.348600,0.536904



Classification Report after topic gossip:
              precision    recall  f1-score   support

           0       0.81      0.70      0.75        50
           1       0.74      0.84      0.79        50

    accuracy                           0.77       100
   macro avg       0.78      0.77      0.77       100
weighted avg       0.78      0.77      0.77       100

Confusion Matrix after topic gossip:
[[35 15]
 [ 8 42]]
Weighted F1-score after topic gossip: 0.7688674505074866

--- Evaluation on all datasets ---


Evaluation on topic syria: Weighted F1 = 0.5193


Evaluation on topic islam: Weighted F1 = 0.8565


Evaluation on topic notredame: Weighted F1 = 0.7845


Evaluation on topic gossip: Weighted F1 = 0.7689


In [11]:
# ---------------
# Results summary
# ---------------

# Replay the collected scores: one section per training topic, one line per
# evaluated topic.
print("\n=== Results Summary ===")
for trained_on in results:
    print(f"\nResults after training on topic {trained_on}:")
    for evaluated_on, score in results[trained_on].items():
        print(f"  Test on topic {evaluated_on}: Weighted F1 = {score:.4f}")


=== Results Summary ===

Results after training on topic syria:
  Test on topic syria: Weighted F1 = 0.3803
  Test on topic islam: Weighted F1 = 0.5613
  Test on topic notredame: Weighted F1 = 0.5271
  Test on topic gossip: Weighted F1 = 0.3686

Results after training on topic islam:
  Test on topic syria: Weighted F1 = 0.3042
  Test on topic islam: Weighted F1 = 0.9723
  Test on topic notredame: Weighted F1 = 0.2163
  Test on topic gossip: Weighted F1 = 0.3552

Results after training on topic notredame:
  Test on topic syria: Weighted F1 = 0.3016
  Test on topic islam: Weighted F1 = 0.9450
  Test on topic notredame: Weighted F1 = 0.9084
  Test on topic gossip: Weighted F1 = 0.5072

Results after training on topic gossip:
  Test on topic syria: Weighted F1 = 0.5193
  Test on topic islam: Weighted F1 = 0.8565
  Test on topic notredame: Weighted F1 = 0.7845
  Test on topic gossip: Weighted F1 = 0.7689


## VERSION 3: Dataset by Date

In [12]:
# Load the data grouped by date and report the size of every group.
dataset_df = data_by_date()
for date_key in dataset_df:
    print(f"Date: {date_key}, Number of samples: {len(dataset_df[date_key])}")

# Keep only the first three date groups (dropping the rest) to reduce
# computation time.
kept_dates = list(dataset_df)[:3]
dataset_df = {date_key: dataset_df[date_key] for date_key in kept_dates}

# Split each remaining date group into train/val/test.
print("\nSplitting datasets into train/val/test...")
datasets = {}
for date_key, frame in dataset_df.items():
    datasets[date_key] = split_dataset(frame)

# Fresh pretrained model, tokenizer and training configuration for this version.
model, tokenizer, train_args = build_model()

# Replace the raw splits with their tokenized counterparts.
print("\nComputing tokenized datasets...")
datasets = tokenize_datasets(datasets, tokenizer)

  dfKaggleMeg['date'] = pd.to_datetime(dfKaggleMeg['date'], errors='coerce') # convert date column to datetime, coerce errors to NaT


Date: 2011-2013, Number of samples: 55
Date: 2014, Number of samples: 114
Date: 2015, Number of samples: 84
Date: 2016, Number of samples: 49687
Date: 2017, Number of samples: 16657
Date: 2020, Number of samples: 10559

Splitting datasets into train/val/test...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Computing tokenized datasets...

=== Tokenizing dataset: 2011-2013 ===


Map: 100%|██████████| 33/33 [00:00<00:00, 187.49 examples/s]
Map: 100%|██████████| 11/11 [00:00<00:00, 177.44 examples/s]
Map: 100%|██████████| 11/11 [00:00<00:00, 229.09 examples/s]



=== Tokenizing dataset: 2014 ===


Map: 100%|██████████| 68/68 [00:00<00:00, 293.33 examples/s]
Map: 100%|██████████| 23/23 [00:00<00:00, 256.83 examples/s]
Map: 100%|██████████| 23/23 [00:00<00:00, 246.43 examples/s]



=== Tokenizing dataset: 2015 ===


Map: 100%|██████████| 50/50 [00:00<00:00, 239.47 examples/s]
Map: 100%|██████████| 17/17 [00:00<00:00, 201.21 examples/s]
Map: 100%|██████████| 17/17 [00:00<00:00, 164.86 examples/s]


In [13]:
# ------------------------------
# Fine-tuning on Dataset by Date
# ------------------------------

# results[trained_on][tested_on] -> weighted F1 on the test split of date
# group `tested_on`, measured right after the phase that fine-tuned on
# `trained_on`.
results = {}

# sequential training: the same `model` object is handed to every Trainer, so
# each phase starts from the weights left by the previous phase
for i, (date, data) in enumerate(datasets.items()):
    print(f"\n=== Phase {i+1}: Training/Fine-tuning on date: {date} ===")

    # Each split is a (tokenized dataset, labels) pair; the labels of the
    # train and val splits are already stored inside the tokenized datasets,
    # so only the test labels are needed separately for the metrics.
    X_train = data["train"][0]
    X_val = data["val"][0]
    X_test, y_test = data["test"]

    # define trainer
    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=X_train,
        eval_dataset=X_val,
    )

    # fine-tune on the train split only; the val split is used for evaluation
    trainer.train()

    # evaluate on current dataset
    test_output = trainer.predict(X_test)
    y_pred = np.argmax(test_output.predictions, axis=1)
    # zero_division=0 keeps the scores scikit-learn would report anyway but
    # avoids the UndefinedMetricWarning raised when a class is never predicted
    # (which happens on these very small date groups).
    current_f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    print(f"\nClassification Report after date {date}:")
    print(classification_report(y_test, y_pred, zero_division=0))
    print(f"Confusion Matrix after date {date}:")
    print(confusion_matrix(y_test, y_pred))
    # formatted to 4 decimals like every other F1 value printed in this notebook
    print(f"Weighted F1-score after date {date}: {current_f1:.4f}")

    # evaluate on all datasets
    print("\n--- Evaluation on all datasets ---")
    results[date] = {}
    for test_date, test_data in datasets.items(): # for each date
        X_te, y_te = test_data["test"]
        test_output = trainer.predict(X_te)
        preds = np.argmax(test_output.predictions, axis=1)
        f1 = f1_score(y_te, preds, average="weighted", zero_division=0)
        results[date][test_date] = f1
        print(f"Evaluation on date {test_date}: Weighted F1 = {f1:.4f}")


=== Phase 1: Training/Fine-tuning on date: 2011-2013 ===


Epoch,Training Loss,Validation Loss
1,No log,0.697632
2,No log,0.702738
3,No log,0.710625



Classification Report after date 2011-2013:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.50      0.83      0.62         6

    accuracy                           0.45        11
   macro avg       0.25      0.42      0.31        11
weighted avg       0.27      0.45      0.34        11

Confusion Matrix after date 2011-2013:
[[0 5]
 [1 5]]
Weighted F1-score after date 2011-2013: 0.3409090909090909

--- Evaluation on all datasets ---


Evaluation on date 2011-2013: Weighted F1 = 0.3409


Evaluation on date 2014: Weighted F1 = 0.2899


Evaluation on date 2015: Weighted F1 = 0.3088

=== Phase 2: Training/Fine-tuning on date: 2014 ===


Epoch,Training Loss,Validation Loss
1,No log,0.694153
2,No log,0.692683
3,No log,0.689892



Classification Report after date 2014:
              precision    recall  f1-score   support

           0       0.61      0.92      0.73        12
           1       0.80      0.36      0.50        11

    accuracy                           0.65        23
   macro avg       0.71      0.64      0.62        23
weighted avg       0.70      0.65      0.62        23

Confusion Matrix after date 2014:
[[11  1]
 [ 7  4]]
Weighted F1-score after date 2014: 0.6217391304347826

--- Evaluation on all datasets ---


Evaluation on date 2011-2013: Weighted F1 = 0.5221


Evaluation on date 2014: Weighted F1 = 0.6217


Evaluation on date 2015: Weighted F1 = 0.5092

=== Phase 3: Training/Fine-tuning on date: 2015 ===


Epoch,Training Loss,Validation Loss
1,No log,0.700266
2,No log,0.708424
3,No log,0.712617



Classification Report after date 2015:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.53      1.00      0.69         9

    accuracy                           0.53        17
   macro avg       0.26      0.50      0.35        17
weighted avg       0.28      0.53      0.37        17

Confusion Matrix after date 2015:
[[0 8]
 [0 9]]
Weighted F1-score after date 2015: 0.36651583710407243

--- Evaluation on all datasets ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Evaluation on date 2011-2013: Weighted F1 = 0.3850


Evaluation on date 2014: Weighted F1 = 0.2899


Evaluation on date 2015: Weighted F1 = 0.3665


In [14]:
# ---------------
# Results summary
# ---------------

# Replay the collected scores: one section per training date group, one line
# per evaluated date group.
print("\n=== Results Summary ===")
for trained_on in results:
    print(f"\nResults after training on date {trained_on}:")
    for evaluated_on, score in results[trained_on].items():
        print(f"  Test on date {evaluated_on}: Weighted F1 = {score:.4f}")


=== Results Summary ===

Results after training on date 2011-2013:
  Test on date 2011-2013: Weighted F1 = 0.3409
  Test on date 2014: Weighted F1 = 0.2899
  Test on date 2015: Weighted F1 = 0.3088

Results after training on date 2014:
  Test on date 2011-2013: Weighted F1 = 0.5221
  Test on date 2014: Weighted F1 = 0.6217
  Test on date 2015: Weighted F1 = 0.5092

Results after training on date 2015:
  Test on date 2011-2013: Weighted F1 = 0.3850
  Test on date 2014: Weighted F1 = 0.2899
  Test on date 2015: Weighted F1 = 0.3665
