# **CNN-BERT (FakeBERT)**

In [1]:
import os
# Restrict which physical GPU this process can see. This is set here, before
# TensorFlow is imported in the following cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # "0" or "1"

In [2]:
import tensorflow as tf

# Select the first GPU TensorFlow can see and let it allocate memory on demand;
# fall back to the CPU when no GPU is listed.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPU found, using CPU.")
else:
    primary_gpu = gpus[0]
    try:
        # Both calls raise RuntimeError if the runtime has already been
        # initialised, in which case the message is just reported.
        tf.config.set_visible_devices(primary_gpu, 'GPU')
        tf.config.experimental.set_memory_growth(primary_gpu, True)
        print("Using GPU:", primary_gpu)
    except RuntimeError as err:
        print(err)


2025-10-27 17:21:34.818176: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-27 17:21:34.883401: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-27 17:21:36.173194: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Using GPU: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [3]:
from utils import *

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from transformers import BertTokenizer, TFBertModel

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ------------------------------
# Preprocessing and tokenization
# ------------------------------

# Tokenizer and encoder are loaded from the same "bert-base-cased" checkpoint so
# that the token ids produced by one match the vocabulary expected by the other.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
# from_pt=True: build the TensorFlow model from the PyTorch weights of the checkpoint.
bert_model = TFBertModel.from_pretrained("bert-base-cased", from_pt=True)
# Token budget passed to the tokenizer as `max_length` by get_bert_embeddings;
# it equals the default `max_len` of build_model.
max_len = 128


def get_bert_embeddings(texts, batch_size=16):
    """
    Get BERT embeddings for a collection of texts.

    The texts are tokenized and passed through the module-level ``bert_model``
    in mini-batches. Every sequence is truncated or padded to exactly
    ``max_len`` tokens, so all batches share the same shape and the result can
    be fed to a model expecting a fixed ``(max_len, hidden_size)`` input.

    Args:
        texts: List, tuple, NumPy array or pandas Series of input texts.
        batch_size: Number of texts encoded per BERT forward pass.

    Returns:
        Numpy array of BERT embeddings with shape
        (num_texts, max_len, hidden_size); hidden_size is read from the model
        config for the empty-input case. Empty input yields an array of shape
        (0, max_len, hidden_size).

    Raises:
        ValueError: If ``batch_size`` is not a positive number.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Accept plain Python sequences as well as arrays/Series: objects exposing
    # .tolist() are converted that way (as before), anything else via list().
    texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)

    if not texts:
        # np.concatenate cannot handle an empty list of batches.
        return np.empty((0, max_len, bert_model.config.hidden_size), dtype=np.float32)

    embeddings = []

    for start in range(0, len(texts), batch_size):
        input_enc = tokenizer(
            texts[start:start + batch_size],
            truncation=True,
            # Pad to the fixed max_len, not to the longest text of the batch:
            # with per-batch padding the sequence axis differs between batches,
            # which breaks the concatenation below and the fixed-length input
            # of the downstream model.
            padding="max_length",
            max_length=max_len,
            return_tensors='tf'
        )
        # .numpy() copies the result to host memory; the device tensors become
        # unreferenced as soon as this statement finishes. No Keras session
        # reset is done here: it would not release these tensors and it resets
        # global Keras state while other models are alive.
        embeddings.append(bert_model(input_enc).last_hidden_state.numpy())

    return np.concatenate(embeddings, axis=0)

I0000 00:00:1761582101.610984 3827518 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 28198 MB memory:  -> device: 0, name: Tesla V100S-PCIE-32GB, pci bus id: 0000:af:00.0, compute capability: 7.0
TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining 

In [5]:
# ------------------------------
# Build model function
# ------------------------------

def build_model(max_len=128, cnn_filters=96, kernel_size=4,
                dense_units=32, learning_rate=1e-4):
    """
    Create and compile the CNN classifier that consumes pre-computed BERT
    token embeddings (BERT itself is not part of this model, so it is not
    fine-tuned end-to-end).

    Architecture: Conv1D -> global max pooling -> Dense(relu) -> Dropout(0.3)
    -> Dense(1, sigmoid), trained with binary cross-entropy.

    Args:
        max_len (int): Number of token positions per input sequence.
        cnn_filters (int): Number of filters of the Conv1D layer.
        kernel_size (int): Width of the convolution kernel.
        dense_units (int): Units of the hidden dense layer.
        learning_rate (float): Learning rate of the Adam optimizer.

    Returns:
        model (tf.keras.Model): Compiled CNN-BERT model.
    """
    # The feature dimension is fixed to 768, the hidden size of BERT base.
    layer_stack = [
        Input(shape=(max_len, 768)),
        Conv1D(filters=cnn_filters, kernel_size=kernel_size, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(dense_units, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)

    optimizer = Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

## VERSION 1: Dataset (Simple)

In [6]:
# data_loading is not defined in this notebook; presumably it comes from the
# `utils` star import — TODO confirm. It is used below as a mapping from
# dataset name to a sized collection of samples.
dataset_df = data_loading() # load datasets

# Report how many samples each dataset contains.
for name, df in dataset_df.items():
    print(f"Dataset: {name}, Number of samples: {len(df)}")

Dataset: Celebrity, Number of samples: 500
Dataset: CIDII, Number of samples: 722
Dataset: FaKES, Number of samples: 842
Dataset: FakeVsSatire, Number of samples: 486
Dataset: Horne, Number of samples: 326
Dataset: Infodemic, Number of samples: 10559
Dataset: ISOT, Number of samples: 44271
Dataset: Kaggle_clement, Number of samples: 39105
Dataset: Kaggle_meg, Number of samples: 12845
Dataset: LIAR_PLUS, Number of samples: 12784
Dataset: Politifact, Number of samples: 504
Dataset: Unipi_NDF, Number of samples: 554


  dfKaggleMeg['date'] = pd.to_datetime(dfKaggleMeg['date'], errors='coerce') # convert date column to datetime, coerce errors to NaT


In [7]:
# --------------------------------
# Fine-tuning on multiple datasets
# --------------------------------
# One CNN is trained on the datasets one after another (continuing from the
# weights of the previous phase). After each phase it is evaluated on the test
# split of every dataset, and the weighted F1 scores are stored in
# results[<trained-up-to dataset>][<evaluated dataset>].

datasets = {name: split_dataset(df) for name, df in dataset_df.items()} # split all datasets in train/val/test
model = build_model() # initialize model

results = {}


def _predict_labels(embeddings):
    """Return the hard 0/1 predictions of `model` as a flat 1-D int array."""
    probabilities = model.predict(embeddings)
    # Flatten the (n, 1) sigmoid output so the metrics receive 1-D labels.
    return (probabilities > 0.5).astype(int).ravel()


# sequential training
for i, (name, data) in enumerate(datasets.items()):
    print(f"\n=== Phase {i+1}: Training/Fine-tuning on {name} ===")

    X_train, y_train = data["train"]
    X_val, y_val = data["val"]
    X_test, y_test = data["test"]

    # get BERT embeddings
    X_train_emb = get_bert_embeddings(X_train)
    X_val_emb = get_bert_embeddings(X_val)
    X_test_emb = get_bert_embeddings(X_test)

    # early stopping (a fresh callback per phase so its state does not carry over)
    es = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=0)

    # fine-tune on train + val
    # NOTE: Keras takes the validation_split fraction from the END of the arrays
    # before shuffling, so the monitored hold-out is the last 10% of the
    # concatenated train+val data.
    model.fit(
        np.concatenate([X_train_emb, X_val_emb]),
        np.concatenate([y_train, y_val]),
        epochs=10,
        batch_size=16,
        validation_split=0.1,
        callbacks=[es],
        verbose=1
    )

    y_pred = _predict_labels(X_test_emb)
    own_f1 = f1_score(y_test, y_pred, average="weighted")
    print(f"Classification Report after {name}:")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix after {name}:")
    print(confusion_matrix(y_test, y_pred))
    print(f"\nWeighted F1-score after {name}:", own_f1)


    # evaluation on all datasets
    print("\n--- Evaluation on all datasets ---")
    results[name] = {}
    for test_name, test_data in datasets.items(): # for each dataset
        if test_name == name:
            # The test split of the current dataset was embedded and scored
            # just above; reuse that score instead of running BERT and the
            # classifier on the same texts a second time.
            f1 = own_f1
        else:
            X_te, y_te = test_data["test"]
            preds = _predict_labels(get_bert_embeddings(X_te))
            f1 = f1_score(y_te, preds, average="weighted")
        results[name][test_name] = f1
        print(f"Evaluation on {test_name}: Weighted F1 = {f1:.4f}")


TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.



=== Phase 1: Training/Fine-tuning on Celebrity ===
Epoch 1/10


2025-10-27 17:21:53.381156: I external/local_xla/xla/service/service.cc:163] XLA service 0x75ac180040b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-10-27 17:21:53.381190: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): Tesla V100S-PCIE-32GB, Compute Capability 7.0
2025-10-27 17:21:53.406888: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-10-27 17:21:53.564437: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91002


[1m16/23[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 7ms/step - accuracy: 0.4096 - loss: 1.6957

I0000 00:00:1761582114.743596 3828288 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 99ms/step - accuracy: 0.4611 - loss: 1.0794 - val_accuracy: 0.4500 - val_loss: 0.7397
Epoch 2/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.5500 - loss: 0.6930 - val_accuracy: 0.6250 - val_loss: 0.6596
Epoch 3/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.6556 - loss: 0.6070 - val_accuracy: 0.5750 - val_loss: 0.6586
Epoch 4/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7556 - loss: 0.5462 - val_accuracy: 0.7000 - val_loss: 0.6269
Epoch 5/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8139 - loss: 0.4907 - val_accuracy: 0.7000 - val_loss: 0.6180
Epoch 6/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8833 - loss: 0.4286 - val_accuracy: 0.7000 - val_loss: 0.6073
Epoch 7/10
[1m23/23[0m [32m━━━━━━━━━━━━━━━

2025-10-27 17:23:19.673921: W external/local_xla/xla/tsl/framework/bfc_allocator.cc:501] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.24GiB (rounded to 3481927680)requested by op Sub
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2025-10-27 17:23:19.673998: I external/local_xla/xla/tsl/framework/bfc_allocator.cc:1049] BFCAllocator dump for GPU_0_bfc
2025-10-27 17:23:19.674025: I external/local_xla/xla/tsl/framework/bfc_allocator.cc:1056] Bin (256): 	Total Chunks: 131, Chunks in use: 129. 32.8KiB allocated for chunks. 32.2KiB in use in bin. 1.3KiB client-requested in use in bin.
2025-10-27 17:23:19.674038: I external/local_xla/xla/tsl/framework/bfc_allocator.cc:1056] Bin (512): 	Total Chunks: 5, Chunks in use: 4. 2.5KiB allocated for chunks. 2.0KiB in use in bin. 1.6KiB client-requested in use in bin.
2025-10-27 17

ResourceExhaustedError: Exception encountered when calling layer 'LayerNorm' (type LayerNormalization).

{{function_node __wrapped__Sub_device_/job:localhost/replica:0/task:0/device:GPU:0}} failed to allocate memory [Op:Sub] name: 

Call arguments received by layer 'LayerNorm' (type LayerNormalization):
  • inputs=tf.Tensor(shape=(8855, 128, 768), dtype=float32)

In [None]:
# ---------------
# Results summary
# ---------------
# Flatten the nested results mapping (training phase -> evaluated dataset ->
# weighted F1) into report lines and emit them in a single print call.

summary_lines = ["\n=== Results Summary ==="]
for trained_on, scores in results.items():
    summary_lines.append(f"\nResults after training on {trained_on}:")
    summary_lines.extend(
        f"  Test on {tested_on}: Weighted F1 = {score:.4f}"
        for tested_on, score in scores.items()
    )
print("\n".join(summary_lines))