# **CNN-BERT (FakeBERT)**

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
import optuna

In [5]:
# ----------------
# Load dataset
# ----------------

path_file = "datasets/Unipi_NDF/df_ndf.csv"
df = pd.read_csv(path_file, sep="\t", encoding="utf-8")
df.head()

texts = df["texts"].astype(str).tolist()
labels = df["labels"].values

In [10]:
# ------------------------------
# Preprocessing and tokenization
# ------------------------------

# split train/validation/test (0.6 / 0.2 / 0.2): stratify to maintain label distribution
X_train, X_temp, y_train, y_temp = train_test_split(texts, labels, test_size=0.4, stratify=labels, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
bert_model = TFBertModel.from_pretrained("bert-base-cased", from_pt=True)

max_len = 128 # maximum length for BERT inputs

def get_bert_embeddings(texts):
    """ Extract BERT [CLS] embeddings for a list of texts. """
    input_enc = tokenizer(
        texts, truncation=True, padding=True, max_length=max_len, return_tensors='tf'
    )
    outputs = bert_model(input_enc)

    # For each input, get the embeddings of all tokens
    token_embeddings = outputs.last_hidden_state  # shape: (N, max_len, 768)
    return token_embeddings.numpy()

# Extract BERT embeddings for train, validation, and test sets
X_train_emb = get_bert_embeddings(X_train)
X_val_emb = get_bert_embeddings(X_val)
X_test_emb = get_bert_embeddings(X_test)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [13]:
# ----------------------------
# Optuna objective function
# ----------------------------

def objectiveCNNBERT(trial):
    cnn_filters = trial.suggest_categorical("cnn_filters", [64, 96, 128])
    kernel_size = trial.suggest_categorical("kernel_size", [3, 4, 5])
    dense_units = trial.suggest_categorical("dense_units", [16, 32, 64])
    learning_rate = trial.suggest_categorical("learning_rate", [1e-5, 1e-4, 1e-3, 1e-2])

    model = Sequential([
        Input(shape=(max_len, X_train_emb.shape[2])),
        Conv1D(filters=cnn_filters, kernel_size=kernel_size, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(dense_units, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy', metrics=['accuracy'])

    es = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=0)
    model.fit(X_train_emb, y_train,
              validation_data=(X_val_emb, y_val),
              epochs=50,
              batch_size=8,
              callbacks=[es],
              verbose=0)

    preds_val = (model.predict(X_val_emb) > 0.5).astype(int)
    f1 = f1_score(y_val, preds_val)
    return f1

In [14]:
# ----------------------------
# Hyperparameter optimization
# ----------------------------

study = optuna.create_study(direction="maximize") # maximize F1-score
study.optimize(objectiveCNNBERT, n_trials=20) # 20 trials for demonstration

print("Best parameters:", study.best_params)

[I 2025-10-22 11:49:25,543] A new study created in memory with name: no-name-ace195d6-6aed-43c3-8ec8-6d1dc2a0bd09
2025-10-22 11:49:26.540116: I external/local_xla/xla/service/service.cc:163] XLA service 0x7980a4005340 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-10-22 11:49:26.540156: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): Tesla V100S-PCIE-32GB, Compute Capability 7.0
2025-10-22 11:49:26.582372: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-10-22 11:49:26.735743: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91002
I0000 00:00:1761126567.852105 2988730 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2025-10-22 11:49:28.702946: E external/local_xla/xla/stream_executor/cuda/cuda_time

[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 291ms/step

2025-10-22 11:49:39.538043: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:39.802049: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 344ms/step


[I 2025-10-22 11:49:40,163] Trial 0 finished with value: 0.775 and parameters: {'cnn_filters': 64, 'kernel_size': 5, 'dense_units': 64, 'learning_rate': 0.0001}. Best is trial 0 with value: 0.775.
2025-10-22 11:49:43.019069: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:43.289064: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:45.252198: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:45.523739: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay 

[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 306ms/step

2025-10-22 11:49:47.904452: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:48.178138: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 346ms/step


[I 2025-10-22 11:49:48,533] Trial 1 finished with value: 0.4383561643835616 and parameters: {'cnn_filters': 128, 'kernel_size': 5, 'dense_units': 32, 'learning_rate': 0.01}. Best is trial 0 with value: 0.775.
2025-10-22 11:49:51.383464: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:51.653657: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:53.580617: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:53.851587: E external/local_xla/xla/stream_executor/cuda/cuda_timer.c

[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 276ms/step

2025-10-22 11:49:59.986695: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:50:00.260549: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 340ms/step


[I 2025-10-22 11:50:00,600] Trial 2 finished with value: 0.9069767441860465 and parameters: {'cnn_filters': 128, 'kernel_size': 4, 'dense_units': 16, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 112ms/step


[I 2025-10-22 11:50:32,115] Trial 3 finished with value: 0.6571428571428571 and parameters: {'cnn_filters': 96, 'kernel_size': 3, 'dense_units': 64, 'learning_rate': 1e-05}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step


[I 2025-10-22 11:50:44,301] Trial 4 finished with value: 0.8 and parameters: {'cnn_filters': 64, 'kernel_size': 5, 'dense_units': 16, 'learning_rate': 0.0001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 113ms/step


[I 2025-10-22 11:51:15,898] Trial 5 finished with value: 0.676056338028169 and parameters: {'cnn_filters': 96, 'kernel_size': 4, 'dense_units': 16, 'learning_rate': 1e-05}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 94ms/step


[I 2025-10-22 11:51:45,585] Trial 6 finished with value: 0.7123287671232876 and parameters: {'cnn_filters': 96, 'kernel_size': 4, 'dense_units': 16, 'learning_rate': 1e-05}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 96ms/step


[I 2025-10-22 11:52:12,646] Trial 7 finished with value: 0.810126582278481 and parameters: {'cnn_filters': 128, 'kernel_size': 4, 'dense_units': 32, 'learning_rate': 1e-05}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 117ms/step


[I 2025-10-22 11:52:20,420] Trial 8 finished with value: 0.8735632183908046 and parameters: {'cnn_filters': 128, 'kernel_size': 3, 'dense_units': 64, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.
2025-10-22 11:52:23.219573: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:52:23.478125: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:52:25.319693: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:52:25.572827: E external/local_xla/xla/stream_executor/cud

[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 265ms/step

2025-10-22 11:52:53.704019: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:52:53.964516: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 329ms/step


[I 2025-10-22 11:52:54,287] Trial 9 finished with value: 0.6470588235294118 and parameters: {'cnn_filters': 64, 'kernel_size': 4, 'dense_units': 16, 'learning_rate': 1e-05}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 95ms/step


[I 2025-10-22 11:53:01,165] Trial 10 finished with value: 0.5974025974025974 and parameters: {'cnn_filters': 128, 'kernel_size': 4, 'dense_units': 16, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 107ms/step


[I 2025-10-22 11:53:09,082] Trial 11 finished with value: 0.8764044943820225 and parameters: {'cnn_filters': 128, 'kernel_size': 3, 'dense_units': 64, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 99ms/step


[I 2025-10-22 11:53:17,162] Trial 12 finished with value: 0.8470588235294118 and parameters: {'cnn_filters': 128, 'kernel_size': 3, 'dense_units': 64, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step


[I 2025-10-22 11:53:24,624] Trial 13 finished with value: 0.8627450980392157 and parameters: {'cnn_filters': 128, 'kernel_size': 3, 'dense_units': 64, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 93ms/step


[I 2025-10-22 11:53:33,163] Trial 14 finished with value: 0.8354430379746836 and parameters: {'cnn_filters': 128, 'kernel_size': 3, 'dense_units': 32, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 93ms/step


[I 2025-10-22 11:53:42,261] Trial 15 finished with value: 0.0 and parameters: {'cnn_filters': 128, 'kernel_size': 4, 'dense_units': 16, 'learning_rate': 0.01}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 96ms/step


[I 2025-10-22 11:53:50,842] Trial 16 finished with value: 0.8433734939759037 and parameters: {'cnn_filters': 128, 'kernel_size': 3, 'dense_units': 64, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 97ms/step


[I 2025-10-22 11:53:57,673] Trial 17 finished with value: 0.8157894736842105 and parameters: {'cnn_filters': 128, 'kernel_size': 4, 'dense_units': 64, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 97ms/step


[I 2025-10-22 11:54:07,377] Trial 18 finished with value: 0.8571428571428571 and parameters: {'cnn_filters': 96, 'kernel_size': 3, 'dense_units': 16, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step


[I 2025-10-22 11:54:17,157] Trial 19 finished with value: 0.7297297297297297 and parameters: {'cnn_filters': 64, 'kernel_size': 5, 'dense_units': 32, 'learning_rate': 0.0001}. Best is trial 2 with value: 0.9069767441860465.


Best parameters: {'cnn_filters': 128, 'kernel_size': 4, 'dense_units': 16, 'learning_rate': 0.001}


In [17]:
# -----------
# Final training and evaluation
# -----------

best_params = study.best_params
best_model = Sequential([
    Input(shape=(max_len, X_train_emb.shape[2])),
    Conv1D(filters=best_params["cnn_filters"], kernel_size=best_params["kernel_size"], activation='relu'),
    GlobalMaxPooling1D(),
    Dense(best_params["dense_units"], activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
best_model.compile(
    optimizer=Adam(learning_rate=best_params["learning_rate"]),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
best_model.fit(
    np.concatenate((X_train_emb, X_val_emb)), 
    np.concatenate((y_train, y_val)),
    epochs=50,
    batch_size=8,
    callbacks=[EarlyStopping(monitor='loss', patience=2, restore_best_weights=True, verbose=0)],
    verbose=0
)

y_pred = (best_model.predict(X_test_emb) > 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Weighted F1-score:", f1_score(y_test, y_pred, average="weighted"))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 97ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95        68
           1       0.95      0.88      0.92        43

    accuracy                           0.94       111
   macro avg       0.94      0.93      0.93       111
weighted avg       0.94      0.94      0.94       111

Confusion Matrix:
[[66  2]
 [ 5 38]]
Weighted F1-score: 0.9364777796779183
