In [1]:
import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "0" # oppure "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # usa entrambe le GPU
#os.environ["CUDA_LAUNCH_BLOCKING"] = "0,1"  # per debug

In [2]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


# **LOGISTIC REGRESSION**

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import optuna

In [44]:
# ----------------
# Load dataset
# ----------------

path_file = "datasets/Unipi_NDF/df_ndf.csv"
df = pd.read_csv(path_file, sep="\t", encoding="utf-8")
df.head()

X = df["texts"].astype(str)
y = df["labels"]

# split train/validation/test (0.6 / 0.2 / 0.2): stratify to maintain label distribution
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)


In [45]:
# -----------------------------
# Optuna objective function
# -----------------------------

def objectiveLR(trial):

    # Parameters
    penalty = trial.suggest_categorical("penalty", ["l2", None])                        # L2 reg or no reg
    C = trial.suggest_categorical("C", [0.01, 0.1, 1.0, 10.0, 100.0])                   # LR regularization strength
    solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear", "sag", "saga"]) # solvers supporting L2 and none penalties

    # Some solvers do not support some parameters combinations
    try:
        model = Pipeline([
            ('tfidf', TfidfVectorizer(  # TD-IDF vectorization inside pipeline
                max_features=5000,      # limit to top 5000 features
                ngram_range=(1, 2),     # unigrams + bigrams
                stop_words="english"    # remove English stop words
            )),
            ('clf', LogisticRegression( # Logistic Regression classifier
                C=C,
                penalty=penalty if penalty != "none" else None,
                solver=solver,
                max_iter=1000,
                random_state=42,
            ))
        ])

        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        f1 = f1_score(y_val, preds, average="weighted") # weighted F1-score: average for label imbalance
        return f1
    
    except Exception:
        return 0.0  # Return a default value in case of failure

In [46]:
# ----------------------------
# Hyperparameter optimization
# ----------------------------

study = optuna.create_study(direction="maximize") # maximize F1-score
study.optimize(objectiveLR, n_trials=50) # 50 trials for demonstration; increase for better results

print("Best parameters:", study.best_params)

[I 2025-10-21 17:33:17,817] A new study created in memory with name: no-name-ef853918-9b65-4afe-a715-4599008b341c
[I 2025-10-21 17:33:17,874] Trial 0 finished with value: 0.0 and parameters: {'penalty': None, 'C': 1.0, 'solver': 'liblinear'}. Best is trial 0 with value: 0.0.
[I 2025-10-21 17:33:17,937] Trial 1 finished with value: 0.8807294291927862 and parameters: {'penalty': 'l2', 'C': 100.0, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8807294291927862.
[I 2025-10-21 17:33:18,154] Trial 2 finished with value: 0.8814616818430635 and parameters: {'penalty': None, 'C': 100.0, 'solver': 'saga'}. Best is trial 2 with value: 0.8814616818430635.
[I 2025-10-21 17:33:18,197] Trial 3 finished with value: 0.8417165934407314 and parameters: {'penalty': 'l2', 'C': 10.0, 'solver': 'liblinear'}. Best is trial 2 with value: 0.8814616818430635.
[I 2025-10-21 17:33:18,229] Trial 4 finished with value: 0.0 and parameters: {'penalty': None, 'C': 10.0, 'solver': 'liblinear'}. Best is trial 2 with v

Best parameters: {'penalty': None, 'C': 100.0, 'solver': 'saga'}


In [47]:
# --------------------
# Final training and evaluation
# --------------------

best_params = study.best_params
best_model = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        stop_words="english"
    )),
    ('clf', LogisticRegression( 
        C=best_params["C"],
        penalty=best_params["penalty"] if best_params["penalty"] != "none" else None,
        solver=best_params["solver"],
        max_iter=1000,
        random_state=42,
    ))
])

best_model.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val])) # train on train + validation
y_pred = best_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Weighted F1-score:", f1_score(y_test, y_pred, average="weighted"))



Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.91      0.89        68
           1       0.85      0.79      0.82        43

    accuracy                           0.86       111
   macro avg       0.86      0.85      0.86       111
weighted avg       0.86      0.86      0.86       111

Confusion Matrix:
[[62  6]
 [ 9 34]]
Weighted F1-score: 0.8638809564526821




# **SUPPORT VECTOR MACHINE**

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import optuna

In [49]:
# ----------------
# Load dataset
# ----------------
percorso_file = "datasets/Unipi_NDF/df_ndf.csv"
df = pd.read_csv(percorso_file, sep="\t", encoding="utf-8")
df.head()

X = df["texts"].astype(str)
y = df["labels"]

# split train/validation/test (0.6 / 0.2 / 0.2): stratify to maintain label distribution
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [50]:
# -----------------------------
# Optuna objective function
# -----------------------------

def objectiveSVM(trial):
    penalty = trial.suggest_categorical("penalty", ["l1", "l2"])         # l1 or l2 penalty
    C = trial.suggest_categorical("C", [0.01, 0.1, 1.0, 10.0, 100.0])    # SVM regularization strength
    loss = trial.suggest_categorical("loss", ["hinge", "squared_hinge"]) # loss function
    dual = trial.suggest_categorical("dual", [True, False])              # dual formulation

    # Some parameter combinations are invalid for LinearSVC
    try:
        model = Pipeline([
            ('tfidf', TfidfVectorizer(  # TD-IDF vectorization inside pipeline
                max_features=5000,      # limit to top 5000 features
                ngram_range=(1, 2),     # unigrams + bigrams
                stop_words="english"    # remove English stop words
            )),
            ('clf', LinearSVC(         # Linear SVM classifier
                C=C,
                penalty=penalty,
                loss=loss,
                dual=dual,
                max_iter=2000,
                random_state=42
            ))
        ])

        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        f1 = f1_score(y_val, preds, average="weighted") # weighted F1-score: average for label imbalance
        return f1
    
    except Exception:
        return 0.0  # Return a default value in case of failure

In [51]:
# ----------------------------
# Hyperparameter optimization
# ----------------------------

study = optuna.create_study(direction="maximize") # maximize F1-score
study.optimize(objectiveSVM, n_trials=50) # 50 trials for demonstration; increase for better results

print("Best parameters:", study.best_params)

[I 2025-10-21 17:33:25,743] A new study created in memory with name: no-name-67d32e58-1edb-4097-8dc1-5e43fb63d35e
[I 2025-10-21 17:33:25,828] Trial 0 finished with value: 0.8814616818430635 and parameters: {'penalty': 'l2', 'C': 100.0, 'loss': 'squared_hinge', 'dual': True}. Best is trial 0 with value: 0.8814616818430635.
[I 2025-10-21 17:33:25,860] Trial 1 finished with value: 0.0 and parameters: {'penalty': 'l1', 'C': 0.01, 'loss': 'hinge', 'dual': True}. Best is trial 0 with value: 0.8814616818430635.
[I 2025-10-21 17:33:25,902] Trial 2 finished with value: 0.45439821844316225 and parameters: {'penalty': 'l1', 'C': 0.1, 'loss': 'squared_hinge', 'dual': False}. Best is trial 0 with value: 0.8814616818430635.
[I 2025-10-21 17:33:25,935] Trial 3 finished with value: 0.0 and parameters: {'penalty': 'l1', 'C': 0.1, 'loss': 'squared_hinge', 'dual': True}. Best is trial 0 with value: 0.8814616818430635.
[I 2025-10-21 17:33:25,968] Trial 4 finished with value: 0.0 and parameters: {'penalty'

Best parameters: {'penalty': 'l2', 'C': 100.0, 'loss': 'squared_hinge', 'dual': False}


In [52]:
# --------------------
# Final training and evaluation
# --------------------

best_params = study.best_params
best_model = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        stop_words="english"
    )),
    ('clf', LinearSVC(
        C=best_params["C"],
        penalty=best_params["penalty"],
        loss=best_params["loss"],
        dual=best_params["dual"],
        max_iter=2000,
        random_state=42
    ))
])

best_model.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val])) # train on train + validation
y_pred = best_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Weighted F1-score:", f1_score(y_test, y_pred, average="weighted"))

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.90      0.88        68
           1       0.83      0.79      0.81        43

    accuracy                           0.86       111
   macro avg       0.85      0.84      0.85       111
weighted avg       0.86      0.86      0.86       111

Confusion Matrix:
[[61  7]
 [ 9 34]]
Weighted F1-score: 0.8551843769235075


# **NAIVE BAYES**

In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import optuna

In [54]:
# ----------------
# Load dataset
# ----------------

path_file = "datasets/Unipi_NDF/df_ndf.csv"
df = pd.read_csv(path_file, sep="\t", encoding="utf-8")
df.head()

X = df["texts"].astype(str)
y = df["labels"]

# split train/validation/test (0.6 / 0.2 / 0.2): stratify to maintain label distribution
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [55]:
# -----------------------------
# Optuna objective function
# -----------------------------

def objectiveNB(trial):
    alpha = trial.suggest_categorical("alpha", [0.01, 0.1, 1.0, 10.0, 100.0]) # smoothing parameter
    fit_prior = trial.suggest_categorical("fit_prior", [True, False])   # whether to learn class prior probabilities

    model = Pipeline([
        ('tfidf', TfidfVectorizer(  # TD-IDF vectorization inside pipeline
            max_features=5000,      # limit to top 5000 features
            ngram_range=(1, 2),     # unigrams + bigrams
            stop_words="english"    # remove English stop words
        )),
        ('clf', MultinomialNB(      # Multinomial Naive Bayes classifier
            alpha=alpha,
            fit_prior=fit_prior
        ))
    ])

    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    f1 = f1_score(y_val, preds, average="weighted") # weighted F1-score: average for label imbalance
    return f1

In [56]:
# ----------------------------
# Hyperparameter optimization
# ----------------------------

study = optuna.create_study(direction="maximize") # maximize F1-score
study.optimize(objectiveNB, n_trials=50) # 50 trials for demonstration; increase for better results

print("Best parameters:", study.best_params)

[I 2025-10-21 17:33:28,813] A new study created in memory with name: no-name-6a15be51-ebee-4db9-89f4-b6383361a328
[I 2025-10-21 17:33:28,865] Trial 0 finished with value: 0.8908846734933691 and parameters: {'alpha': 0.01, 'fit_prior': True}. Best is trial 0 with value: 0.8908846734933691.
[I 2025-10-21 17:33:28,906] Trial 1 finished with value: 0.8050227461992169 and parameters: {'alpha': 1.0, 'fit_prior': False}. Best is trial 0 with value: 0.8908846734933691.
[I 2025-10-21 17:33:28,947] Trial 2 finished with value: 0.5977497859850802 and parameters: {'alpha': 100.0, 'fit_prior': False}. Best is trial 0 with value: 0.8908846734933691.
[I 2025-10-21 17:33:28,987] Trial 3 finished with value: 0.8908846734933691 and parameters: {'alpha': 0.01, 'fit_prior': True}. Best is trial 0 with value: 0.8908846734933691.
[I 2025-10-21 17:33:29,028] Trial 4 finished with value: 0.8050227461992169 and parameters: {'alpha': 1.0, 'fit_prior': False}. Best is trial 0 with value: 0.8908846734933691.
[I 2

Best parameters: {'alpha': 0.01, 'fit_prior': True}


In [57]:
# --------------------
# Final training and evaluation
# --------------------

best_params = study.best_params
best_model = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
        stop_words="english"
    )),
    ('clf', MultinomialNB(
        alpha=best_params["alpha"],
        fit_prior=best_params["fit_prior"]
    ))
])

best_model.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val])) # train on train + validation
y_pred = best_model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Weighted F1-score:", f1_score(y_test, y_pred, average="weighted"))

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91        68
           1       0.88      0.81      0.84        43

    accuracy                           0.88       111
   macro avg       0.88      0.87      0.87       111
weighted avg       0.88      0.88      0.88       111

Confusion Matrix:
[[63  5]
 [ 8 35]]
Weighted F1-score: 0.8820301622589912


# **CNN**

In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from gensim.models import KeyedVectors
import optuna

In [59]:
# ----------------
# Load dataset
# ----------------

path_file = "datasets/Unipi_NDF/df_ndf.csv"
df = pd.read_csv(path_file, sep="\t", encoding="utf-8")
df.head()

texts = df["texts"].astype(str).tolist()
labels = df["labels"].values

In [60]:
# ------------------------------
# Preprocessing and tokenization
# ------------------------------

max_words = 20000 # consider only the top max_words
max_len = 300     # max length of sequences

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>") # tokenizer with OOV token
tokenizer.fit_on_texts(texts) # create vocabulary {word: index}

sequences = tokenizer.texts_to_sequences(texts) # convert texts to sequences of integers
X = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post") # padd/truncate sequences to max_len

encoder = LabelEncoder() # encode labels to integers
y = encoder.fit_transform(labels)

# split train/validation/test (0.6 / 0.2 / 0.2): stratify to maintain label distribution
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [61]:
# -----------------------------------------------------------
# Load pre-trained Word2Vec embeddings (Google News)
# -----------------------------------------------------------

print("Loading pre-trained Word2Vec model (may take time)...")
w2v_path = "Word2Vec_GoogleNews300/word2vec-google-news-300.model"
w2v_model = KeyedVectors.load(w2v_path, mmap='r') # load model with memory mapping (mmap='r' for only reading)

embedding_dim = 300                             # dimension of Word2Vec embeddings
word_index = tokenizer.word_index               # vocabulary from tokenizer {word: index}
num_words = min(max_words, len(word_index) + 1) # number of words to consider (max_words or vocab size)

# create embedding matrix: each row corresponds to a word index from tokenizer, each column to an embedding dimension
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words: # skip because we only consider top max_words
        continue
    if word in w2v_model: # if word has a pre-trained embedding
        embedding_matrix[i] = w2v_model[word]

Loading pre-trained Word2Vec model (may take time)...


In [62]:
# ---------------------------
# Optuna objective function
# ---------------------------

def objectiveCNN(trial):
    filter_size = trial.suggest_categorical("filter_size", [3, 4, 5])                    # size of convolutional filters
    num_filters = trial.suggest_categorical("num_filters", [16, 32, 64, 96, 128])        # number of convolutional filters
    dropout = trial.suggest_categorical("dropout", [0.2, 0.4, 0.6, 0.8])                 # dropout rate
    hidden_units = trial.suggest_categorical("hidden_units", [8, 16, 32, 64])            # number of units in dense layer
    learning_rate = trial.suggest_categorical("learning_rate", [1e-5, 1e-4, 1e-3, 1e-2]) # learning rate for Adam optimizer

    # CNN model
    model = Sequential([
        Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=True), # embedding layer with pre-trained weights
        Dropout(dropout),
        Conv1D(num_filters, filter_size, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(hidden_units, activation='relu'),
        Dropout(dropout),
        Dense(1, activation='sigmoid')
    ])

    # Compile model with Adam optimizer
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    # Train model with early stopping
    es = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=0)
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50,
        batch_size=8,
        callbacks=[es], # early stopping
        verbose=0
    )

    #dummy_input = np.zeros((1, max_len), dtype='int32')  # input fittizio per inizializzare i layer
    #model(dummy_input)  # forza la costruzione del modello
    #model.summary()

    preds_val = (model.predict(X_val) > 0.5).astype(int)
    f1 = f1_score(y_val, preds_val, average='weighted') # weighted F1-score: average for label imbalance
    return f1

In [63]:
# ----------------------------
# Hyperparameter optimization
# ----------------------------

study = optuna.create_study(direction="maximize") # maximize F1-score
study.optimize(objectiveCNN, n_trials=50) # 50 trials for demonstration; increase for better results

print("Best parameters:", study.best_params)

[I 2025-10-21 17:33:50,949] A new study created in memory with name: no-name-5df5b918-2f5e-4b9b-a1d6-a0a1b01623dd


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 121ms/step


[I 2025-10-21 17:33:58,815] Trial 0 finished with value: 0.862380110607061 and parameters: {'filter_size': 5, 'num_filters': 128, 'dropout': 0.4, 'hidden_units': 64, 'learning_rate': 0.01}. Best is trial 0 with value: 0.862380110607061.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 113ms/step


[I 2025-10-21 17:34:08,179] Trial 1 finished with value: 0.45439821844316225 and parameters: {'filter_size': 3, 'num_filters': 128, 'dropout': 0.4, 'hidden_units': 16, 'learning_rate': 0.01}. Best is trial 0 with value: 0.862380110607061.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 108ms/step


[I 2025-10-21 17:34:15,529] Trial 2 finished with value: 0.8711204161908388 and parameters: {'filter_size': 5, 'num_filters': 64, 'dropout': 0.6, 'hidden_units': 64, 'learning_rate': 0.01}. Best is trial 2 with value: 0.8711204161908388.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 107ms/step


[I 2025-10-21 17:34:37,406] Trial 3 finished with value: 0.8614197981286588 and parameters: {'filter_size': 5, 'num_filters': 64, 'dropout': 0.2, 'hidden_units': 16, 'learning_rate': 0.0001}. Best is trial 2 with value: 0.8711204161908388.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step


[I 2025-10-21 17:35:04,968] Trial 4 finished with value: 0.6635222270209655 and parameters: {'filter_size': 3, 'num_filters': 128, 'dropout': 0.2, 'hidden_units': 16, 'learning_rate': 1e-05}. Best is trial 2 with value: 0.8711204161908388.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 111ms/step


[I 2025-10-21 17:35:12,342] Trial 5 finished with value: 0.8403456517742233 and parameters: {'filter_size': 3, 'num_filters': 32, 'dropout': 0.4, 'hidden_units': 16, 'learning_rate': 0.01}. Best is trial 2 with value: 0.8711204161908388.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 112ms/step


[I 2025-10-21 17:35:21,137] Trial 6 finished with value: 0.9179350105067362 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.4, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 113ms/step


[I 2025-10-21 17:35:49,574] Trial 7 finished with value: 0.7596647280191584 and parameters: {'filter_size': 3, 'num_filters': 32, 'dropout': 0.4, 'hidden_units': 8, 'learning_rate': 0.0001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 113ms/step


[I 2025-10-21 17:36:03,773] Trial 8 finished with value: 0.8807294291927862 and parameters: {'filter_size': 5, 'num_filters': 128, 'dropout': 0.2, 'hidden_units': 64, 'learning_rate': 0.0001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step


[I 2025-10-21 17:36:14,387] Trial 9 finished with value: 0.45439821844316225 and parameters: {'filter_size': 5, 'num_filters': 128, 'dropout': 0.6, 'hidden_units': 16, 'learning_rate': 0.01}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 111ms/step


[I 2025-10-21 17:36:21,390] Trial 10 finished with value: 0.45439821844316225 and parameters: {'filter_size': 4, 'num_filters': 96, 'dropout': 0.8, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 108ms/step


[I 2025-10-21 17:36:31,104] Trial 11 finished with value: 0.8603381706829983 and parameters: {'filter_size': 5, 'num_filters': 16, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step


[I 2025-10-21 17:36:39,965] Trial 12 finished with value: 0.45439821844316225 and parameters: {'filter_size': 4, 'num_filters': 96, 'dropout': 0.8, 'hidden_units': 64, 'learning_rate': 0.0001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 111ms/step


[I 2025-10-21 17:36:48,308] Trial 13 finished with value: 0.879897158378171 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 107ms/step


[I 2025-10-21 17:37:16,119] Trial 14 finished with value: 0.45439821844316225 and parameters: {'filter_size': 5, 'num_filters': 16, 'dropout': 0.4, 'hidden_units': 8, 'learning_rate': 1e-05}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step


[I 2025-10-21 17:37:33,082] Trial 15 finished with value: 0.8895317853064332 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 64, 'learning_rate': 0.0001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step


[I 2025-10-21 17:37:45,299] Trial 16 finished with value: 0.879897158378171 and parameters: {'filter_size': 4, 'num_filters': 96, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step


[I 2025-10-21 17:37:57,954] Trial 17 finished with value: 0.45439821844316225 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.8, 'hidden_units': 64, 'learning_rate': 0.0001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 107ms/step


[I 2025-10-21 17:38:06,387] Trial 18 finished with value: 0.8814616818430635 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step


[I 2025-10-21 17:38:34,270] Trial 19 finished with value: 0.5653844542216635 and parameters: {'filter_size': 4, 'num_filters': 96, 'dropout': 0.4, 'hidden_units': 8, 'learning_rate': 1e-05}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step


[I 2025-10-21 17:38:48,774] Trial 20 finished with value: 0.8701682451682452 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 64, 'learning_rate': 0.0001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step


[I 2025-10-21 17:38:57,177] Trial 21 finished with value: 0.9090705612444742 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 113ms/step


[I 2025-10-21 17:39:06,181] Trial 22 finished with value: 0.9010850844085616 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 108ms/step


[I 2025-10-21 17:39:15,201] Trial 23 finished with value: 0.8711204161908388 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 114ms/step


[I 2025-10-21 17:39:23,527] Trial 24 finished with value: 0.879897158378171 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 107ms/step


[I 2025-10-21 17:39:32,825] Trial 25 finished with value: 0.821562787277073 and parameters: {'filter_size': 5, 'num_filters': 16, 'dropout': 0.4, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step


[I 2025-10-21 17:39:48,668] Trial 26 finished with value: 0.8179719687974514 and parameters: {'filter_size': 5, 'num_filters': 32, 'dropout': 0.8, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step


[I 2025-10-21 17:39:59,451] Trial 27 finished with value: 0.8603381706829983 and parameters: {'filter_size': 5, 'num_filters': 64, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 108ms/step


[I 2025-10-21 17:40:07,827] Trial 28 finished with value: 0.8895317853064332 and parameters: {'filter_size': 3, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 103ms/step


[I 2025-10-21 17:40:17,354] Trial 29 finished with value: 0.8711204161908388 and parameters: {'filter_size': 4, 'num_filters': 96, 'dropout': 0.4, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step


[I 2025-10-21 17:40:27,086] Trial 30 finished with value: 0.879897158378171 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.4, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step


[I 2025-10-21 17:40:45,229] Trial 31 finished with value: 0.879897158378171 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 64, 'learning_rate': 0.0001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step


[I 2025-10-21 17:40:54,138] Trial 32 finished with value: 0.8895317853064332 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 64, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 108ms/step


[I 2025-10-21 17:41:01,138] Trial 33 finished with value: 0.853671092695483 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.01}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 105ms/step


[I 2025-10-21 17:41:18,516] Trial 34 finished with value: 0.8301680058436814 and parameters: {'filter_size': 5, 'num_filters': 64, 'dropout': 0.2, 'hidden_units': 8, 'learning_rate': 0.0001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 113ms/step


[I 2025-10-21 17:41:46,251] Trial 35 finished with value: 0.7153516419645453 and parameters: {'filter_size': 5, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 64, 'learning_rate': 1e-05}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 311ms/step


[I 2025-10-21 17:41:59,591] Trial 36 finished with value: 0.8403456517742233 and parameters: {'filter_size': 3, 'num_filters': 32, 'dropout': 0.6, 'hidden_units': 16, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 111ms/step


[I 2025-10-21 17:42:07,210] Trial 37 finished with value: 0.8711204161908388 and parameters: {'filter_size': 5, 'num_filters': 64, 'dropout': 0.4, 'hidden_units': 32, 'learning_rate': 0.01}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step


[I 2025-10-21 17:42:24,978] Trial 38 finished with value: 0.8711204161908388 and parameters: {'filter_size': 3, 'num_filters': 128, 'dropout': 0.2, 'hidden_units': 64, 'learning_rate': 0.0001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 120ms/step


[I 2025-10-21 17:42:53,029] Trial 39 finished with value: 0.47458305763390507 and parameters: {'filter_size': 5, 'num_filters': 16, 'dropout': 0.2, 'hidden_units': 16, 'learning_rate': 1e-05}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step


[I 2025-10-21 17:43:02,971] Trial 40 finished with value: 0.8503997869535 and parameters: {'filter_size': 5, 'num_filters': 128, 'dropout': 0.4, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 107ms/step


[I 2025-10-21 17:43:11,753] Trial 41 finished with value: 0.8807294291927862 and parameters: {'filter_size': 3, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step


[I 2025-10-21 17:43:20,076] Trial 42 finished with value: 0.8996983461749 and parameters: {'filter_size': 3, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step


[I 2025-10-21 17:43:28,744] Trial 43 finished with value: 0.8807294291927862 and parameters: {'filter_size': 3, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 6 with value: 0.9179350105067362.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 114ms/step


[I 2025-10-21 17:43:38,910] Trial 44 finished with value: 0.9187532359946153 and parameters: {'filter_size': 3, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 44 with value: 0.9187532359946153.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step


[I 2025-10-21 17:43:47,873] Trial 45 finished with value: 0.8814616818430635 and parameters: {'filter_size': 3, 'num_filters': 32, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 44 with value: 0.9187532359946153.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 111ms/step


[I 2025-10-21 17:43:57,208] Trial 46 finished with value: 0.45439821844316225 and parameters: {'filter_size': 3, 'num_filters': 96, 'dropout': 0.8, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 44 with value: 0.9187532359946153.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 119ms/step


[I 2025-10-21 17:44:05,641] Trial 47 finished with value: 0.8814616818430635 and parameters: {'filter_size': 3, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 44 with value: 0.9187532359946153.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 108ms/step


[I 2025-10-21 17:44:14,737] Trial 48 finished with value: 0.8996983461749 and parameters: {'filter_size': 3, 'num_filters': 96, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.01}. Best is trial 44 with value: 0.9187532359946153.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step


[I 2025-10-21 17:44:23,796] Trial 49 finished with value: 0.8895317853064332 and parameters: {'filter_size': 3, 'num_filters': 128, 'dropout': 0.2, 'hidden_units': 8, 'learning_rate': 0.001}. Best is trial 44 with value: 0.9187532359946153.


Best parameters: {'filter_size': 3, 'num_filters': 96, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.001}


In [64]:
# -----------
# Final training and evaluation
# -----------

best_params = study.best_params
best_model = Sequential([
    Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=True),
    Dropout(best_params["dropout"]),
    Conv1D(best_params["num_filters"], best_params["filter_size"], activation='relu'),
    GlobalMaxPooling1D(),
    Dense(best_params["hidden_units"], activation='relu'),
    Dropout(best_params["dropout"]),
    Dense(1, activation='sigmoid')
])

best_model.compile(
    optimizer=Adam(learning_rate=best_params["learning_rate"]),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
best_model.fit(
    np.concatenate((X_train, X_val)), 
    np.concatenate((y_train, y_val)),
    epochs=50,
    batch_size=8,
    callbacks=[EarlyStopping(monitor='loss', patience=2, restore_best_weights=True, verbose=0)],
    verbose=0
)
y_pred = (best_model.predict(X_test) > 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Weighted F1-score:", f1_score(y_test, y_pred, average="weighted"))



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 106ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.96      0.93        68
           1       0.92      0.84      0.88        43

    accuracy                           0.91       111
   macro avg       0.91      0.90      0.90       111
weighted avg       0.91      0.91      0.91       111

Confusion Matrix:
[[65  3]
 [ 7 36]]
Weighted F1-score: 0.9089995919264212


# **BIDIRECTIONAL LSTM**

In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from gensim.models import KeyedVectors
import optuna

In [67]:
# ----------------
# Load dataset
# ----------------

path_file = "datasets/Unipi_NDF/df_ndf.csv"
df = pd.read_csv(path_file, sep="\t", encoding="utf-8")
df.head()

texts = df["texts"].astype(str).tolist()
labels = df["labels"].values

In [68]:
# ------------------------------
# Preprocessing and tokenization
# ------------------------------
max_words = 20000     # vocabulary size
max_len = 300         # maximum sequence length

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>") # tokenizer with OOV token
tokenizer.fit_on_texts(texts) # create vocabulary {word: index}

sequences = tokenizer.texts_to_sequences(texts) # convert texts to sequences of integers
X = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post") # padd/truncate sequences to max_len

encoder = LabelEncoder() # encode labels to integers
y = encoder.fit_transform(labels)

# split train/validation/test (0.6 / 0.2 / 0.2): stratify to maintain label distribution
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [69]:
# -----------------------------------------------------------
# Load pre-trained Word2Vec embeddings (Google News)
# -----------------------------------------------------------

print("Loading pre-trained Word2Vec model (may take time)...")
w2v_path = "Word2Vec_GoogleNews300/word2vec-google-news-300.model"
w2v_model = KeyedVectors.load(w2v_path, mmap='r') # load model with memory mapping (mmap='r' for only reading)

embedding_dim = 300                             # dimension of Word2Vec embeddings
word_index = tokenizer.word_index               # vocabulary from tokenizer {word: index}
num_words = min(max_words, len(word_index) + 1) # number of words to consider (max_words or vocab size)

# create embedding matrix: each row corresponds to a word index from tokenizer, each column to an embedding dimension
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words: # skip because we only consider top max_words
        continue
    if word in w2v_model: # if word has a pre-trained embedding
        embedding_matrix[i] = w2v_model[word]

Loading pre-trained Word2Vec model (may take time)...


In [70]:
# ------------------------------
# Optuna objective function
# ------------------------------

def objectiveBiLSTM(trial):
    num_units = trial.suggest_categorical("num_units", [16, 32, 64, 96, 128])            # number of LSTM units
    dropout = trial.suggest_categorical("dropout", [0.2, 0.4, 0.6, 0.8])                 # dropout rate
    hidden_units = trial.suggest_categorical("hidden_units", [8, 16, 32, 64])            # number of units in dense layer
    learning_rate = trial.suggest_categorical("learning_rate", [1e-5, 1e-4, 1e-3, 1e-2]) # learning rate for Adam optimizer

    # BiLSTM model
    model = Sequential([
        Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=True), # embedding layer with pre-trained weights
        Bidirectional(LSTM(num_units, return_sequences=False, dropout=dropout, recurrent_dropout=0.0)),
        Dense(hidden_units, activation='relu'),
        Dropout(dropout),
        Dense(1, activation='sigmoid')
    ])

    # Compile model with Adam optimizer
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    # Train model with early stopping
    es = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=0)
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=50,
        batch_size=8,
        callbacks=[es], # early stopping
        verbose=0
    )

    # dummy_input = np.zeros((1, max_len), dtype='int32')  # input fittizio per inizializzare i layer
    # model(dummy_input)  # forza la costruzione del modello
    # model.summary()

    preds_val = (model.predict(X_val) > 0.5).astype(int)
    f1 = f1_score(y_val, preds_val)
    return f1

In [71]:
# ----------------------------
# Hyperparameter optimization
# ----------------------------

study = optuna.create_study(direction="maximize") # maximize F1-score
study.optimize(objectiveBiLSTM, n_trials=50) # 50 trials for demonstration

print("Best parameters:", study.best_params)

[I 2025-10-21 17:50:36,732] A new study created in memory with name: no-name-b3503c06-a6de-448f-bc4b-21ac122ee401


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


[I 2025-10-21 17:50:57,053] Trial 0 finished with value: 0.8372093023255814 and parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 0 with value: 0.8372093023255814.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


[I 2025-10-21 17:51:12,355] Trial 1 finished with value: 0.0 and parameters: {'num_units': 64, 'dropout': 0.6, 'hidden_units': 16, 'learning_rate': 0.01}. Best is trial 0 with value: 0.8372093023255814.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step


[I 2025-10-21 17:51:21,374] Trial 2 finished with value: 0.7469879518072289 and parameters: {'num_units': 96, 'dropout': 0.4, 'hidden_units': 64, 'learning_rate': 0.01}. Best is trial 0 with value: 0.8372093023255814.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step


[I 2025-10-21 17:53:09,470] Trial 3 finished with value: 0.044444444444444446 and parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 16, 'learning_rate': 1e-05}. Best is trial 0 with value: 0.8372093023255814.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step


[I 2025-10-21 17:53:26,993] Trial 4 finished with value: 0.9069767441860465 and parameters: {'num_units': 16, 'dropout': 0.8, 'hidden_units': 64, 'learning_rate': 0.01}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


[I 2025-10-21 17:53:40,161] Trial 5 finished with value: 0.8470588235294118 and parameters: {'num_units': 96, 'dropout': 0.2, 'hidden_units': 16, 'learning_rate': 0.001}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step


[I 2025-10-21 17:53:55,575] Trial 6 finished with value: 0.8888888888888888 and parameters: {'num_units': 128, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step


[I 2025-10-21 17:54:12,155] Trial 7 finished with value: 0.8333333333333334 and parameters: {'num_units': 64, 'dropout': 0.4, 'hidden_units': 8, 'learning_rate': 0.001}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step


[I 2025-10-21 17:56:01,903] Trial 8 finished with value: 0.0 and parameters: {'num_units': 128, 'dropout': 0.6, 'hidden_units': 16, 'learning_rate': 1e-05}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step


[I 2025-10-21 17:57:41,447] Trial 9 finished with value: 0.5084745762711864 and parameters: {'num_units': 32, 'dropout': 0.8, 'hidden_units': 64, 'learning_rate': 0.0001}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step


[I 2025-10-21 17:57:52,539] Trial 10 finished with value: 0.7222222222222222 and parameters: {'num_units': 16, 'dropout': 0.8, 'hidden_units': 64, 'learning_rate': 0.01}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step


[I 2025-10-21 17:58:58,942] Trial 11 finished with value: 0.7567567567567568 and parameters: {'num_units': 128, 'dropout': 0.8, 'hidden_units': 32, 'learning_rate': 0.0001}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step


[I 2025-10-21 17:59:08,097] Trial 12 finished with value: 0.5084745762711864 and parameters: {'num_units': 128, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.01}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step


[I 2025-10-21 17:59:17,177] Trial 13 finished with value: 0.12244897959183673 and parameters: {'num_units': 32, 'dropout': 0.8, 'hidden_units': 8, 'learning_rate': 0.001}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step


[I 2025-10-21 17:59:32,638] Trial 14 finished with value: 0.525 and parameters: {'num_units': 128, 'dropout': 0.8, 'hidden_units': 64, 'learning_rate': 0.01}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step


[I 2025-10-21 17:59:51,482] Trial 15 finished with value: 0.8863636363636364 and parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step


[I 2025-10-21 18:01:43,108] Trial 16 finished with value: 0.6153846153846154 and parameters: {'num_units': 128, 'dropout': 0.4, 'hidden_units': 64, 'learning_rate': 1e-05}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


[I 2025-10-21 18:02:18,123] Trial 17 finished with value: 0.8863636363636364 and parameters: {'num_units': 16, 'dropout': 0.2, 'hidden_units': 32, 'learning_rate': 0.0001}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step


[I 2025-10-21 18:02:31,447] Trial 18 finished with value: 0.0 and parameters: {'num_units': 32, 'dropout': 0.8, 'hidden_units': 8, 'learning_rate': 0.01}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


[I 2025-10-21 18:02:49,024] Trial 19 finished with value: 0.8181818181818182 and parameters: {'num_units': 96, 'dropout': 0.6, 'hidden_units': 64, 'learning_rate': 0.001}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


[I 2025-10-21 18:03:14,058] Trial 20 finished with value: 0.813953488372093 and parameters: {'num_units': 64, 'dropout': 0.8, 'hidden_units': 32, 'learning_rate': 0.01}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


[I 2025-10-21 18:03:33,631] Trial 21 finished with value: 0.8631578947368421 and parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 4 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


[I 2025-10-21 18:03:53,319] Trial 22 finished with value: 0.9213483146067416 and parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step


[I 2025-10-21 18:04:10,965] Trial 23 finished with value: 0.8505747126436781 and parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step


[I 2025-10-21 18:04:29,621] Trial 24 finished with value: 0.7848101265822784 and parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


[I 2025-10-21 18:04:44,944] Trial 25 finished with value: 0.8695652173913043 and parameters: {'num_units': 128, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step


[I 2025-10-21 18:06:34,774] Trial 26 finished with value: 0.0 and parameters: {'num_units': 16, 'dropout': 0.2, 'hidden_units': 64, 'learning_rate': 1e-05}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step


[I 2025-10-21 18:07:16,193] Trial 27 finished with value: 0.7901234567901234 and parameters: {'num_units': 16, 'dropout': 0.4, 'hidden_units': 8, 'learning_rate': 0.0001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step


[I 2025-10-21 18:07:30,563] Trial 28 finished with value: 0.8604651162790697 and parameters: {'num_units': 128, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step


[I 2025-10-21 18:07:54,813] Trial 29 finished with value: 0.8354430379746836 and parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


[I 2025-10-21 18:08:06,075] Trial 30 finished with value: 0.6666666666666666 and parameters: {'num_units': 32, 'dropout': 0.8, 'hidden_units': 64, 'learning_rate': 0.01}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step


[I 2025-10-21 18:08:25,674] Trial 31 finished with value: 0.8095238095238095 and parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


[I 2025-10-21 18:08:41,070] Trial 32 finished with value: 0.7777777777777778 and parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


[I 2025-10-21 18:09:01,562] Trial 33 finished with value: 0.8913043478260869 and parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step


[I 2025-10-21 18:09:19,156] Trial 34 finished with value: 0.9010989010989011 and parameters: {'num_units': 64, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step


[I 2025-10-21 18:09:30,193] Trial 35 finished with value: 0.5757575757575758 and parameters: {'num_units': 64, 'dropout': 0.6, 'hidden_units': 16, 'learning_rate': 0.01}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


[I 2025-10-21 18:09:47,256] Trial 36 finished with value: 0.8863636363636364 and parameters: {'num_units': 64, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


[I 2025-10-21 18:10:07,731] Trial 37 finished with value: 0.8541666666666666 and parameters: {'num_units': 64, 'dropout': 0.4, 'hidden_units': 16, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step


[I 2025-10-21 18:11:56,964] Trial 38 finished with value: 0.0 and parameters: {'num_units': 96, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 1e-05}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step


[I 2025-10-21 18:12:08,033] Trial 39 finished with value: 0.8292682926829268 and parameters: {'num_units': 64, 'dropout': 0.2, 'hidden_units': 64, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step


[I 2025-10-21 18:12:19,021] Trial 40 finished with value: 0.8051948051948052 and parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 16, 'learning_rate': 0.01}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step


[I 2025-10-21 18:12:35,624] Trial 41 finished with value: 0.8292682926829268 and parameters: {'num_units': 64, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


[I 2025-10-21 18:12:51,229] Trial 42 finished with value: 0.8636363636363636 and parameters: {'num_units': 96, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


[I 2025-10-21 18:13:09,180] Trial 43 finished with value: 0.8863636363636364 and parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step


[I 2025-10-21 18:14:07,507] Trial 44 finished with value: 0.37037037037037035 and parameters: {'num_units': 128, 'dropout': 0.8, 'hidden_units': 8, 'learning_rate': 0.0001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 90ms/step


[I 2025-10-21 18:14:29,836] Trial 45 finished with value: 0.8571428571428571 and parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step


[I 2025-10-21 18:14:45,311] Trial 46 finished with value: 0.8636363636363636 and parameters: {'num_units': 64, 'dropout': 0.4, 'hidden_units': 64, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step


[I 2025-10-21 18:16:35,900] Trial 47 finished with value: 0.0 and parameters: {'num_units': 128, 'dropout': 0.8, 'hidden_units': 32, 'learning_rate': 1e-05}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


[I 2025-10-21 18:16:47,013] Trial 48 finished with value: 0.4406779661016949 and parameters: {'num_units': 32, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.01}. Best is trial 22 with value: 0.9213483146067416.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


[I 2025-10-21 18:17:00,108] Trial 49 finished with value: 0.8333333333333334 and parameters: {'num_units': 16, 'dropout': 0.2, 'hidden_units': 64, 'learning_rate': 0.001}. Best is trial 22 with value: 0.9213483146067416.


Best parameters: {'num_units': 16, 'dropout': 0.6, 'hidden_units': 32, 'learning_rate': 0.001}


In [73]:
# -----------
# Final training and evaluation
# -----------

best_params = study.best_params
best_model = Sequential([
    Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=True),
    Bidirectional(LSTM(best_params["num_units"], return_sequences=False, dropout=best_params["dropout"], recurrent_dropout=0.0)),
    Dense(best_params["hidden_units"], activation='relu'),
    Dropout(best_params["dropout"]),
    Dense(1, activation='sigmoid')
])
best_model.compile(
    optimizer=Adam(learning_rate=best_params["learning_rate"]),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
best_model.fit(
    np.concatenate((X_train, X_val)), 
    np.concatenate((y_train, y_val)),
    epochs=50,
    batch_size=8,
    callbacks=[EarlyStopping(monitor='loss', patience=2, restore_best_weights=True, verbose=0)],
    verbose=0
)
y_pred = (best_model.predict(X_test) > 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Weighted F1-score:", f1_score(y_test, y_pred, average="weighted"))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.90      0.89        68
           1       0.83      0.81      0.82        43

    accuracy                           0.86       111
   macro avg       0.86      0.86      0.86       111
weighted avg       0.86      0.86      0.86       111

Confusion Matrix:
[[61  7]
 [ 8 35]]
Weighted F1-score: 0.8645631462290972


# **CNN-BERT (FakeBERT)**

In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
import optuna

In [5]:
# ----------------
# Load dataset
# ----------------

path_file = "datasets/Unipi_NDF/df_ndf.csv"
df = pd.read_csv(path_file, sep="\t", encoding="utf-8")
df.head()

texts = df["texts"].astype(str).tolist()
labels = df["labels"].values

In [10]:
# ------------------------------
# Preprocessing and tokenization
# ------------------------------

# split train/validation/test (0.6 / 0.2 / 0.2): stratify to maintain label distribution
X_train, X_temp, y_train, y_temp = train_test_split(texts, labels, test_size=0.4, stratify=labels, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
bert_model = TFBertModel.from_pretrained("bert-base-cased", from_pt=True)

max_len = 128 # maximum length for BERT inputs

def get_bert_embeddings(texts):
    """ Extract BERT [CLS] embeddings for a list of texts. """
    input_enc = tokenizer(
        texts, truncation=True, padding=True, max_length=max_len, return_tensors='tf'
    )
    outputs = bert_model(input_enc)

    # For each input, get the embeddings of all tokens
    token_embeddings = outputs.last_hidden_state  # shape: (N, max_len, 768)
    return token_embeddings.numpy()

# Extract BERT embeddings for train, validation, and test sets
X_train_emb = get_bert_embeddings(X_train)
X_val_emb = get_bert_embeddings(X_val)
X_test_emb = get_bert_embeddings(X_test)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [13]:
# ----------------------------
# Optuna objective function
# ----------------------------

def objectiveCNNBERT(trial):
    cnn_filters = trial.suggest_categorical("cnn_filters", [64, 96, 128])
    kernel_size = trial.suggest_categorical("kernel_size", [3, 4, 5])
    dense_units = trial.suggest_categorical("dense_units", [16, 32, 64])
    learning_rate = trial.suggest_categorical("learning_rate", [1e-5, 1e-4, 1e-3, 1e-2])

    model = Sequential([
        Input(shape=(max_len, X_train_emb.shape[2])),
        Conv1D(filters=cnn_filters, kernel_size=kernel_size, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(dense_units, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy', metrics=['accuracy'])

    es = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=0)
    model.fit(X_train_emb, y_train,
              validation_data=(X_val_emb, y_val),
              epochs=50,
              batch_size=8,
              callbacks=[es],
              verbose=0)

    preds_val = (model.predict(X_val_emb) > 0.5).astype(int)
    f1 = f1_score(y_val, preds_val)
    return f1

In [14]:
# ----------------------------
# Hyperparameter optimization
# ----------------------------

study = optuna.create_study(direction="maximize") # maximize F1-score
study.optimize(objectiveCNNBERT, n_trials=20) # 20 trials for demonstration

print("Best parameters:", study.best_params)

[I 2025-10-22 11:49:25,543] A new study created in memory with name: no-name-ace195d6-6aed-43c3-8ec8-6d1dc2a0bd09
2025-10-22 11:49:26.540116: I external/local_xla/xla/service/service.cc:163] XLA service 0x7980a4005340 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-10-22 11:49:26.540156: I external/local_xla/xla/service/service.cc:171]   StreamExecutor device (0): Tesla V100S-PCIE-32GB, Compute Capability 7.0
2025-10-22 11:49:26.582372: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-10-22 11:49:26.735743: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91002
I0000 00:00:1761126567.852105 2988730 device_compiler.h:196] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2025-10-22 11:49:28.702946: E external/local_xla/xla/stream_executor/cuda/cuda_time

[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 291ms/step

2025-10-22 11:49:39.538043: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:39.802049: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 344ms/step


[I 2025-10-22 11:49:40,163] Trial 0 finished with value: 0.775 and parameters: {'cnn_filters': 64, 'kernel_size': 5, 'dense_units': 64, 'learning_rate': 0.0001}. Best is trial 0 with value: 0.775.
2025-10-22 11:49:43.019069: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:43.289064: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:45.252198: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:45.523739: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay 

[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 306ms/step

2025-10-22 11:49:47.904452: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:48.178138: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 346ms/step


[I 2025-10-22 11:49:48,533] Trial 1 finished with value: 0.4383561643835616 and parameters: {'cnn_filters': 128, 'kernel_size': 5, 'dense_units': 32, 'learning_rate': 0.01}. Best is trial 0 with value: 0.775.
2025-10-22 11:49:51.383464: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:51.653657: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:53.580617: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:49:53.851587: E external/local_xla/xla/stream_executor/cuda/cuda_timer.c

[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 276ms/step

2025-10-22 11:49:59.986695: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:50:00.260549: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 340ms/step


[I 2025-10-22 11:50:00,600] Trial 2 finished with value: 0.9069767441860465 and parameters: {'cnn_filters': 128, 'kernel_size': 4, 'dense_units': 16, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 112ms/step


[I 2025-10-22 11:50:32,115] Trial 3 finished with value: 0.6571428571428571 and parameters: {'cnn_filters': 96, 'kernel_size': 3, 'dense_units': 64, 'learning_rate': 1e-05}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step


[I 2025-10-22 11:50:44,301] Trial 4 finished with value: 0.8 and parameters: {'cnn_filters': 64, 'kernel_size': 5, 'dense_units': 16, 'learning_rate': 0.0001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 113ms/step


[I 2025-10-22 11:51:15,898] Trial 5 finished with value: 0.676056338028169 and parameters: {'cnn_filters': 96, 'kernel_size': 4, 'dense_units': 16, 'learning_rate': 1e-05}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 94ms/step


[I 2025-10-22 11:51:45,585] Trial 6 finished with value: 0.7123287671232876 and parameters: {'cnn_filters': 96, 'kernel_size': 4, 'dense_units': 16, 'learning_rate': 1e-05}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 96ms/step


[I 2025-10-22 11:52:12,646] Trial 7 finished with value: 0.810126582278481 and parameters: {'cnn_filters': 128, 'kernel_size': 4, 'dense_units': 32, 'learning_rate': 1e-05}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 117ms/step


[I 2025-10-22 11:52:20,420] Trial 8 finished with value: 0.8735632183908046 and parameters: {'cnn_filters': 128, 'kernel_size': 3, 'dense_units': 64, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.
2025-10-22 11:52:23.219573: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:52:23.478125: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:52:25.319693: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:52:25.572827: E external/local_xla/xla/stream_executor/cud

[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 265ms/step

2025-10-22 11:52:53.704019: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.
2025-10-22 11:52:53.964516: E external/local_xla/xla/stream_executor/cuda/cuda_timer.cc:86] Delay kernel timed out: measured time has sub-optimal accuracy. There may be a missing warmup execution, please investigate in Nsight Systems.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 329ms/step


[I 2025-10-22 11:52:54,287] Trial 9 finished with value: 0.6470588235294118 and parameters: {'cnn_filters': 64, 'kernel_size': 4, 'dense_units': 16, 'learning_rate': 1e-05}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 95ms/step


[I 2025-10-22 11:53:01,165] Trial 10 finished with value: 0.5974025974025974 and parameters: {'cnn_filters': 128, 'kernel_size': 4, 'dense_units': 16, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 107ms/step


[I 2025-10-22 11:53:09,082] Trial 11 finished with value: 0.8764044943820225 and parameters: {'cnn_filters': 128, 'kernel_size': 3, 'dense_units': 64, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 99ms/step


[I 2025-10-22 11:53:17,162] Trial 12 finished with value: 0.8470588235294118 and parameters: {'cnn_filters': 128, 'kernel_size': 3, 'dense_units': 64, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step


[I 2025-10-22 11:53:24,624] Trial 13 finished with value: 0.8627450980392157 and parameters: {'cnn_filters': 128, 'kernel_size': 3, 'dense_units': 64, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 93ms/step


[I 2025-10-22 11:53:33,163] Trial 14 finished with value: 0.8354430379746836 and parameters: {'cnn_filters': 128, 'kernel_size': 3, 'dense_units': 32, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 93ms/step


[I 2025-10-22 11:53:42,261] Trial 15 finished with value: 0.0 and parameters: {'cnn_filters': 128, 'kernel_size': 4, 'dense_units': 16, 'learning_rate': 0.01}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 96ms/step


[I 2025-10-22 11:53:50,842] Trial 16 finished with value: 0.8433734939759037 and parameters: {'cnn_filters': 128, 'kernel_size': 3, 'dense_units': 64, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 97ms/step


[I 2025-10-22 11:53:57,673] Trial 17 finished with value: 0.8157894736842105 and parameters: {'cnn_filters': 128, 'kernel_size': 4, 'dense_units': 64, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 97ms/step


[I 2025-10-22 11:54:07,377] Trial 18 finished with value: 0.8571428571428571 and parameters: {'cnn_filters': 96, 'kernel_size': 3, 'dense_units': 16, 'learning_rate': 0.001}. Best is trial 2 with value: 0.9069767441860465.


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step


[I 2025-10-22 11:54:17,157] Trial 19 finished with value: 0.7297297297297297 and parameters: {'cnn_filters': 64, 'kernel_size': 5, 'dense_units': 32, 'learning_rate': 0.0001}. Best is trial 2 with value: 0.9069767441860465.


Best parameters: {'cnn_filters': 128, 'kernel_size': 4, 'dense_units': 16, 'learning_rate': 0.001}


In [17]:
# -----------
# Final training and evaluation
# -----------

best_params = study.best_params
best_model = Sequential([
    Input(shape=(max_len, X_train_emb.shape[2])),
    Conv1D(filters=best_params["cnn_filters"], kernel_size=best_params["kernel_size"], activation='relu'),
    GlobalMaxPooling1D(),
    Dense(best_params["dense_units"], activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])
best_model.compile(
    optimizer=Adam(learning_rate=best_params["learning_rate"]),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
best_model.fit(
    np.concatenate((X_train_emb, X_val_emb)), 
    np.concatenate((y_train, y_val)),
    epochs=50,
    batch_size=8,
    callbacks=[EarlyStopping(monitor='loss', patience=2, restore_best_weights=True, verbose=0)],
    verbose=0
)

y_pred = (best_model.predict(X_test_emb) > 0.5).astype(int)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Weighted F1-score:", f1_score(y_test, y_pred, average="weighted"))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 97ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95        68
           1       0.95      0.88      0.92        43

    accuracy                           0.94       111
   macro avg       0.94      0.93      0.93       111
weighted avg       0.94      0.94      0.94       111

Confusion Matrix:
[[66  2]
 [ 5 38]]
Weighted F1-score: 0.9364777796779183


# **BERT**

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import optuna
import gc

  from .autonotebook import tqdm as notebook_tqdm
2025-10-22 15:48:37.435721: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-22 15:48:37.493556: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-22 15:48:38.771766: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [None]:
# ----------------
# Load dataset
# ----------------

path_file = "datasets/Unipi_NDF/df_ndf.csv"
df = pd.read_csv(path_file, sep="\t", encoding="utf-8")
df.head()

texts = df["texts"].astype(str).tolist()
labels = df["labels"].values

In [4]:
# ----------------
# Load dataset
# ----------------

path_file = "datasets/Unipi_NDF/df_ndf.csv"
df = pd.read_csv(path_file, sep="\t", encoding="utf-8")
df.head()

texts = df["texts"].astype(str).tolist()
labels = df["labels"].values

In [5]:
# ------------------------------
# Preprocessing and tokenization
# ------------------------------

# split train/validation/test (0.6 / 0.2 / 0.2): stratify to maintain label distribution
X_train, X_temp, y_train, y_temp = train_test_split(texts, labels, test_size=0.4, stratify=labels, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)
y_train = y_train.astype('int64')
y_val = y_val.astype('int64')
y_test = y_test.astype('int64')
print("Train labels:", set(y_train))
print("Val labels:", set(y_val))
print("Test labels:", set(y_test))


max_len = 128

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

def tokenize_texts(texts):
    """ Tokenize a list of texts using BERT tokenizer. """
    return tokenizer(
        texts, truncation=True, padding=True, max_length=max_len
    )

train_enc = tokenize_texts(X_train)
val_enc = tokenize_texts(X_val)
test_enc = tokenize_texts(X_test)

train_enc = Dataset.from_dict({
    'input_ids': train_enc['input_ids'],
    'attention_mask': train_enc['attention_mask'],
    'labels': y_train
})

val_enc = Dataset.from_dict({
    'input_ids': val_enc['input_ids'],
    'attention_mask': val_enc['attention_mask'],
    'labels': y_val
})

test_enc = Dataset.from_dict({
    'input_ids': test_enc['input_ids'],
    'attention_mask': test_enc['attention_mask'],
    'labels': y_test
})

Train labels: {0, 1}
Val labels: {0, 1}
Test labels: {0, 1}


In [6]:
# ----------------------------
# Evaluation metrics
# ----------------------------

def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    f1 = f1_score(pred.label_ids, preds)
    return {"f1": f1}

In [13]:
# ----------------------------
# Optuna objective function
# ----------------------------

def objectiveBERT(trial):
    learning_rate = trial.suggest_categorical("learning_rate", [4e-5, 2e-5, 3e-2])
    weight_decay = trial.suggest_categorical("weight_decay", [0.001, 0.01, 0.1, None])

    model = BertForSequenceClassification.from_pretrained(
            "bert-base-cased",
            num_labels=2
        )
    model.to(device)
    
    training_args = TrainingArguments(
        output_dir=f"./bert_fake_news_{trial.number}",
        eval_strategy="epoch", # Changed from evaluation_strategy
        save_strategy="epoch",
        learning_rate=learning_rate,
        weight_decay=weight_decay if weight_decay is not None else 0.0,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        logging_dir=f"./logs_bert_fake_news_{trial.number}",
        logging_strategy="epoch",
        save_total_limit=2,
        disable_tqdm=True,
        report_to="none",
        remove_unused_columns=False,
        fp16=False,                     # mixed precision training (meno memoria, più veloce)
        gradient_accumulation_steps=2, # riduce batch effettivo in GPU
        dataloader_num_workers=2,      # numero di worker per il caricamento dei dati
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_enc,
        eval_dataset=val_enc,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()
    metrics = trainer.evaluate()

    # Libera memoria GPU
    del trainer
    del model
    torch.cuda.empty_cache()
    gc.collect()

    return metrics["eval_f1"]

In [14]:
batch = {k: torch.tensor(v[:2]) for k,v in train_enc[:2].items()}
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
outputs = model(**batch)
print("Forward pass OK")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Forward pass OK


In [15]:
# ----------------------------
# Hyperparameter optimization
# ----------------------------

study = optuna.create_study(direction="maximize") # maximize F1-score
study.optimize(objectiveBERT, n_trials=5) # 5 trials for demonstration; increase for better results

print("Best parameters:", study.best_params)

[I 2025-10-22 15:49:41,903] A new study created in memory with name: no-name-ca733b52-b840-4860-8a5f-3e505e6b0e1a
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[W 2025-10-22 15:49:42,250] Trial 0 failed with parameters: {'learning_rate': 0.03, 'weight_decay': None} because of the following error: AcceleratorError('CUDA error: device-side assert triggered\nCUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.\nFor debugging consider passing CUDA_LAUNCH_BLOCKING=1\nCompile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n').
Traceback (most recent call last):
  File "/home/n.emmolo/miniconda3/envs/env/lib/python3.10/site-packages/optuna/study/_optimize.py", line 201, in 

AcceleratorError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# ----------------------------
# Final training and evaluation
# ----------------------------

best_params = study.best_params
best_model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2
)
best_training_args = TrainingArguments(
    output_dir="./bert_fake_news_final",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=best_params["learning_rate"],
    weight_decay=best_params["weight_decay"] if best_params["weight_decay"] is not None else 0.0,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs_bert_fake_news_final",
    logging_strategy="epoch",
    save_total_limit=2,
    fp16=False,
    gradient_accumulation_steps=2,
    dataloader_num_workers=2,
)
best_trainer = Trainer(
    model=best_model,
    args=best_training_args,
    train_dataset=train_enc,
    eval_dataset=val_enc,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)
best_trainer.train()

y_pred = np.argmax(best_trainer.predict(test_enc).predictions, axis=1)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Weighted F1-score:", f1_score(y_test, y_pred, average="weighted"))

# **DeBERTa**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
    DebertaTokenizer,
    DebertaForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# ----------------
# Load dataset
# ----------------

path_file = "/content/df_ndf.csv"
df = pd.read_csv(path_file, sep="\t", encoding="utf-8")
df.head()

texts = df["texts"].astype(str).tolist()
labels = df["labels"].values



# Codifica etichette
encoder = LabelEncoder()
df["labels"] = encoder.fit_transform(df["labels"])

# Split train/test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["texts"].tolist(),
    df["labels"].tolist(),
    test_size=0.2,
    stratify=df["labels"],
    random_state=42,
)

In [None]:
# -----------------------------
# Tokenizzazione con DeBERTa
# -----------------------------
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

max_len = 256  # coerente con fine-tuning BERT nel paper
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_len)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=max_len)


In [None]:
# -----------------------------------
# Conversione in Dataset Hugging Face
# -----------------------------------
train_dataset = Dataset.from_dict(
    {
        "input_ids": train_encodings["input_ids"],
        "attention_mask": train_encodings["attention_mask"],
        "labels": train_labels,
    }
)

test_dataset = Dataset.from_dict(
    {
        "input_ids": test_encodings["input_ids"],
        "attention_mask": test_encodings["attention_mask"],
        "labels": test_labels,
    }
)

In [None]:
# ---------------------------
# Caricamento modello DeBERTa
# ---------------------------
model = DebertaForSequenceClassification.from_pretrained(
    "microsoft/deberta-base",
    num_labels=2  # classificazione binaria
)

In [None]:
# -----------------------
# Metriche di valutazione
# -----------------------
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted"
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [None]:
# --------------------------------------
# Impostazioni di training (Trainer API)
# --------------------------------------
training_args = TrainingArguments(
    output_dir="./deberta_fake_news",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=2,
)

In [None]:
# -------------------------
# Addestramento con Trainer
# -------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# -----------
# Valutazione
# -----------
metrics = trainer.evaluate()
print("\n=== Risultati Finali (DeBERTa) ===")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")