# Import libraries

In [None]:
from datasets import DatasetDict
from google.colab import drive
import gc
import json
import librosa
from math import ceil
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, cohen_kappa_score, confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
import seaborn as sns
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import Wav2Vec2Processor, HubertModel
from transformers import TrainerCallback
from tqdm import tqdm
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, AutoTokenizer, AutoModelForSequenceClassification, AutoModel,get_cosine_schedule_with_warmup
import zipfile

In [None]:
# Mount Google Drive under /content/drive (uses the Colab-only `drive` helper imported above).
drive.mount('/content/drive')

# Import Audio Files

In [None]:
# Create the local audio folder and extract both archives into it.
# unzip flags: -j discards the directory structure stored in the archive,
# -o overwrites existing files without prompting, -d sets the destination folder.
# NOTE(review): the quoted arguments look like placeholders — replace them with the real zip paths.
!mkdir -p segments_all
!unzip -j -o "Add satire audio path" -d segments_all
!unzip -j -o "Add non satire audio path" -d segments_all

# Load data

In [None]:
def load_and_prepare_data(split_path: str, audio_dir: str = "segments_all"):
    """
    Loads train/validation/test CSV files from the given split path,
    maps string labels to integer IDs, and adds a 'path' column with audio file paths.

    The label vocabulary is built from the training split only (sorted, so the
    mapping is deterministic). Labels of the validation/test splits that do not
    occur in the training split are mapped to NaN by `Series.map`.

    Each CSV must provide the columns 'label', 'id' and 'segment_id'.

    Args:
        split_path (str): Path to the folder containing train.csv, validation.csv, and test.csv.
        audio_dir (str): Name of the directory containing the audio files (default: 'segments_all').

    Returns:
        tuple: (df_train, df_val, df_test, label2id, id2label, labels)
    """
    # Read the three splits in a fixed order: train, validation, test
    frames = [
        pd.read_csv(os.path.join(split_path, f"{split_name}.csv"))
        for split_name in ("train", "validation", "test")
    ]
    df_train, df_val, df_test = frames

    # Create label <-> id mappings from the training labels
    labels = sorted(df_train["label"].unique().tolist())
    id2label = dict(enumerate(labels))
    label2id = {label: idx for idx, label in id2label.items()}

    print("id2label:", id2label)
    print("label2id:", label2id)

    # Map labels and build audio file paths
    for df in frames:
        df["label"] = df["label"].map(label2id)
        # Iterate the two columns directly instead of `df.apply(axis=1)`:
        # a row-wise apply returns a DataFrame for an empty split (the column
        # assignment then fails) and upcasts integer ids to float when every
        # column of the frame is numeric.
        df["path"] = [
            f"{audio_dir}/{item_id}-{int(segment_id)}.mp3"
            for item_id, segment_id in zip(df["id"], df["segment_id"])
        ]

    return df_train, df_val, df_test, label2id, id2label, labels

# Multimodal dataset

In [None]:
from datasets import Dataset

def lazy_dataset_generator(df, tokenizer, processor, train=True):
    """
    Lazily turn each row of `df` into one multimodal training example.

    For every row the transcription is tokenized (padded/truncated to 76 tokens)
    and the audio file at row["path"] is loaded at 16 kHz, cut to at most
    10 seconds and padded by the processor to exactly 10 seconds.

    Args:
        df: DataFrame with the columns 'transcription', 'path' and 'label'.
        tokenizer: Callable text tokenizer returning 'input_ids' and 'attention_mask'.
        processor: Callable audio processor returning `input_values` and `attention_mask`.
        train (bool): Accepted for call-site symmetry; not used by this function.

    Yields:
        dict: 'input_ids', 'attention_mask', 'input_values', 'audio_attention_mask'
              (each with the leading batch dimension squeezed away) and the integer 'labels'.
    """
    sample_rate = 16000
    clip_samples = 16000 * 10  # 10 seconds at 16 kHz

    for _, record in df.iterrows():
        # Text branch
        encoded_text = tokenizer(
            record["transcription"],
            padding="max_length",
            truncation=True,
            max_length=76,
            return_tensors="pt",
        )

        # Audio branch: load, truncate, then let the processor pad to a fixed length
        waveform, _sr = librosa.load(record["path"], sr=sample_rate)
        waveform = waveform[:clip_samples]
        encoded_audio = processor(
            waveform,
            sampling_rate=sample_rate,
            padding="max_length",
            max_length=clip_samples,
            return_tensors="pt",
        )

        example = {
            "input_ids": encoded_text["input_ids"].squeeze(0),
            "attention_mask": encoded_text["attention_mask"].squeeze(0),
            "input_values": encoded_audio.input_values.squeeze(0),
            "audio_attention_mask": encoded_audio.attention_mask.squeeze(0),
            "labels": int(record["label"]),
        }
        yield example

# Metrics

In [None]:
def compute_metrics(eval_pred):
    """
    Compute accuracy and weighted F1 from an evaluation prediction pair.

    Args:
        eval_pred: Pair `(logits, labels)`; predictions are the arg-max of the
            logits over the last axis.

    Returns:
        dict: {"accuracy": ..., "f1": ...}
    """
    scores, gold = eval_pred
    predicted = scores.argmax(-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1": f1_score(gold, predicted, average="weighted"),
    }

# Models

In [None]:
class HuBERTEncoder(nn.Module):
    """
    Thin wrapper around a pretrained audio backbone loaded with `AutoModel`.

    All backbone parameters are marked trainable. `output_dim` exposes the
    backbone's hidden size so downstream layers can be sized accordingly.
    """

    def __init__(self, model_name="facebook/hubert-base-ls960"):
        super().__init__()
        from transformers import AutoModel

        backbone = AutoModel.from_pretrained(model_name)
        self.hubert = backbone

        # Fine-tune the whole backbone
        for weight in backbone.parameters():
            weight.requires_grad = True

        self.output_dim = backbone.config.hidden_size

    def forward(self, input_values, attention_mask=None):
        """Return the backbone's last hidden state, i.e. [B, T_audio, hidden]."""
        encoded = self.hubert(input_values=input_values, attention_mask=attention_mask)
        return encoded.last_hidden_state

In [None]:
class GatedFusion(nn.Module):
    """
    Fuse a text vector and an audio vector with a learned scalar gate.

    The gate is a small MLP over the concatenated vectors ending in a sigmoid,
    so it produces one coefficient in [0, 1] per example.
    """

    def __init__(self, dim):
        super().__init__()
        gate_layers = [
            nn.Linear(dim * 2, dim),
            nn.ReLU(),
            nn.Linear(dim, 1),
            nn.Sigmoid(),
        ]
        self.gate = nn.Sequential(*gate_layers)

    def forward(self, text_vec, audio_vec):
        """Return `(fused, alpha)` where fused = alpha * text + (1 - alpha) * audio."""
        # Gate coefficient computed from both modalities at once
        alpha = self.gate(torch.cat((text_vec, audio_vec), dim=-1))

        # Convex combination of the two modalities
        text_part = alpha * text_vec
        audio_part = (1 - alpha) * audio_vec
        return text_part + audio_part, alpha

In [None]:
class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding added to a batch-first sequence.

    Even feature indices hold sin(position * freq), odd indices hold
    cos(position * freq). The table is precomputed for `max_len` positions and
    stored as the non-trainable buffer `pe` of shape [1, max_len, d_model].

    Args:
        d_model (int): Feature dimension of the sequence (odd values are supported).
        max_len (int): Largest sequence length that can be encoded.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd index than even index, so the
        # last frequency is dropped for the cosine half instead of crashing.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # [1, max_len, d_model]
        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        Add the positional table to `x`.

        Args:
            x: Tensor of shape [B, T, D] with D == d_model.

        Returns:
            Tensor of the same shape as `x`.

        Raises:
            ValueError: If T exceeds the `max_len` the table was built for.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds the positional encoding max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len, :]

In [None]:
class MultiModalSatireClassifier(nn.Module):
    """
    Text + audio classifier with cross-modal attention and gated fusion.

    Pipeline (see `forward`):
        1. Encode text with a pretrained `AutoModel` and audio with `HuBERTEncoder`.
        2. Project both sequences to `hidden_dim` with a 1D convolution and add
           sinusoidal positional encodings.
        3. Let text attend to audio and audio attend to text (residual + LayerNorm).
        4. Mask-aware mean pooling of each sequence into one vector.
        5. Self-attention over the two pooled vectors, then an element-wise gate
           mixes them into a single fused vector.
        6. An MLP head produces the class logits.

    Args:
        text_model_name (str): Name or path passed to `AutoModel.from_pretrained`.
        fine_tune_layers (int): Accepted for interface compatibility; not used
            by this implementation.
        hidden_dim (int): Shared hidden size after projection.
        num_classes (int): Number of output classes.
        num_heads (int): Number of heads of every attention layer.
    """

    def __init__(self, text_model_name, fine_tune_layers=6, hidden_dim=256, num_classes=2, num_heads=8):
        super().__init__()
        # TEXT ENCODER
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        text_hidden = self.text_encoder.config.hidden_size

        # AUDIO ENCODER
        self.audio_encoder = HuBERTEncoder()
        audio_hidden = self.audio_encoder.output_dim

        # 1D projection (kernel 3, padding 1 keeps the sequence length unchanged)
        self.text_proj = nn.Conv1d(text_hidden, hidden_dim, kernel_size=3, padding=1)
        self.audio_proj = nn.Conv1d(audio_hidden, hidden_dim, kernel_size=3, padding=1)

        # Positional encoding
        self.text_pos = PositionalEncoding(hidden_dim)
        self.audio_pos = PositionalEncoding(hidden_dim)

        # Cross-modal attention
        self.cross_attn_text_to_audio = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=num_heads, batch_first=True)
        self.cross_attn_audio_to_text = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=num_heads, batch_first=True)
        self.text_ln = nn.LayerNorm(hidden_dim)
        self.audio_ln = nn.LayerNorm(hidden_dim)

        # Global inter-modal attention
        self.global_attn = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=num_heads, batch_first=True)
        self.global_ln = nn.LayerNorm(hidden_dim)

        # Gated Fusion Layer (one gate value per hidden feature)
        self.gate = nn.Linear(hidden_dim * 2, hidden_dim)

        # Classification Head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.GELU(),
            nn.LayerNorm(hidden_dim),
            nn.Dropout(0.4),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.GELU(),
            nn.LayerNorm(hidden_dim // 2),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim // 2, num_classes)
        )

        self.loss_fn = nn.CrossEntropyLoss()

    @staticmethod
    def _audio_key_padding_mask(audio_attention_mask, num_frames):
        """
        Downsample a sample-level audio mask to frame level as a key-padding mask.

        Args:
            audio_attention_mask: [B, T_samples] mask, non-zero/True for real samples
                (assumed to follow the same convention as the text mask — confirm
                against the audio processor in use).
            num_frames (int): Number of frames produced by the audio encoder.

        Returns:
            Bool tensor [B, num_frames], True where a frame is padding.
        """
        # Convert to float BEFORE any negation: `~` on an integer mask is a bitwise
        # NOT (1 -> -2, 0 -> -1), which makes every position look "valid".
        valid = audio_attention_mask.float().unsqueeze(1)  # [B, 1, T_samples]
        valid = F.interpolate(valid, size=num_frames, mode='nearest').squeeze(1) > 0.5
        padded = ~valid
        # A row with no valid frame at all would make the attention softmax produce
        # NaN; fall back to attending to every frame for such rows.
        padded = padded & ~padded.all(dim=1, keepdim=True)
        return padded

    def forward(self, input_ids, attention_mask, input_values, audio_attention_mask, labels=None):
        """
        Run the multimodal classifier.

        Args:
            input_ids: Token ids, [B, T_text].
            attention_mask: Text mask, [B, T_text]; 0 marks padding.
            input_values: Raw audio input for the audio encoder, [B, T_samples].
            audio_attention_mask: Sample-level audio mask [B, T_samples] or None
                (None disables audio masking).
            labels: Optional class indices [B]; when given the cross-entropy loss
                is returned as well.

        Returns:
            dict: {"logits": [B, num_classes]} plus "loss" when `labels` is given.
        """
        # TEXT
        text_out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_feats = text_out.last_hidden_state                              # [B, T_text, H]
        # Conv1d expects channels first, so transpose around the projection
        text_feats = self.text_proj(text_feats.transpose(1, 2)).transpose(1, 2)  # [B, T_text, hidden_dim]
        text_feats = self.text_pos(text_feats)                               # Positional encoding

        # AUDIO
        audio_feats = self.audio_encoder(input_values)                       # [B, T_audio, H_audio]
        audio_feats = self.audio_proj(audio_feats.transpose(1, 2)).transpose(1, 2)  # [B, T_audio, hidden_dim]
        audio_feats = self.audio_pos(audio_feats)                            # Positional encoding

        # Key-padding masks (True = PAD)
        text_kpm = attention_mask == 0
        if audio_attention_mask is not None:
            audio_kpm = self._audio_key_padding_mask(audio_attention_mask, audio_feats.size(1))
        else:
            audio_kpm = None

        # Cross-modal attention
        text_attn_out, _ = self.cross_attn_text_to_audio(
            query=text_feats, key=audio_feats, value=audio_feats, key_padding_mask=audio_kpm
        )
        audio_attn_out, _ = self.cross_attn_audio_to_text(
            query=audio_feats, key=text_feats, value=text_feats, key_padding_mask=text_kpm
        )
        text_feats = self.text_ln(text_feats + text_attn_out)
        audio_feats = self.audio_ln(audio_feats + audio_attn_out)

        # Pooling: mean over the non-padded positions only
        text_mask = ~text_kpm
        if audio_kpm is not None:
            audio_mask = ~audio_kpm
        else:
            audio_mask = torch.ones(audio_feats.size()[:2], dtype=torch.bool, device=audio_feats.device)

        text_vec = (text_feats * text_mask.unsqueeze(-1)).sum(dim=1) / text_mask.sum(dim=1, keepdim=True)
        audio_vec = (audio_feats * audio_mask.unsqueeze(-1)).sum(dim=1) / audio_mask.sum(dim=1, keepdim=True)

        # Dropout
        text_vec = F.dropout(text_vec, p=0.2, training=self.training)
        audio_vec = F.dropout(audio_vec, p=0.2, training=self.training)

        # Global attention over the two modality vectors
        global_input = torch.cat([text_vec.unsqueeze(1), audio_vec.unsqueeze(1)], dim=1)  # [B,2,H]
        global_attn_out, _ = self.global_attn(global_input, global_input, global_input)
        global_attn_out = self.global_ln(global_attn_out)
        global_attn_out = F.dropout(global_attn_out, p=0.2, training=self.training)

        text_vec = global_attn_out[:, 0, :]
        audio_vec = global_attn_out[:, 1, :]

        # Gated fusion
        fusion_input = torch.cat([text_vec, audio_vec], dim=-1)
        alpha = torch.sigmoid(self.gate(fusion_input))
        fused_vec = alpha * text_vec + (1 - alpha) * audio_vec

        # Classification
        logits = self.classifier(fused_vec)

        if labels is not None:
            loss = self.loss_fn(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}


# Training

In [None]:
class EpochLoggerCallback(TrainerCallback):
    """
    Print a marker each time `steps_per_epoch` training steps have completed.

    The callback keeps its own step counter, incremented on every `on_step_end`
    call, and derives the epoch number from it.
    """

    def __init__(self, steps_per_epoch):
        self.steps_per_epoch = steps_per_epoch
        self.current_step = 0

    def on_step_end(self, args, state, control, **kwargs):
        self.current_step += 1
        finished_epochs, leftover = divmod(self.current_step, self.steps_per_epoch)
        if leftover != 0:
            return
        print(f" End epoch {finished_epochs} ")

In [None]:
def train_hf_model(model, datasets, max_steps, steps_per_epoch, output_dir='./results',
                   num_labels=2, train_batch_size=16, eval_batch_size=16,
                   num_train_epochs=4, learning_rate=2e-5,
                   eval_strategy="epoch", save_strategy="epoch",
                   metric_for_best_model="eval_f1", greater_is_better=True,
                   compute_metrics=compute_metrics):
    """
    Generic function to train a model using the Hugging Face Trainer API.

    Args:
        model: Model to train; its forward must accept the dataset's fields and
            return a dict containing "loss" and "logits".
        datasets (dict): Dictionary containing 'train' and 'val' datasets.
        max_steps (int): Maximum number of training steps. Per the TrainingArguments
            documentation, a positive value overrides `num_train_epochs`.
        steps_per_epoch (int): Number of steps per epoch (used only for epoch logging).
        output_dir (str): Output directory for checkpoints and results.
        num_labels (int): Number of target classes. Accepted for interface
            compatibility; not used by this function.
        train_batch_size (int): Training batch size per device.
        eval_batch_size (int): Evaluation batch size per device.
        num_train_epochs (int): Number of training epochs.
        learning_rate (float): Learning rate.
        eval_strategy (str): Evaluation strategy ("steps" or "epoch").
        save_strategy (str): Checkpoint saving strategy ("steps" or "epoch").
        metric_for_best_model (str): Metric used to select the best model.
        greater_is_better (bool): Whether higher metric values indicate better performance.
        compute_metrics (callable): Function used to compute evaluation metrics.

    Returns:
        trainer: Trained Hugging Face Trainer object.
        eval_results (dict): Dictionary containing evaluation results.
    """

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        eval_strategy=eval_strategy,
        save_strategy=save_strategy,
        load_best_model_at_end=True,
        metric_for_best_model=metric_for_best_model,
        greater_is_better=greater_is_better,
        max_steps=max_steps,
        # Mixed precision needs a GPU; the rest of the pipeline falls back to CPU,
        # so only request fp16 when CUDA is actually available.
        fp16=torch.cuda.is_available(),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=datasets['train'],
        eval_dataset=datasets['val'],
        compute_metrics=compute_metrics,
        callbacks=[EpochLoggerCallback(steps_per_epoch)]
    )

    # Start model training
    trainer.train()

    # Evaluate the best model on the validation set
    # (load_best_model_at_end=True restores it once training finishes)
    eval_results = trainer.evaluate()

    print("Training completed successfully!")
    print(f"Evaluation results: {eval_results}")

    return trainer, eval_results


# Test evaluation

In [None]:
def test_model(model, dataset_test, class_names=None, batch_size=32, average='macro'):
    """
    Runs model inference on the test dataset and computes evaluation metrics.

    Args:
        model: Trained PyTorch / Hugging Face model whose output dict contains "logits".
        dataset_test: Dataset yielding 'input_ids', 'attention_mask', 'input_values',
            'audio_attention_mask' and 'labels'.
        class_names (list, optional): List of class names (e.g. ['No Satire', 'Satire']),
            ordered by class id. When given, the confusion matrix and the
            classification report always cover every listed class, even if one
            of them does not occur in the evaluated data.
        batch_size (int): Batch size for inference.
        average (str): Averaging strategy for F1, precision and recall
                       ('macro', 'weighted', 'micro', etc.).

    Returns:
        dict: Dictionary containing evaluation metrics, predictions and auxiliary information.

    Raises:
        ValueError: If `dataset_test` yields no batches.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    dataloader_test = DataLoader(
        dataset_test,
        batch_size=batch_size,
        drop_last=False,
        num_workers=2,
        # Pinned memory only helps host-to-GPU copies
        pin_memory=device.type == 'cuda'
    )

    all_logits = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader_test:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            input_values = batch['input_values'].to(device)
            audio_attention_mask = batch['audio_attention_mask'].to(device)
            labels = batch['labels']

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                input_values=input_values,
                audio_attention_mask=audio_attention_mask
            )

            # Extract logits from the model output
            logits = outputs["logits"]

            all_logits.append(logits.cpu())
            all_labels.append(labels.cpu())

    if not all_logits:
        raise ValueError("dataset_test produced no batches; nothing to evaluate")

    # Concatenate logits and labels from all batches
    all_logits = torch.cat(all_logits, dim=0)
    true_labels = torch.cat(all_labels, dim=0).numpy()

    # Compute class probabilities and predictions
    probabilities = F.softmax(all_logits, dim=1)
    predicted_labels = torch.argmax(probabilities, dim=1).numpy()

    # With explicit class names, fix the label set so that the confusion matrix
    # rows and the report lines stay aligned with `class_names` even when a
    # class is absent from both the true and the predicted labels.
    label_ids = list(range(len(class_names))) if class_names is not None else None

    # Compute evaluation metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average=average)
    precision = precision_score(true_labels, predicted_labels, average=average)
    recall = recall_score(true_labels, predicted_labels, average=average)
    kappa = cohen_kappa_score(true_labels, predicted_labels)
    cm = confusion_matrix(true_labels, predicted_labels, labels=label_ids)

    if class_names is None:
        class_names = [str(i) for i in range(cm.shape[0])]

    # Row sums of the confusion matrix = number of true examples per class
    num_per_class = dict(zip(class_names, np.sum(cm, axis=1)))

    # Print evaluation results
    print("\nTest Results")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score ({average}): {f1:.4f}")
    print(f"Precision ({average}): {precision:.4f}")
    print(f"Recall ({average}): {recall:.4f}")
    print(f"Cohen Kappa: {kappa:.4f}")
    print("\nClassification Report:")
    print(classification_report(true_labels, predicted_labels, labels=label_ids, target_names=class_names))

    # Plot confusion matrix
    plt.figure(figsize=(6, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names
    )
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.show()

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "cohen_kappa": kappa,
        "true_labels": true_labels,
        "predicted_labels": predicted_labels,
        "probabilities": probabilities.numpy(),
        "confusion_matrix": cm,
        "num_per_class": num_per_class,
        "class_names": class_names
    }


# Save results

In [None]:
def save_test_results(test_results, save_path):
    """
    Saves test results to a JSON file and numerical arrays to .npy files.

    Entries whose key is one of the array keys ('true_labels', 'predicted_labels',
    'probabilities', 'confusion_matrix', 'num_per_class') are written with
    `np.save` as `<key>.npy`; every other entry goes into `test_results.json`.
    NumPy scalars and arrays are converted to plain Python values, also when
    they are nested inside dicts, lists, tuples or sets.

    Args:
        test_results (dict): Results dictionary, e.g. as returned by `test_model`.
        save_path (str): Output directory; created if it does not exist.
    """

    os.makedirs(save_path, exist_ok=True)

    array_keys = [
        'true_labels',
        'predicted_labels',
        'probabilities',
        'confusion_matrix',
        'num_per_class'
    ]

    def make_serializable(obj):
        # Recursively replace NumPy types with JSON-friendly Python equivalents
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return make_serializable(obj.tolist())
        if isinstance(obj, dict):
            # NumPy scalar keys are not accepted by json.dump either
            return {
                (key.item() if isinstance(key, np.generic) else key): make_serializable(value)
                for key, value in obj.items()
            }
        if isinstance(obj, (list, tuple, set)):
            return [make_serializable(item) for item in obj]
        return obj

    json_results = {k: make_serializable(v) for k, v in test_results.items() if k not in array_keys}

    json_file = os.path.join(save_path, 'test_results.json')
    with open(json_file, 'w') as f:
        json.dump(json_results, f, indent=4)

    for key in array_keys:
        if key in test_results:
            np.save(os.path.join(save_path, f"{key}.npy"), test_results[key])

    print(f"Results saved in: {save_path}")

# Main

In [None]:
from datasets import Dataset, DatasetDict
from datasets import IterableDataset

# Global variables
base_path = "Add cross validation's path"
base_save_path = 'Add saving path'
num_splits = 10
batch_size = 16
model_name = 'dbmdz/bert-base-italian-cased'
num_labels = 2
num_train_epochs = 4

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the processor comes from "facebook/hubert-large-ls960-ft" while
# HuBERTEncoder defaults to "facebook/hubert-base-ls960" — confirm that this
# preprocessing (normalization / attention mask) matches the base encoder.
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")

# Train and evaluate one model per cross-validation fold ("Cross 1" ... "Cross N")
for i in range(1, num_splits + 1):
    print(f"\n Processing Cross {i} ")
    split_path = os.path.join(base_path, f'Cross {i}')
    df_train, df_val, df_test, label2id, id2label, labels = load_and_prepare_data(split_path)

    # The iterable datasets have no length, so the training horizon is given
    # to the Trainer explicitly as a number of optimizer steps.
    num_train_examples = len(df_train)
    steps_per_epoch = ceil(len(df_train) / batch_size)
    max_steps = steps_per_epoch * num_train_epochs

    # Lazy generation of datasets (audio is loaded only when an example is requested)
    train_dataset = IterableDataset.from_generator(lambda: lazy_dataset_generator(df_train, tokenizer, processor,train=True))
    val_dataset = IterableDataset.from_generator(lambda: lazy_dataset_generator(df_val, tokenizer, processor,train=False))
    test_dataset = IterableDataset.from_generator(lambda: lazy_dataset_generator(df_test, tokenizer, processor,train=False))

    # Model: the text encoder must match the tokenizer, so both use `model_name`
    multimodal_model = MultiModalSatireClassifier(
        text_model_name=model_name,
        hidden_dim=256,
        num_classes=num_labels
    ).to(device)

    # Trainer: the batch size must be the one `steps_per_epoch` was derived from,
    # otherwise `max_steps` no longer corresponds to `num_train_epochs` passes.
    trainer, eval_results = train_hf_model(
        model=multimodal_model,
        datasets={"train": train_dataset, "val": val_dataset},
        train_batch_size=batch_size,
        steps_per_epoch=steps_per_epoch,
        num_train_epochs=num_train_epochs,
        max_steps=max_steps,
        num_labels=num_labels,
        output_dir=os.path.join(base_save_path, f'Cross {i}'),
    )

    # Test
    test_results = test_model(
        model=multimodal_model,
        dataset_test=test_dataset,
        class_names=labels,
        batch_size=batch_size
    )

    save_test_results(test_results, os.path.join(base_save_path, f'Cross {i}'))

    # Release unused GPU memory: drop the references first, collect them, and
    # only then ask the CUDA allocator to return its cached blocks.
    del multimodal_model, trainer, eval_results, test_results
    del train_dataset, val_dataset, test_dataset
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.empty_cache()


# Cross-validation evaluation with external dataset

In [None]:
from datasets import Dataset, DatasetDict
from datasets import IterableDataset

# Global variables
base_path = "Add cross validation's path"
base_save_path = 'Add saving path'

num_splits = 10
batch_size = 16
model_name = 'dbmdz/bert-base-italian-cased'
num_labels = 2
num_train_epochs = 4
i=1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the processor comes from "facebook/hubert-large-ls960-ft" while
# HuBERTEncoder defaults to "facebook/hubert-base-ls960" — confirm that this
# preprocessing matches the base encoder.
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")

print(f"\n Processing ")

# Dataset: merge the three splits of fold `i`, shuffle, and re-split 80/20 into
# train/validation (the evaluation data for this model is loaded in a later cell).
split_path = os.path.join(base_path, f'Cross {i}')
df_train, df_val, df_test, label2id, id2label, labels = load_and_prepare_data(split_path)
df_all= pd.concat([df_train, df_val, df_test])
df_all = df_all.sample(frac=1, random_state=42).reset_index(drop=True)
df_train, df_val = train_test_split(df_all, test_size=0.2, random_state=42)


# The iterable datasets have no length, so the training horizon is expressed in steps
num_train_examples = len(df_train)
steps_per_epoch = ceil(len(df_train) / batch_size)
max_steps = steps_per_epoch * num_train_epochs

train_dataset = IterableDataset.from_generator(lambda: lazy_dataset_generator(df_train, tokenizer, processor,train=True))
val_dataset = IterableDataset.from_generator(lambda: lazy_dataset_generator(df_val, tokenizer, processor,train=False))


# Model: the text encoder must match the tokenizer, so both use `model_name`
multimodal_model = MultiModalSatireClassifier(
        text_model_name=model_name,
        hidden_dim=256,
        num_classes=num_labels
     ).to(device)


# Trainer: the batch size must be the one `steps_per_epoch` was derived from,
# otherwise `max_steps` no longer corresponds to `num_train_epochs` passes.
trainer, eval_results = train_hf_model(
        model=multimodal_model,
        datasets={"train": train_dataset, "val": val_dataset},
        train_batch_size=batch_size,
        steps_per_epoch=steps_per_epoch,
        num_train_epochs=num_train_epochs,
        max_steps=max_steps,
        num_labels=num_labels,
        output_dir=os.path.join(base_save_path, f'Cross {i}'),
    )

In [None]:
# Evaluate the model trained in the previous cell on an external CSV dataset.
# Relies on notebook state defined earlier: `label2id`, `labels`, `tokenizer`,
# `processor`, `multimodal_model` and the fold index `i`.
dataset_esterno_path = "Add external dataset's path"  # Path to external dataset
base_save_path = "Add external dataset's result path"  # Path to save results (rebinds the earlier base_save_path)

audio_dir: str = "segments_all"

# CSV file
external_file= os.path.join( dataset_esterno_path, 'dataset_2k.csv')

df_external = pd.read_csv(external_file)

# Apply the label -> ID mapping built from the training data
# (labels missing from `label2id` become NaN), then derive each audio file path
# as "<audio_dir>/<id>-<segment_id>.mp3".
df_external['label'] = df_external['label'].map(label2id)
df_external["path"] = df_external.apply(
            lambda row: f"{audio_dir}/{row['id']}-{int(row['segment_id'])}.mp3", axis=1
        )

# Examples are generated lazily when the dataset is iterated
external_dataset = IterableDataset.from_generator(lambda: lazy_dataset_generator(df_external, tokenizer, processor,train=False))

# Test
print(f"\n Test datasert external")
test_results = test_model(
        model=multimodal_model,
        dataset_test=external_dataset,
        class_names=labels,
        batch_size=32
    )

# Save results under "<base_save_path>/Cross <i>"
save_path = os.path.join(base_save_path, f'Cross {i}')
save_test_results(test_results, save_path)