# Import Libraries

In [None]:
from datasets import IterableDataset
from google.colab import drive
import json
import librosa
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, cohen_kappa_score, confusion_matrix
import seaborn as sns
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import Wav2Vec2FeatureExtractor, HubertModel,Wav2Vec2Processor
from tqdm import tqdm
import zipfile
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from math import ceil
from transformers import TrainerCallback
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, cohen_kappa_score, confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from transformers import AutoModelForAudioClassification

In [None]:
# Mount Google Drive at /content/drive (Colab only) so that the paths used
# below can point to files stored on Drive.
drive.mount('/content/drive')

# Import Audio Files

In [None]:
# Create the target folder (no error if it already exists) and extract both
# audio archives into it.
# unzip flags: -j discards the directory structure stored in the archive,
# -o overwrites existing files without prompting, -d sets the destination.
# NOTE(review): both archives are flattened into the same folder, so two files
# with the same name would silently overwrite each other — confirm that file
# names are unique across the two archives.
!mkdir -p segments_all
!unzip -j -o "Add satire audio path" -d segments_all
!unzip -j -o "Add non satire audio path" -d segments_all

# Prepare Data

In [None]:
def load_and_prepare_data(split_path: str, audio_dir: str = "segments_all"):
    """
    Load the train/validation/test CSV files of one split and prepare them.

    The string labels are replaced by integer IDs and a 'path' column with the
    location of each audio segment is added. The label <-> ID mapping is built
    from the training split only.

    Args:
        split_path (str): Folder containing train.csv, validation.csv and test.csv.
            Each file must provide the columns 'label', 'id' and 'segment_id'.
        audio_dir (str): Folder containing the audio files (default: 'segments_all').

    Returns:
        tuple: (df_train, df_val, df_test, labels). `labels` is the sorted list of
        the original label values found in train.csv; the integer ID of a label
        is its index in that list.

    Raises:
        ValueError: If validation.csv or test.csv contains a label that does not
            occur in train.csv (it would otherwise be mapped to NaN silently).
    """
    # Read the three CSV files of the split
    frames = {
        name: pd.read_csv(os.path.join(split_path, f"{name}.csv"))
        for name in ("train", "validation", "test")
    }

    # Create label <-> ID mapping from the training data
    labels = sorted(frames["train"]["label"].unique().tolist())
    id2label = dict(enumerate(labels))
    label2id = {label: idx for idx, label in id2label.items()}

    print("id2label:", id2label)
    print("label2id:", label2id)

    for name, df in frames.items():
        # Fail early instead of producing NaN labels that only crash later,
        # when the generator calls int() on them.
        unknown = df.loc[~df["label"].isin(labels), "label"].unique().tolist()
        if unknown:
            raise ValueError(
                f"{name}.csv contains labels not present in train.csv: {unknown}"
            )

        df["label"] = df["label"].map(label2id)

        # Build the paths column-wise: unlike a row-wise apply this also works
        # for an empty frame and does not upcast 'id' when the other columns
        # are floats.
        df["path"] = [
            f"{audio_dir}/{file_id}-{int(segment_id)}.mp3"
            for file_id, segment_id in zip(df["id"], df["segment_id"])
        ]

    return frames["train"], frames["validation"], frames["test"], labels

# Dataset

In [None]:
from datasets import Dataset

def lazy_dataset_generator(df, processor, sampling_rate=16000, max_seconds=10):
    """
    Lazily yield one encoded training example per row of `df`.

    Every audio file is loaded at `sampling_rate`, cut to its first
    `max_seconds` seconds, zero-padded to exactly that length and passed
    through `processor`.

    Args:
        df: DataFrame with a 'path' column (audio file location) and a 'label'
            column (value convertible with int()).
        processor: Callable audio processor; its result must expose
            `input_values` and `attention_mask`.
            NOTE(review): `attention_mask` is only present when the processor
            is configured to return it — confirm for the checkpoint in use.
        sampling_rate (int): Sampling rate used for loading and encoding
            (default: 16000).
        max_seconds (float): Length of every example in seconds (default: 10).

    Yields:
        dict: 'input_values', 'audio_attention_mask' (both with the leading
        batch dimension squeezed away) and 'labels' (int).
    """
    # Fixed length of every example, in samples (loop invariant)
    max_samples = int(max_seconds * sampling_rate)

    for row in df.itertuples(index=False):
        # Load audio file
        speech_array, _ = librosa.load(row.path, sr=sampling_rate)

        # Keep only the first `max_seconds` seconds
        speech_array = speech_array[:max_samples]

        # Zero-pad at the end if the clip is shorter than max_samples
        missing = max_samples - len(speech_array)
        if missing > 0:
            speech_array = np.pad(speech_array, (0, missing))

        # Encode audio with the processor
        audio_enc = processor(
            speech_array,
            sampling_rate=sampling_rate,
            padding="max_length",
            max_length=max_samples,
            return_tensors="pt"
        )

        # Yield dictionary with input values, attention mask, and label
        yield {
            "input_values": audio_enc.input_values.squeeze(0),
            "audio_attention_mask": audio_enc.attention_mask.squeeze(0),
            "labels": int(row.label)
        }

# Training

In [None]:
def compute_metrics(eval_pred):
    """
    Compute evaluation metrics from a (logits, labels) pair.

    The predicted class is the argmax over the last axis of the logits.

    Returns:
        dict: 'accuracy' and the weighted-average 'f1'.
    """
    scores, references = eval_pred
    predictions = scores.argmax(-1)
    return {
        "accuracy": accuracy_score(references, predictions),
        "f1": f1_score(references, predictions, average="weighted"),
    }

In [None]:
class EpochLoggerCallback(TrainerCallback):
    """
    Trainer callback that prints a marker line whenever another
    `steps_per_epoch` training steps have been completed.

    The callback keeps its own step counter, incremented on every
    `on_step_end` call.
    """

    def __init__(self, steps_per_epoch):
        self.current_step = 0
        self.steps_per_epoch = steps_per_epoch

    def on_step_end(self, args, state, control, **kwargs):
        self.current_step += 1
        finished_epochs, leftover = divmod(self.current_step, self.steps_per_epoch)
        if leftover:
            return
        # "Fine epoca" is Italian for "end of epoch"
        print(f"=== Fine epoca {finished_epochs} ===")

In [None]:
def train_hf_model(model, datasets, max_steps, steps_per_epoch, output_dir='./results',
                   num_labels=2, train_batch_size=16, eval_batch_size=16,
                   num_train_epochs=3, learning_rate=2e-5,
                   eval_strategy="epoch", save_strategy="epoch",
                   metric_for_best_model="eval_f1", greater_is_better=True, compute_metrics=compute_metrics):
    """
    Generic function to train a HuggingFace model using Trainer.

    Args:
        model: HuggingFace model to fine-tune.
        datasets (dict): Dictionary with the keys 'train' and 'val' holding the
            training and evaluation datasets.
        max_steps (int): Total number of training steps, forwarded to
            TrainingArguments. When positive it takes precedence over
            `num_train_epochs` (see the TrainingArguments documentation).
        steps_per_epoch (int): Number of steps after which EpochLoggerCallback
            prints its end-of-epoch line.
        output_dir (str): Output folder for results and checkpoints.
        num_labels (int): Number of classes. Not used by this function; kept so
            that existing callers keep working.
        train_batch_size (int): Per-device training batch size.
        eval_batch_size (int): Per-device evaluation batch size.
        num_train_epochs (int): Number of epochs.
        learning_rate (float): Learning rate.
        eval_strategy (str): Evaluation strategy ("steps" or "epoch").
        save_strategy (str): Checkpoint saving strategy ("steps" or "epoch").
        metric_for_best_model (str): Metric to select the best model.
        greater_is_better (bool): True if higher metric values are better.
        compute_metrics (func): Function to compute evaluation metrics.

    Returns:
        tuple: (trainer, eval_results) — the Trainer after training and the
        dictionary returned by `trainer.evaluate()`.
    """

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=eval_batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=learning_rate,
        eval_strategy=eval_strategy,
        save_strategy=save_strategy,
        load_best_model_at_end=True,
        metric_for_best_model=metric_for_best_model,
        greater_is_better=greater_is_better,
        max_steps=max_steps,
        # Mixed precision needs a CUDA device; a hard-coded True makes the
        # function unusable on CPU-only machines.
        fp16=torch.cuda.is_available(),
    )

    # Initialize HuggingFace Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=datasets['train'],
        eval_dataset=datasets['val'],
        compute_metrics=compute_metrics,
        callbacks=[EpochLoggerCallback(steps_per_epoch)]
    )

    # Start training
    trainer.train()

    # Evaluate the trained model
    eval_results = trainer.evaluate()

    print("Training completed successfully!")
    print(f"Evaluation results: {eval_results}")

    return trainer, eval_results

# Evaluation

In [None]:
def test_model(model, dataset_test, class_names=None, batch_size=32, average='macro'):
    """
    Run the model on the test dataset and compute the evaluation metrics.

    Args:
        model: Model called as `model(input_values=..., attention_mask=...)`
            whose output exposes `.logits`.
        dataset_test: Dataset whose batches provide the keys 'input_values',
            'audio_attention_mask' and 'labels'.
        class_names (list, optional): Class names ordered by integer label ID.
            When given, the confusion matrix and the classification report
            always cover all of these classes, including classes that do not
            occur in the test data. When omitted, names are generated from
            the size of the confusion matrix.
        batch_size (int): Batch size used for inference.
        average (str): Averaging mode for F1, precision and recall.

    Returns:
        dict: accuracy, f1, precision, recall, cohen_kappa, true_labels,
        predicted_labels, probabilities, confusion_matrix, num_per_class
        (true samples per class) and class_names.

    Raises:
        ValueError: If `dataset_test` yields no batches.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    dataloader_test = DataLoader(
        dataset_test,
        batch_size=batch_size,
        drop_last=False,
        num_workers=2,
        # Pinned host memory is only useful for transfers to a CUDA device
        pin_memory=device.type == 'cuda'
    )

    all_logits = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader_test:
            input_values = batch['input_values'].to(device)
            attention_mask = batch['audio_attention_mask'].to(device)
            labels = batch['labels']

            outputs = model(
                input_values=input_values,
                attention_mask=attention_mask
            )

            all_logits.append(outputs.logits.cpu())
            all_labels.append(labels.cpu())

    if not all_logits:
        # torch.cat would fail on an empty list with an unhelpful message
        raise ValueError("dataset_test yielded no batches; nothing to evaluate")

    all_logits = torch.cat(all_logits, dim=0)
    true_labels = torch.cat(all_labels, dim=0).numpy()

    probabilities = F.softmax(all_logits, dim=1)
    predicted_labels = torch.argmax(probabilities, dim=1).numpy()

    # With explicit class names, fix the label set so that the confusion matrix
    # and the report stay aligned with `class_names` even when a class is
    # missing from both the true and the predicted labels.
    label_ids = list(range(len(class_names))) if class_names is not None else None

    # Metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average=average)
    precision = precision_score(true_labels, predicted_labels, average=average)
    recall = recall_score(true_labels, predicted_labels, average=average)
    kappa = cohen_kappa_score(true_labels, predicted_labels)
    cm = confusion_matrix(true_labels, predicted_labels, labels=label_ids)

    if class_names is None:
        class_names = [str(i) for i in range(cm.shape[0])]
    # Row sums of the confusion matrix = number of true samples per class
    num_per_class = dict(zip(class_names, np.sum(cm, axis=1)))

    # Print Results
    print("\n=== Test Results ===")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"F1 Score ({average}): {f1:.4f}")
    print(f"Precision ({average}): {precision:.4f}")
    print(f"Recall ({average}): {recall:.4f}")
    print(f"Cohen Kappa: {kappa:.4f}")
    print("\nClassification Report:")
    print(classification_report(true_labels, predicted_labels,
                                labels=label_ids, target_names=class_names))

    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_names, yticklabels=class_names)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.show()

    # Final outputs
    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "cohen_kappa": kappa,
        "true_labels": true_labels,
        "predicted_labels": predicted_labels,
        "probabilities": probabilities.numpy(),
        "confusion_matrix": cm,
        "num_per_class": num_per_class,
        "class_names": class_names
    }

# Saving Results

In [None]:
def save_test_results(test_results, save_path):
    """
    Save the test results: scalar/metadata entries as JSON, arrays as .npy files.

    The entries 'true_labels', 'predicted_labels', 'probabilities' and
    'confusion_matrix' are written to '<key>.npy'. Every other entry
    (including the 'num_per_class' dictionary) is written to
    'test_results.json'; NumPy scalars and arrays nested inside those entries
    are converted to plain Python values first.

    Args:
        test_results (dict): Results dictionary, e.g. as returned by test_model.
        save_path (str): Output folder; created if it does not exist.
    """
    os.makedirs(save_path, exist_ok=True)

    # Entries stored as NumPy files. 'num_per_class' is a dict, not an array:
    # np.save would pickle it into an object array, so it goes to the JSON file.
    array_keys = (
        'true_labels',
        'predicted_labels',
        'probabilities',
        'confusion_matrix',
    )

    def make_serializable(obj):
        # Recursively replace NumPy types by JSON-compatible Python types
        if isinstance(obj, np.generic):
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, dict):
            return {make_serializable(k): make_serializable(v) for k, v in obj.items()}
        if isinstance(obj, (list, set, tuple)):
            return [make_serializable(item) for item in obj]
        return obj

    json_results = {
        key: make_serializable(value)
        for key, value in test_results.items()
        if key not in array_keys
    }

    json_file = os.path.join(save_path, 'test_results.json')
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(json_results, f, indent=4)

    for key in array_keys:
        if key in test_results:
            np.save(os.path.join(save_path, f"{key}.npy"), test_results[key])

    print(f"Results saved in: {save_path}")

# Main

In [None]:
# Cross-validation run: for every selected split, fine-tune HuBERT on the
# split's training data, evaluate on its test data and save the results.
base_path = "Add cross validation's path"
base_save_path = 'Add saving path'

num_classes = 2
num_splits = 10
batch_size = 16
num_train_epochs = 3
num_labels = 2

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")

# Loop over the selected splits. The range currently selects split 5 only;
# widen it to process more of the `num_splits` splits in one run.
for i in range(5, 6):
    print(f"\n=== Processing Cross {i} ===")
    split_path = os.path.join(base_path, f'Cross {i}')

    # Read CSV files and map labels to integers
    df_train, df_val, df_test, labels = load_and_prepare_data(split_path)

    # Lazy loading datasets. The DataFrame is bound as a default argument so
    # each generator keeps the frame of its own split even though the loop
    # variables are rebound on the next iteration.
    train_dataset = IterableDataset.from_generator(
        lambda df=df_train: lazy_dataset_generator(df, processor)
    )

    val_dataset = IterableDataset.from_generator(
        lambda df=df_val: lazy_dataset_generator(df, processor)
    )

    test_dataset = IterableDataset.from_generator(
        lambda df=df_test: lazy_dataset_generator(df, processor)
    )

    # The datasets are generator-backed, so the number of training steps is
    # derived here from the size of the training frame and passed as max_steps.
    num_train_examples = len(df_train)
    steps_per_epoch = ceil(num_train_examples / batch_size)
    max_steps = steps_per_epoch * num_train_epochs

    # Load model
    model = AutoModelForAudioClassification.from_pretrained(
        "facebook/hubert-large-ls960-ft",
        num_labels=num_labels,
    )

    model.hubert.gradient_checkpointing_enable()  # Reduce memory usage
    model.to(device)

    # Train model. The batch size must be the one used to compute
    # steps_per_epoch; passing num_train_epochs here (as before) trained with
    # batches of 3 and covered only a fraction of the data per "epoch".
    trainer, eval_results = train_hf_model(
        model=model,
        datasets={"train": train_dataset, "val": val_dataset},
        train_batch_size=batch_size,
        eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        steps_per_epoch=steps_per_epoch,
        max_steps=max_steps,
        num_labels=num_labels,
        output_dir=os.path.join(base_save_path, f'Cross {i}'),
    )

    # Evaluation
    print(f"\n--- Test Evaluation (Cross {i}) ---")
    test_results = test_model(
        model=model,
        dataset_test=test_dataset,
        class_names=labels,
        batch_size=batch_size
    )

    # Save results (previously the folder was created but nothing was written)
    save_path = os.path.join(base_save_path, f'Cross {i}')
    save_test_results(test_results, save_path)

    torch.cuda.empty_cache()  # Clear GPU memory





# Cross-validation model: validation with an external dataset

In [None]:
# Train one model on all the data of a single cross-validation split
# (train + validation + test merged, then re-split 80/20); the external
# dataset is evaluated in the next cell.

# Global variables
base_path = "Add cross validation's path"  # Path to CSV splits
base_save_path = 'Add saving path'        # Path to save trained models and results

num_classes = 2
num_splits = 10
batch_size = 16
num_train_epochs = 3
num_labels = 2
i = 1  # Split whose CSV files are used and whose name is used for the output folder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")

# Training
print("Processing")
split_path = os.path.join(base_path, f'Cross {i}')

# Read CSV files and map labels to integers
df_train, df_val, df_test, labels = load_and_prepare_data(split_path)

# Concatenate all splits and shuffle
df_all = pd.concat([df_train, df_val, df_test])
df_all = df_all.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into training and validation sets
df_train, df_val = train_test_split(df_all, test_size=0.2, random_state=42)

# Lazy loading datasets
train_dataset = IterableDataset.from_generator(
    lambda: lazy_dataset_generator(df_train, processor)
)

val_dataset = IterableDataset.from_generator(
    lambda: lazy_dataset_generator(df_val, processor)
)

# The datasets are generator-backed, so the number of training steps is
# derived here from the size of the training frame and passed as max_steps.
num_train_examples = len(df_train)
steps_per_epoch = ceil(num_train_examples / batch_size)
max_steps = steps_per_epoch * num_train_epochs

# Load pre-trained HuBERT model
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/hubert-large-ls960-ft",
    num_labels=num_labels,
)

model.hubert.gradient_checkpointing_enable()  # Reduce memory usage
model.to(device)

# Train the model. The batch size must be the one used to compute
# steps_per_epoch; passing num_train_epochs here (as before) trained with
# batches of 3 and covered only a fraction of the data per "epoch".
trainer, eval_results = train_hf_model(
    model=model,
    datasets={"train": train_dataset, "val": val_dataset},
    train_batch_size=batch_size,
    eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    steps_per_epoch=steps_per_epoch,
    max_steps=max_steps,
    num_labels=num_labels,
    output_dir=os.path.join(base_save_path, f'Cross {i}'),
)

torch.cuda.empty_cache()  # Clear GPU memory

In [None]:
# Evaluate the model trained in the previous cell on an external dataset.
# Relies on `model`, `processor`, `labels`, `batch_size` and `i` defined there.
dataset_esterno_path = "Add external dataset's path"  # Path to external dataset
base_save_path = "Add external dataset's result path"  # Path to save results

# CSV file
external_file = os.path.join(dataset_esterno_path, 'dataset_2k.csv')

# Load CSV
df_external = pd.read_csv(external_file)

# Create label <-> ID mapping.
# The IDs must match the ones the model was trained with. When every external
# label is known from training, the training label list (`labels`) is reused,
# so the mapping stays correct even if a class is missing from the external
# data. Otherwise the mapping is derived from the external data, as before.
external_labels = sorted(df_external["label"].unique().tolist())
if not set(external_labels) <= set(labels):
    print(
        "WARNING: external labels differ from the training labels "
        f"({external_labels} vs {labels}); deriving the mapping from the "
        "external data, which may not match the IDs the model was trained with."
    )
    labels = external_labels
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

# Map labels in the DataFrame and add audio file paths
df_external['label'] = df_external['label'].map(label2id)
df_external["path"] = [
    f"segments_all/{file_id}-{int(segment_id)}.mp3"
    for file_id, segment_id in zip(df_external["id"], df_external["segment_id"])
]

# Lazy loading dataset
external_dataset = IterableDataset.from_generator(
    lambda: lazy_dataset_generator(df_external, processor)
)

# Evaluation
print("\n--- External Evaluation ---")
test_results = test_model(
    model=model,
    dataset_test=external_dataset,
    class_names=labels,
    batch_size=batch_size
)

# Save results
save_path = os.path.join(base_save_path, f'Cross {i}')
save_test_results(test_results, save_path)
