# Import Libraries

In [None]:
from google.colab import drive
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, cohen_kappa_score, confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, cohen_kappa_score
import seaborn as sns

In [None]:
# Mount Google Drive at /content/drive (uses `drive` from google.colab, so
# this cell only works inside a Colab runtime).
drive.mount('/content/drive')

# Evaluation Function

In [None]:
def evaluate_model(true_labels, pred_labels, class_names=None, title="Confusion Matrix"):
    """
    Computes and displays classification metrics and the confusion matrix.

    Prints accuracy, weighted precision/recall/F1 and Cohen's kappa, prints the
    per-class classification report, and shows the confusion matrix as a
    seaborn heatmap.

    Args:
        true_labels (list or np.array): Ground-truth labels.
        pred_labels (list or np.array): Predicted labels.
        class_names (list, optional): Class names (e.g. ['no_satire', 'satire']).
            When omitted, the sorted labels found in either input are used.
        title (str): Title of the confusion matrix plot.

    Returns:
        dict: Keys "accuracy", "precision", "recall", "f1" (weighted averages)
        and "cohen_kappa".

    Raises:
        ValueError: If `class_names` is given but cannot be matched to the
            labels found in the data.
    """

    # Sorted labels occurring in either vector. Looking only at true_labels
    # would miss a class that the model predicts but the ground truth lacks.
    observed_labels = np.unique(
        np.concatenate([np.ravel(true_labels), np.ravel(pred_labels)])
    )

    if class_names is None or len(class_names) == len(observed_labels):
        labels = observed_labels
    elif (
        observed_labels.size > 0
        and np.issubdtype(observed_labels.dtype, np.integer)
        and observed_labels.min() >= 0
        and observed_labels.max() < len(class_names)
    ):
        # Integer-encoded labels with some class absent from this sample:
        # index by 0..n-1 so the report rows and heatmap ticks still line up
        # with class_names.
        labels = np.arange(len(class_names))
    else:
        raise ValueError(
            f"Number of classes found in the data ({len(observed_labels)}) does not "
            f"match the number of class names ({len(class_names)})."
        )

    # classification_report measures len() of each name, so they must be strings
    name_source = labels if class_names is None else class_names
    display_names = [str(name) for name in name_source]

    # Compute global evaluation metrics
    acc = accuracy_score(true_labels, pred_labels)
    prec = precision_score(true_labels, pred_labels, average='weighted', zero_division=0)
    rec = recall_score(true_labels, pred_labels, average='weighted', zero_division=0)
    f1 = f1_score(true_labels, pred_labels, average='weighted', zero_division=0)
    kappa = cohen_kappa_score(true_labels, pred_labels)

    # Print evaluation metrics
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision (weighted): {prec:.4f}")
    print(f"Recall (weighted): {rec:.4f}")
    print(f"F1-score (weighted): {f1:.4f}")
    print(f"Cohen’s Kappa: {kappa:.4f}")

    # Print detailed per-class report
    print("\nClassification report:")
    print(
        classification_report(
            true_labels,
            pred_labels,
            labels=labels,
            target_names=display_names,
            zero_division=0
        )
    )

    # Plot confusion matrix; the explicit labels keep its shape in sync with
    # the tick labels
    cm = confusion_matrix(true_labels, pred_labels, labels=labels)
    plt.figure(figsize=(6, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=display_names,
        yticklabels=display_names
    )
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(title)
    plt.show()

    # Return computed metrics
    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "cohen_kappa": kappa
    }


# Save results

In [None]:
def save_test_results(true_labels, pred_labels, class_names, save_path):
    """
    Computes and saves evaluation metrics and the confusion matrix to disk.

    Calls evaluate_model (which prints the metrics and shows the plot), adds
    macro-averaged precision and recall, then writes `metrics.json` and
    `confusion_matrix.png` into `save_path`.

    Args:
        true_labels (list or np.array): Ground-truth labels.
        pred_labels (list or np.array): Predicted labels.
        class_names (list): List of class names.
        save_path (str): Directory where results will be saved. It is created
            if it does not exist yet.

    Returns:
        dict: The metrics written to `metrics.json`.
    """

    # Create the target directory first so neither write fails on a missing
    # folder. An empty path means "current directory" and needs no creation.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    # Compute base metrics using the evaluation function
    metrics = evaluate_model(
        true_labels,
        pred_labels,
        class_names,
        title="Confusion Matrix"
    )

    # Add macro-averaged metrics. Cohen's kappa is not recomputed here because
    # evaluate_model already returns it under "cohen_kappa".
    metrics["precision_macro"] = precision_score(
        true_labels,
        pred_labels,
        average="macro",
        zero_division=0
    )
    metrics["recall_macro"] = recall_score(
        true_labels,
        pred_labels,
        average="macro",
        zero_division=0
    )

    # Save metrics to JSON file
    metrics_path = os.path.join(save_path, "metrics.json")
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=4)

    print(f"Metrics saved to: {metrics_path}")

    # Generate the confusion matrix
    cm = confusion_matrix(true_labels, pred_labels)
    if cm.shape[0] < len(class_names):
        # A class is missing from both vectors. Index the matrix by 0..n-1 so
        # it lines up with class_names (assumes integer-encoded labels).
        cm = confusion_matrix(
            true_labels,
            pred_labels,
            labels=np.arange(len(class_names))
        )

    cm_path = os.path.join(save_path, "confusion_matrix.png")
    fig = plt.figure(figsize=(6, 6))
    try:
        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names
        )
        plt.xlabel("Predicted Label")
        plt.ylabel("True Label")
        plt.title("Confusion Matrix")
        plt.savefig(cm_path, bbox_inches='tight')
    finally:
        # Release the figure even if plotting or saving fails
        plt.close(fig)

    print(f"Confusion matrix saved to: {cm_path}")

    return metrics


# Main

In [None]:
# Late-fusion weight sweep over all cross-validation splits.
# For every split, the audio and text class probabilities are mixed as
# alpha_audio * audio + alpha_text * text, each mix is scored against the true
# labels, and the per-split tables are written as CSV files.

# General parameters
base_save_path = "Insert multimodal results saving path"
base_path = "Insert cross-validation splits path"

# NumPy files paths
audio_numpy_path = "Insert audio model NumPy outputs path"
text_numpy_path = "Insert text model NumPy outputs path"

num_splits = 10
class_names = ['no_satire', 'satire']

# Weights to test (must sum to 1)
weights = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
direct_pairs = [(w, 1 - w) for w in weights]
inverse_pairs = [(1 - w, w) for w in weights]

# Merge and remove duplicate pairs (e.g. (0.5, 0.5)), ordered by audio weight
weight_pairs = sorted(set(direct_pairs + inverse_pairs), key=lambda pair: pair[0])


def compute_weighted_metrics(y_true, y_pred):
    """Return accuracy, weighted F1/precision/recall and Cohen's kappa as a dict.

    zero_division=0 matches evaluate_model: a class that is never predicted
    (likely at extreme weights) scores 0 without an UndefinedMetricWarning.
    """
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average='weighted', zero_division=0),
        "precision": precision_score(y_true, y_pred, average='weighted', zero_division=0),
        "recall": recall_score(y_true, y_pred, average='weighted', zero_division=0),
        "kappa": cohen_kappa_score(y_true, y_pred)
    }


# Loop over all cross-validation splits
for i in range(1, num_splits + 1):
    print(f"\nProcessing Cross {i}")

    # Paths to NumPy files (true labels are read from the text model folder)
    split_path_audio = os.path.join(audio_numpy_path, f'Cross {i}', 'probabilities.npy')
    split_path_text = os.path.join(text_numpy_path, f'Cross {i}', 'probabilities.npy')
    true_labels_path = os.path.join(text_numpy_path, f'Cross {i}', 'true_labels.npy')

    # Load NumPy arrays
    audio_probabilities = np.load(split_path_audio)
    text_probabilities = np.load(split_path_text)
    true_labels = np.load(true_labels_path)

    # Defensive conversion kept from the original code: handles a sequence of
    # tensors instead of an ndarray
    if not isinstance(text_probabilities, np.ndarray):
        text_probabilities = np.array([t.cpu().numpy() for t in text_probabilities])

    # Fail early with a clear message rather than letting NumPy broadcast (or
    # reject) misaligned arrays inside the weighted sum below
    if audio_probabilities.shape != text_probabilities.shape:
        raise ValueError(
            f"Cross {i}: audio probabilities {audio_probabilities.shape} and text "
            f"probabilities {text_probabilities.shape} have different shapes."
        )
    if len(true_labels) != audio_probabilities.shape[0]:
        raise ValueError(
            f"Cross {i}: {len(true_labels)} true labels for "
            f"{audio_probabilities.shape[0]} probability rows."
        )

    # Evaluate single-modality models
    audio_pred_classes = np.argmax(audio_probabilities, axis=1)
    text_pred_classes = np.argmax(text_probabilities, axis=1)

    metrics_audio = compute_weighted_metrics(true_labels, audio_pred_classes)
    metrics_text = compute_weighted_metrics(true_labels, text_pred_classes)

    print(f"Audio model | Acc: {metrics_audio['accuracy']:.4f}, F1: {metrics_audio['f1']:.4f}")
    print(f"Text model  | Acc: {metrics_text['accuracy']:.4f}, F1: {metrics_text['f1']:.4f}")

    results = []

    # Test all weight combinations
    for alpha_audio, alpha_text in weight_pairs:
        final_pred = alpha_audio * audio_probabilities + alpha_text * text_probabilities
        final_pred_classes = np.argmax(final_pred, axis=1)

        results.append({
            'alpha_audio': alpha_audio,
            'alpha_text': alpha_text,
            **compute_weighted_metrics(true_labels, final_pred_classes)
        })

    # Create results DataFrame, best weighted F1 first
    df_results = pd.DataFrame(results)
    df_results = df_results.sort_values(by='f1', ascending=False).reset_index(drop=True)

    # Best-performing combination
    best_result = df_results.iloc[0]
    best_audio = best_result['alpha_audio']
    best_text = best_result['alpha_text']

    # Specific combination (audio 0.45, text 0.55). The weights come from
    # floating-point arithmetic (1 - w), so compare with a tolerance instead
    # of exact equality.
    is_specific = (
        np.isclose(df_results['alpha_audio'], 0.45)
        & np.isclose(df_results['alpha_text'], 0.55)
    )
    specific_result = df_results[is_specific]

    if specific_result.empty:
        # Keep the output file schema stable even if the pair was not tested
        specific_result = pd.DataFrame([{
            'alpha_audio': 0.45,
            'alpha_text': 0.55,
            'accuracy': np.nan,
            'f1': np.nan,
            'precision': np.nan,
            'recall': np.nan,
            'kappa': np.nan
        }])

    # Save results
    save_path = os.path.join(base_save_path, f'Cross {i}')
    os.makedirs(save_path, exist_ok=True)

    df_results.to_csv(os.path.join(save_path, 'all_results.csv'), index=False)
    best_result.to_frame().T.to_csv(os.path.join(save_path, 'best_result.csv'), index=False)
    specific_result.to_csv(
        os.path.join(save_path, 'specific_result_0.45_0.55.csv'),
        index=False
    )

    print(f"Results saved to: {save_path}")
    print(
        f"Best fusion → Audio {best_audio:.2f}, Text {best_text:.2f} | "
        f"F1: {best_result['f1']:.4f}"
    )


# Best results

In [None]:
# Average every weight combination over the cross-validation folds and rank
# the combinations by mean F1-score.

# Base path
base_save_path = "Insert multimodal results base path"
num_splits = 10

# Per-fold result tables, collected in fold order
all_results = []

print("\nLoading results from all cross-validation folds")
for i in range(1, num_splits + 1):
    fold_csv = os.path.join(base_save_path, f'Cross {i}', 'all_results.csv')

    # Report and skip folds whose results file is absent
    if not os.path.exists(fold_csv):
        print(f"Missing file for Cross {i}: {fold_csv}")
        continue

    fold_df = pd.read_csv(fold_csv)
    fold_df['fold'] = i  # remember which fold each row came from
    all_results.append(fold_df)
    print(f"Loaded Cross {i} ({len(fold_df)} rows)")

# Nothing to summarise if no fold produced a file
if not all_results:
    raise ValueError("No result files found. Please check the result paths.")

# Stack all folds into a single table
df_all = pd.concat(all_results, ignore_index=True)

# One "<metric>_mean" and one "<metric>_std" output column per metric, in the
# order accuracy, f1, precision, recall, kappa
metric_names = ['accuracy', 'f1', 'precision', 'recall', 'kappa']
named_aggregations = {
    f'{metric}_{stat}': (metric, stat)
    for metric in metric_names
    for stat in ('mean', 'std')
}

# Group by weight combination and compute the statistics across folds
summary = (
    df_all.groupby(['alpha_audio', 'alpha_text'])
    .agg(**named_aggregations)
    .reset_index()
)

# Highest mean F1-score first
summary = summary.sort_values(by='f1_mean', ascending=False)
summary = summary.reset_index(drop=True)

# Print top 5 combinations
print("\nTop 5 weight combinations (averaged across all folds)")
print(summary.head(5))

# Save final summary
summary_path = os.path.join(base_save_path, 'summary_results.csv')
summary.to_csv(summary_path, index=False)

print(f"\nFinal summary file saved at:\n{summary_path}")


# Best Results Without Outliers (Best and Worst Fold Removed)

In [None]:
# Base path for saving results
base_save_path = "Insert multimodal results base path"
num_splits = 10

# Per-fold result tables, collected in fold order
all_results = []

print("\nLoading results from all cross-validation folds")
for i in range(1, num_splits + 1):
    fold_csv = os.path.join(base_save_path, f'Cross {i}', 'all_results.csv')

    # Report and skip folds whose results file is absent
    if not os.path.exists(fold_csv):
        print(f"Missing file for Cross {i}: {fold_csv}")
        continue

    fold_df = pd.read_csv(fold_csv)
    fold_df['fold'] = i  # remember which fold each row came from
    all_results.append(fold_df)
    print(f"Loaded Cross {i} ({len(fold_df)} rows)")

# Nothing to summarise if no fold produced a file
if not all_results:
    raise ValueError("No result files found. Please check the result paths.")

# Stack all folds into a single table
df_all = pd.concat(all_results, ignore_index=True)

# Function to remove best and worst for a metric (default: F1)
# Computes mean and std for all 5 metrics
def remove_best_worst(group, metric='f1'):
    """
    Summarise one group of rows after trimming its two extreme rows.

    Args:
        group (pd.DataFrame): Rows with 'accuracy', 'f1', 'precision',
            'recall' and 'kappa' columns.
        metric (str): Column used to rank the rows before trimming.

    Returns:
        pd.Series: '<name>_mean' and '<name>_std' for each of the five metric
        columns, in the order accuracy, f1, precision, recall, kappa.
    """
    trimmed = group
    # Trim only when more than two rows exist; smaller groups are kept whole
    if len(group) > 2:
        # Ascending sort puts the lowest value first and the highest last
        trimmed = group.sort_values(by=metric).iloc[1:-1]

    stats = {}
    for name in ('accuracy', 'f1', 'precision', 'recall', 'kappa'):
        column = trimmed[name]
        stats[f'{name}_mean'] = column.mean()
        stats[f'{name}_std'] = column.std()

    return pd.Series(stats)

# Summarise every (alpha_audio, alpha_text) group with remove_best_worst
weight_groups = df_all.groupby(['alpha_audio', 'alpha_text'], group_keys=False)
summary = weight_groups.apply(remove_best_worst).reset_index()

# Highest mean F1-score first
summary = summary.sort_values(by='f1_mean', ascending=False)
summary = summary.reset_index(drop=True)

# Print top 5 combinations
print("\nTop 5 weight combinations (average across all folds, excluding best & worst)")
print(summary.head(5))

# Save final filtered summary
summary_path = os.path.join(base_save_path, 'summary_results_filtered.csv')
summary.to_csv(summary_path, index=False)

print(f"\nFinal filtered summary file saved at:\n{summary_path}")


# Cross-validation evaluation with external dataset

In [None]:
# Late-fusion weight sweep on the external dataset.
# The audio and text class probabilities are mixed as
# alpha_audio * audio + alpha_text * text, each mix is scored against the true
# labels, and the resulting tables are written as CSV files.

# General parameters
base_save_path = 'Insert path to save external results'

# NumPy files paths
audio_numpy_path = 'Insert path to audio NumPy probabilities'
text_numpy_path = 'Insert path to text NumPy probabilities'

class_names = ['no_satire', 'satire']

# Weights to try (sum must be 1)
weights = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
direct_pairs = [(w, 1 - w) for w in weights]
inverse_pairs = [(1 - w, w) for w in weights]

# Merge and remove duplicates like (0.5, 0.5), ordered by audio weight
weight_pairs = sorted(set(direct_pairs + inverse_pairs), key=lambda pair: pair[0])


def compute_weighted_metrics(y_true, y_pred):
    """Return accuracy, weighted F1/precision/recall and Cohen's kappa as a dict.

    zero_division=0 matches evaluate_model: a class that is never predicted
    (likely at extreme weights) scores 0 without an UndefinedMetricWarning.
    """
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average='weighted', zero_division=0),
        "precision": precision_score(y_true, y_pred, average='weighted', zero_division=0),
        "recall": recall_score(y_true, y_pred, average='weighted', zero_division=0),
        "kappa": cohen_kappa_score(y_true, y_pred)
    }


print("\n=== Processing external dataset ===")

# File paths (true labels are read from the text model folder)
split_path_audio = os.path.join(audio_numpy_path, 'probabilities.npy')
split_path_text = os.path.join(text_numpy_path, 'probabilities.npy')
true_labels_path = os.path.join(text_numpy_path, 'true_labels.npy')

# Load NumPy files
audio_probabilities = np.load(split_path_audio)
text_probabilities = np.load(split_path_text)
true_labels = np.load(true_labels_path)

# Defensive conversion kept from the original code: handles a sequence of
# tensors instead of an ndarray
if not isinstance(text_probabilities, np.ndarray):
    text_probabilities = np.array([t.cpu().numpy() for t in text_probabilities])

# Fail early with a clear message rather than letting NumPy broadcast (or
# reject) misaligned arrays inside the weighted sum below
if audio_probabilities.shape != text_probabilities.shape:
    raise ValueError(
        f"Audio probabilities {audio_probabilities.shape} and text "
        f"probabilities {text_probabilities.shape} have different shapes."
    )
if len(true_labels) != audio_probabilities.shape[0]:
    raise ValueError(
        f"{len(true_labels)} true labels for "
        f"{audio_probabilities.shape[0]} probability rows."
    )

# Evaluate single models
audio_pred_classes = np.argmax(audio_probabilities, axis=1)
text_pred_classes = np.argmax(text_probabilities, axis=1)

metrics_audio = compute_weighted_metrics(true_labels, audio_pred_classes)
metrics_text = compute_weighted_metrics(true_labels, text_pred_classes)

print(f"→ Audio Model  | Acc: {metrics_audio['accuracy']:.4f}, F1: {metrics_audio['f1']:.4f}")
print(f"→ Text  Model  | Acc: {metrics_text['accuracy']:.4f}, F1: {metrics_text['f1']:.4f}")

results = []

# Test all weight combinations
for alpha_audio, alpha_text in weight_pairs:
    final_pred = alpha_audio * audio_probabilities + alpha_text * text_probabilities
    final_pred_classes = np.argmax(final_pred, axis=1)

    results.append({
        'alpha_audio': alpha_audio,
        'alpha_text': alpha_text,
        **compute_weighted_metrics(true_labels, final_pred_classes)
    })

# Create DataFrame, best weighted F1 first
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by='f1', ascending=False).reset_index(drop=True)

# Best combination
best_result = df_results.iloc[0]
best_audio = best_result['alpha_audio']
best_text = best_result['alpha_text']

# Specific combination (audio 0.45, text 0.55). The weights come from
# floating-point arithmetic (1 - w), so compare with a tolerance instead of
# exact equality.
is_specific = (
    np.isclose(df_results['alpha_audio'], 0.45)
    & np.isclose(df_results['alpha_text'], 0.55)
)
specific_result = df_results[is_specific]
if specific_result.empty:
    # Keep the output file schema stable even if the pair was not tested
    specific_result = pd.DataFrame([{
        'alpha_audio': 0.45,
        'alpha_text': 0.55,
        'accuracy': np.nan,
        'f1': np.nan,
        'precision': np.nan,
        'recall': np.nan,
        'kappa': np.nan
    }])

# Save results
save_path = base_save_path
os.makedirs(save_path, exist_ok=True)

# All results
df_results.to_csv(os.path.join(save_path, 'all_results.csv'), index=False)

# Only best
best_result.to_frame().T.to_csv(os.path.join(save_path, 'best_result.csv'), index=False)

# Only specific combination (audio 0.45, text 0.55)
specific_result.to_csv(os.path.join(save_path, 'specific_result_0.45_0.55.csv'), index=False)

print(f"\nSaved in: {save_path}")
print(f"  → Best: Audio {best_audio:.2f}, Text {best_text:.2f} | F1: {best_result['f1']:.4f}")
