# Phase 4.9: The Final Experiment - A HuBERT Transformer Generalist

**Objective:** This notebook documents the successful execution of our end-to-end Speech Transformer experiment. After the initial attempt with Wav2Vec2 in the previous phase failed due to training instability, this experiment resolves the issue (by swapping to the similar but distinct **HuBERT** model) and successfully implements our two-stage curriculum learning strategy.

This represents the final comparison between the highly-optimized CNN-on-spectrograms approach and a state-of-the-art, audio-native Transformer model.

In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install transformers[torch] datasets librosa pandas seaborn matplotlib tqdm audiomentations

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ===================================================================
# CELL 2: DATA PREPARATION
# ===================================================================
import os
import random
from sklearn.model_selection import train_test_split
import pickle

# --- Configuration: dataset roots on the mounted Drive ---
RAVDESS_PATH = "/content/drive/MyDrive/ser_project/ravdess_data/"
CREMA_D_PATH = "/content/drive/MyDrive/ser_project/crema_d_data/AudioWAV/"
IEMOCAP_PATH = "/content/drive/MyDrive/ser_project/iemocap_data/IEMOCAP_full_release/"

# --- Mappings (6 core emotions) ---
# Unified label name -> class index, numbered in listing order.
unified_emotion_map = {
    emotion: index
    for index, emotion in enumerate(
        ["neutral", "happy", "sad", "angry", "fearful", "disgust"]
    )
}

# RAVDESS emotion code (as found in the filename) -> unified label name.
ravdess_map = {
    "01": "neutral",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
}

# CREMA-D emotion code (as found in the filename) -> unified label name.
crema_d_map = {
    "NEU": "neutral",
    "HAP": "happy",
    "SAD": "sad",
    "ANG": "angry",
    "FEA": "fearful",
    "DIS": "disgust",
}

# IEMOCAP annotation label -> unified label name.
iemocap_map = {
    "neu": "neutral",
    "hap": "happy",
    "sad": "sad",
    "ang": "angry",
    "fea": "fearful",
    "exc": "happy",  # Map excited to happy
}

# --- Gather files and labels from all three datasets ---
# os.walk / os.listdir return entries in arbitrary order. Every listing is
# sorted so that all_files has the same order in every session; otherwise the
# seeded train/val/test split would assign files differently from run to run.
all_files = []
all_labels_str = []
print("--- GATHERING AND COUNTING FILES ---")

# Process RAVDESS: the third dash-separated filename field is the emotion code.
ravdess_count = 0
for root, dirs, files in os.walk(RAVDESS_PATH):
    dirs.sort()  # in-place sort makes os.walk descend in a stable order
    for f in sorted(files):
        if not f.endswith('.wav'):
            continue
        try:
            code = f.split("-")[2]
        except IndexError:
            continue  # filename does not follow the expected naming scheme
        if code in ravdess_map:
            all_files.append(os.path.join(root, f))
            all_labels_str.append(ravdess_map[code])
            ravdess_count += 1
print(f"Found {ravdess_count} relevant files in RAVDESS.")

# Process CREMA-D: the third underscore-separated filename field is the emotion code.
crema_d_count = 0
if os.path.exists(CREMA_D_PATH):
    for f in sorted(os.listdir(CREMA_D_PATH)):
        if not f.endswith('.wav'):
            continue
        try:
            code = f.split("_")[2]
        except IndexError:
            continue  # filename does not follow the expected naming scheme
        if code in crema_d_map:
            all_files.append(os.path.join(CREMA_D_PATH, f))
            all_labels_str.append(crema_d_map[code])
            crema_d_count += 1
print(f"Found {crema_d_count} relevant files in CREMA-D.")

# Process IEMOCAP: labels come from the per-dialog annotation files, where each
# line starting with '[' is tab-separated with the utterance id in field 1 and
# the emotion label in field 2.
iemocap_count = 0
if os.path.exists(IEMOCAP_PATH):
    for session_folder in sorted(os.listdir(IEMOCAP_PATH)):
        if not session_folder.startswith("Session"):
            continue
        emo_path = os.path.join(IEMOCAP_PATH, session_folder, "dialog/EmoEvaluation/")
        wav_root = os.path.join(IEMOCAP_PATH, session_folder, "sentences/wav/")
        if not (os.path.isdir(emo_path) and os.path.isdir(wav_root)):
            continue
        for txt_file in sorted(os.listdir(emo_path)):
            if not txt_file.endswith('.txt'):
                continue
            with open(os.path.join(emo_path, txt_file)) as f_ann:
                for line in f_ann:
                    if not line.startswith('['):
                        continue
                    parts = line.strip().split('\t')
                    if len(parts) < 3 or parts[2] not in iemocap_map:
                        continue
                    # The wav sub-folder is the utterance id without its last '_' suffix.
                    wav_folder = parts[1].rsplit('_', 1)[0]
                    wav_file = os.path.join(wav_root, wav_folder, f"{parts[1]}.wav")
                    if os.path.exists(wav_file):
                        all_files.append(wav_file)
                        all_labels_str.append(iemocap_map[parts[2]])
                        iemocap_count += 1
print(f"Found {iemocap_count} relevant files in IEMOCAP.")
print(f"\nTotal files found across all datasets: {len(all_files)}")

# --- Create final data splits ---
# Two-step stratified split: 15% of all samples are held out for testing, then
# 10% of the remaining 85% becomes the validation set. The resulting proportions
# are roughly 76.5% train / 8.5% validation / 15% test (not 80/10/10).
TEST_FRACTION = 0.15
VAL_FRACTION_OF_REMAINDER = 0.1
SPLIT_SEED = 42

train_val_files, test_files, train_val_labels_str, test_labels_str = train_test_split(
    all_files,
    all_labels_str,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=all_labels_str,
)
train_files, val_files, train_labels_str, val_labels_str = train_test_split(
    train_val_files,
    train_val_labels_str,
    test_size=VAL_FRACTION_OF_REMAINDER,
    random_state=SPLIT_SEED,
    stratify=train_val_labels_str,
)

# Sanity checks: the three splits partition the data and labels stay aligned with files.
assert len(train_files) + len(val_files) + len(test_files) == len(all_files)
assert len(train_files) == len(train_labels_str)
assert len(val_files) == len(val_labels_str)
assert len(test_files) == len(test_labels_str)

print("\n--- DATA SPLITTING COMPLETE ---")
print(f"Training samples: {len(train_files)}")
print(f"Validation samples: {len(val_files)}")
print(f"Test samples: {len(test_files)}")

--- GATHERING AND COUNTING FILES ---
Found 1056 relevant files in RAVDESS.
Found 7442 relevant files in CREMA-D.
Found 3438 relevant files in IEMOCAP.

Total files found across all datasets: 11936

--- DATA SPLITTING COMPLETE ---
Training samples: 9130
Validation samples: 1015
Test samples: 1791


In [None]:
# ===================================================================
# CELL 3: HELPER DEFINITIONS
# ===================================================================
import torch
import librosa
from torch.utils.data import Dataset
from transformers import Wav2Vec2FeatureExtractor

# --- Initialize Feature Extractor (used by the collate function) ---
# Load the extractor configuration from the HuBERT checkpoint that the training
# and evaluation cells use. collate_fn reads the module-level `feature_extractor`
# at call time, so pointing this at a different checkpoint could preprocess audio
# inconsistently if this cell is run without the later cell that re-creates it.
model_name = "facebook/hubert-base-ls960"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

# --- Audio Dataset Class ---
class WavDataset(Dataset):
    """Dataset that lazily loads audio files and pairs each with its label.

    Args:
        file_paths: sequence of audio file paths.
        labels: sequence of labels, where ``labels[i]`` belongs to ``file_paths[i]``.
    """

    def __init__(self, file_paths, labels):
        self.file_paths, self.labels = file_paths, labels

    def __len__(self):
        """Return the number of audio files."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load item ``idx`` at 16 kHz and return ``(waveform, label)``."""
        path = self.file_paths[idx]
        # librosa.load returns (samples, sample_rate); the rate is fixed by sr=16000.
        waveform, _ = librosa.load(path, sr=16000)
        return waveform, self.labels[idx]

# --- Collate Function to process batches ---
def collate_fn(batch):
    """Turn a list of ``(waveform, label)`` pairs into padded model inputs.

    Args:
        batch: iterable of ``(waveform, label)`` pairs as produced by WavDataset.

    Returns:
        Tuple ``(input_values, labels)``: the extractor's padded ``input_values``
        tensor and a ``torch.long`` tensor of the labels.
    """
    waveforms, targets = zip(*batch)
    # Padding and tensor conversion are delegated to the module-level
    # feature_extractor, which is looked up each time this function is called.
    encoded = feature_extractor(
        list(waveforms),
        sampling_rate=16000,
        padding=True,
        return_tensors="pt",
    )
    target_tensor = torch.tensor(targets, dtype=torch.long)
    return encoded["input_values"], target_tensor

print("✅ Helper classes and functions are defined.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Helper classes and functions are defined.


## Part 1: Stage 1 - Training the "Acted Speech Expert"

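The first stage of our curriculum learning strategy involves fine-tuning the pre-trained HuBERT model on the combined "acted" speech datasets (RAVDESS and CREMA-D). This training was successful, resolving the `NaN` loss issue from the previous attempt. The model achieved a peak validation accuracy of **80.25%** (epoch 14), creating a strong "Acted Speech Expert." Note, however, that validation loss bottomed out at epoch 5 (≈0.80) and rose to ≈2.5 by epoch 15 while validation accuracy fluctuated between roughly 75% and 80%. The saved checkpoint is selected on validation accuracy rather than loss, so it comes from a phase in which the model was already overfitting and becoming overconfident.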

In [None]:
# ===================================================================
# CELL 4: STAGE 1 - Fine-tuning HuBERT on Acted Datasets
# ===================================================================
import torch, torch.nn as nn, librosa, os
from torch.utils.data import Dataset, DataLoader
from transformers import HubertForSequenceClassification, Wav2Vec2FeatureExtractor, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm

# --- Configuration ---
LEARNING_RATE = 3e-5
BATCH_SIZE = 8
EPOCHS = 15
CHECKPOINT_STAGE1_PATH = "/content/drive/MyDrive/ser_project/hubert_stage1_acted_best.pth"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Class names in index order, plus the inverse lookup (name -> index).
unified_emotion_labels = ["neutral", "happy", "sad", "angry", "fearful", "disgust"]
emotion_to_idx = dict(zip(unified_emotion_labels, range(len(unified_emotion_labels))))

# --- Prepare Acted Datasets (RAVDESS + CREMA-D) ---
# A sample counts as "acted" when its path does not contain 'iemocap_data'.
# Files and labels are filtered with the same predicate so they stay aligned.
acted_train_files = [path for path in train_files if 'iemocap_data' not in path]
acted_val_files = [path for path in val_files if 'iemocap_data' not in path]
acted_train_labels = [
    emotion_to_idx[lbl]
    for path, lbl in zip(train_files, train_labels_str)
    if 'iemocap_data' not in path
]
acted_val_labels = [
    emotion_to_idx[lbl]
    for path, lbl in zip(val_files, val_labels_str)
    if 'iemocap_data' not in path
]

# The WavDataset and collate_fn must be available from Cell 3
train_dataset = WavDataset(acted_train_files, acted_train_labels)
val_dataset = WavDataset(acted_val_files, acted_val_labels)
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn
)
print(f"Starting Stage 1: Training HuBERT on {len(train_dataset)} acted samples...")

# --- Initialize HuBERT Model ---
model_name = "facebook/hubert-base-ls960"
# Rebinding the module-level feature_extractor also changes what collate_fn
# uses, because collate_fn looks the name up when it is called.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = HubertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unified_emotion_labels),
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
scaler = torch.cuda.amp.GradScaler()

# The scheduler is stepped once per batch, so the schedule length is counted in
# batches; the first 10% of those steps are warmup.
num_training_steps = EPOCHS * len(train_loader)
num_warmup_steps = int(0.1 * num_training_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# --- Stage 1 Training Loop ---
# Each epoch runs one mixed-precision training pass and one validation pass.
# The weights are checkpointed whenever validation accuracy improves.
best_val_acc = 0.0
for epoch in range(EPOCHS):
    # ---- Training ----
    model.train()
    running_loss = 0.0
    for inputs, labels in tqdm(train_loader, desc=f"Stage 1 - Epoch {epoch+1}/{EPOCHS}"):
        inputs, labels = inputs.to(device), labels.to(device)
        with torch.cuda.amp.autocast():
            outputs = model(inputs).logits
            loss = criterion(outputs, labels)
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        # After backward() the gradients are still multiplied by the loss scale.
        # Unscale them first so the max-norm of 1.0 applies to the true gradient
        # magnitudes rather than to the scaled ones.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        # Weight the batch-mean loss by batch size so the epoch average is per sample.
        running_loss += loss.item() * inputs.size(0)
    train_loss = running_loss / len(train_dataset)

    # ---- Validation ----
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm(val_loader, desc=f"Stage 1 - Epoch {epoch+1}/{EPOCHS} [Val]"):
            inputs, labels = inputs.to(device), labels.to(device)
            with torch.cuda.amp.autocast():
                outputs = model(inputs).logits
                loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_accuracy = 100 * correct / total
    val_loss /= len(val_dataset)
    print(f"Stage 1 - Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.2f}%")

    # Keep only the weights of the best epoch so far (selected on validation accuracy).
    if val_accuracy > best_val_acc:
        best_val_acc = val_accuracy
        print(f"🎉 New best Stage 1 validation accuracy: {best_val_acc:.2f}%. Saving model...")
        torch.save({'model_state_dict': model.state_dict()}, CHECKPOINT_STAGE1_PATH)

Starting Stage 1: Training HuBERT on 6498 acted samples...


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler()


Stage 1 - Epoch 1/15:   0%|          | 0/813 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast():


Stage 1 - Epoch 1/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast():


Stage 1 - Epoch 1/15 | Train Loss: 1.7195 | Val Loss: 1.5232 | Val Acc: 37.02%
🎉 New best Stage 1 validation accuracy: 37.02%. Saving model...


Stage 1 - Epoch 2/15:   0%|          | 0/813 [00:00<?, ?it/s]

Stage 1 - Epoch 2/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

Stage 1 - Epoch 2/15 | Train Loss: 1.3997 | Val Loss: 1.2416 | Val Acc: 58.29%
🎉 New best Stage 1 validation accuracy: 58.29%. Saving model...


Stage 1 - Epoch 3/15:   0%|          | 0/813 [00:00<?, ?it/s]

Stage 1 - Epoch 3/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

Stage 1 - Epoch 3/15 | Train Loss: 0.9986 | Val Loss: 0.9268 | Val Acc: 67.96%
🎉 New best Stage 1 validation accuracy: 67.96%. Saving model...


Stage 1 - Epoch 4/15:   0%|          | 0/813 [00:00<?, ?it/s]

Stage 1 - Epoch 4/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

Stage 1 - Epoch 4/15 | Train Loss: 0.7780 | Val Loss: 0.8125 | Val Acc: 74.72%
🎉 New best Stage 1 validation accuracy: 74.72%. Saving model...


Stage 1 - Epoch 5/15:   0%|          | 0/813 [00:00<?, ?it/s]

Stage 1 - Epoch 5/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

Stage 1 - Epoch 5/15 | Train Loss: 0.5942 | Val Loss: 0.8044 | Val Acc: 77.07%
🎉 New best Stage 1 validation accuracy: 77.07%. Saving model...


Stage 1 - Epoch 6/15:   0%|          | 0/813 [00:00<?, ?it/s]

Stage 1 - Epoch 6/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

Stage 1 - Epoch 6/15 | Train Loss: 0.5344 | Val Loss: 0.9523 | Val Acc: 76.52%


Stage 1 - Epoch 7/15:   0%|          | 0/813 [00:00<?, ?it/s]

Stage 1 - Epoch 7/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

Stage 1 - Epoch 7/15 | Train Loss: 0.5455 | Val Loss: 1.0910 | Val Acc: 75.55%


Stage 1 - Epoch 8/15:   0%|          | 0/813 [00:00<?, ?it/s]

Stage 1 - Epoch 8/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

Stage 1 - Epoch 8/15 | Train Loss: 0.4607 | Val Loss: 0.9317 | Val Acc: 78.45%
🎉 New best Stage 1 validation accuracy: 78.45%. Saving model...


Stage 1 - Epoch 9/15:   0%|          | 0/813 [00:00<?, ?it/s]

Stage 1 - Epoch 9/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

Stage 1 - Epoch 9/15 | Train Loss: 0.4035 | Val Loss: 1.3089 | Val Acc: 74.59%


Stage 1 - Epoch 10/15:   0%|          | 0/813 [00:00<?, ?it/s]

Stage 1 - Epoch 10/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

Stage 1 - Epoch 10/15 | Train Loss: 0.4130 | Val Loss: 1.3773 | Val Acc: 76.52%


Stage 1 - Epoch 11/15:   0%|          | 0/813 [00:00<?, ?it/s]

Stage 1 - Epoch 11/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

Stage 1 - Epoch 11/15 | Train Loss: 0.4407 | Val Loss: 1.7798 | Val Acc: 77.76%


Stage 1 - Epoch 12/15:   0%|          | 0/813 [00:00<?, ?it/s]

Stage 1 - Epoch 12/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

Stage 1 - Epoch 12/15 | Train Loss: 0.4721 | Val Loss: 2.0428 | Val Acc: 78.18%


Stage 1 - Epoch 13/15:   0%|          | 0/813 [00:00<?, ?it/s]

Stage 1 - Epoch 13/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

Stage 1 - Epoch 13/15 | Train Loss: 0.5105 | Val Loss: 2.4050 | Val Acc: 76.80%


Stage 1 - Epoch 14/15:   0%|          | 0/813 [00:00<?, ?it/s]

Stage 1 - Epoch 14/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

Stage 1 - Epoch 14/15 | Train Loss: 0.3946 | Val Loss: 2.1749 | Val Acc: 80.25%
🎉 New best Stage 1 validation accuracy: 80.25%. Saving model...


Stage 1 - Epoch 15/15:   0%|          | 0/813 [00:00<?, ?it/s]

Stage 1 - Epoch 15/15 [Val]:   0%|          | 0/91 [00:00<?, ?it/s]

Stage 1 - Epoch 15/15 | Train Loss: 0.3581 | Val Loss: 2.4911 | Val Acc: 78.59%


## Part 2: Stage 2 - Adapting to Natural Speech

In the second stage, we take the expert model from Stage 1 and fine-tune it further, but this time only on the more challenging, natural speech of the IEMOCAP dataset. The goal is to adapt the model's knowledge to this new domain. The training was successful, reaching a peak validation accuracy of **78.01%** on the IEMOCAP validation set (first reached at epoch 5; no later epoch improved on it, so the epoch-5 weights are the ones saved as the final checkpoint).

In [None]:
# ===================================================================
# CELL 5: STAGE 2 - ADAPTING HuBERT TO NATURAL SPEECH
# ===================================================================
import torch.nn as nn
from torch.utils.data import DataLoader
# --- KEY CHANGE: Use the HuBERT model architecture ---
from transformers import HubertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
import os

# --- Configuration ---
# Stage 2 uses a lower learning rate and more epochs than Stage 1.
LEARNING_RATE = 1e-5
EPOCHS = 20
CHECKPOINT_STAGE1_PATH = "/content/drive/MyDrive/ser_project/hubert_stage1_acted_best.pth"
CHECKPOINT_STAGE2_PATH = "/content/drive/MyDrive/ser_project/hubert_stage2_final_best.pth"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Prepare IEMOCAP-only data splits ---
# A sample belongs to IEMOCAP when its path contains 'iemocap_data'. Files and
# labels are filtered with the same predicate so they stay aligned.
iemocap_train_files = [path for path in train_files if 'iemocap_data' in path]
iemocap_val_files = [path for path in val_files if 'iemocap_data' in path]
iemocap_train_labels = [
    emotion_to_idx[lbl]
    for path, lbl in zip(train_files, train_labels_str)
    if 'iemocap_data' in path
]
iemocap_val_labels = [
    emotion_to_idx[lbl]
    for path, lbl in zip(val_files, val_labels_str)
    if 'iemocap_data' in path
]

train_dataset = WavDataset(iemocap_train_files, iemocap_train_labels)
val_dataset = WavDataset(iemocap_val_files, iemocap_val_labels)
# BATCH_SIZE is not redefined in this cell; it is read from the notebook namespace.
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn
)
print(f"Starting Stage 2: Adapting HuBERT on {len(train_dataset)} natural samples...")

# --- Load the Stage 1 HuBERT Model ---
print("Loading Stage 1 HuBERT model (Acted Speech Expert)...")
# --- KEY CHANGE: Use the HuBERT model architecture ---
# Build the architecture from the base checkpoint, then overwrite every weight
# (including the newly initialised classifier head) with the Stage 1 state dict.
model = HubertForSequenceClassification.from_pretrained("facebook/hubert-base-ls960", num_labels=len(unified_emotion_labels))
# map_location remaps the stored tensors onto this session's device, so a
# checkpoint saved from a GPU run also loads on a CPU-only runtime.
stage1_checkpoint = torch.load(CHECKPOINT_STAGE1_PATH, map_location=device)
model.load_state_dict(stage1_checkpoint['model_state_dict'])
model = model.to(device)

# --- Initialize a new Optimizer, Scheduler, and Scaler ---
# Optimizer, scheduler and scaler state from Stage 1 is deliberately not reused.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()
# The scheduler is stepped once per batch; the first 10% of steps are warmup.
num_training_steps = len(train_loader) * EPOCHS
num_warmup_steps = int(0.1 * num_training_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
scaler = torch.cuda.amp.GradScaler()

# --- Stage 2 Training Loop ---
# Each epoch runs one mixed-precision training pass and one validation pass.
# The weights are checkpointed whenever validation accuracy improves.
best_val_acc = 0.0
for epoch in range(EPOCHS):
    # ---- Training ----
    model.train()
    running_loss = 0.0
    for inputs, labels in tqdm(train_loader, desc=f"Stage 2 - Epoch {epoch+1}/{EPOCHS}"):
        inputs, labels = inputs.to(device), labels.to(device)
        with torch.cuda.amp.autocast():
            outputs = model(inputs).logits
            loss = criterion(outputs, labels)
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        # After backward() the gradients are still multiplied by the loss scale.
        # Unscale them first so the max-norm of 1.0 applies to the true gradient
        # magnitudes rather than to the scaled ones.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        # Weight the batch-mean loss by batch size so the epoch average is per sample.
        running_loss += loss.item() * inputs.size(0)
    train_loss = running_loss / len(train_dataset)

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm(val_loader, desc=f"Stage 2 - Epoch {epoch+1}/{EPOCHS} [Val]"):
            inputs, labels = inputs.to(device), labels.to(device)
            with torch.cuda.amp.autocast():
                outputs = model(inputs).logits
                loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_accuracy = 100 * correct / total
    val_loss /= len(val_dataset)
    print(f"Stage 2 - Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.2f}%")

    # Keep only the weights of the best epoch so far (selected on validation accuracy).
    if val_accuracy > best_val_acc:
        best_val_acc = val_accuracy
        print(f"🎉 New best Stage 2 validation accuracy: {best_val_acc:.2f}%. Saving final model...")
        torch.save({'model_state_dict': model.state_dict()}, CHECKPOINT_STAGE2_PATH)

Starting Stage 2: Adapting HuBERT on 2632 natural samples...
Loading Stage 1 HuBERT model (Acted Speech Expert)...


Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler()


Stage 2 - Epoch 1/20:   0%|          | 0/329 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast():


Stage 2 - Epoch 1/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast():


Stage 2 - Epoch 1/20 | Train Loss: 4.9722 | Val Loss: 1.5503 | Val Acc: 68.04%
🎉 New best Stage 2 validation accuracy: 68.04%. Saving final model...


Stage 2 - Epoch 2/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 2/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 2/20 | Train Loss: 1.1118 | Val Loss: 0.8460 | Val Acc: 71.82%
🎉 New best Stage 2 validation accuracy: 71.82%. Saving final model...


Stage 2 - Epoch 3/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 3/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 3/20 | Train Loss: 0.8875 | Val Loss: 0.7703 | Val Acc: 75.26%
🎉 New best Stage 2 validation accuracy: 75.26%. Saving final model...


Stage 2 - Epoch 4/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 4/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 4/20 | Train Loss: 0.7564 | Val Loss: 0.7940 | Val Acc: 73.88%


Stage 2 - Epoch 5/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 5/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 5/20 | Train Loss: 0.6670 | Val Loss: 0.6961 | Val Acc: 78.01%
🎉 New best Stage 2 validation accuracy: 78.01%. Saving final model...


Stage 2 - Epoch 6/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 6/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 6/20 | Train Loss: 0.6171 | Val Loss: 0.6838 | Val Acc: 78.01%


Stage 2 - Epoch 7/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 7/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 7/20 | Train Loss: 0.5439 | Val Loss: 0.7696 | Val Acc: 75.26%


Stage 2 - Epoch 8/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 8/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 8/20 | Train Loss: 0.4960 | Val Loss: 0.8399 | Val Acc: 73.88%


Stage 2 - Epoch 9/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 9/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 9/20 | Train Loss: 0.4579 | Val Loss: 0.7784 | Val Acc: 76.63%


Stage 2 - Epoch 10/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 10/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 10/20 | Train Loss: 0.4287 | Val Loss: 0.8670 | Val Acc: 76.98%


Stage 2 - Epoch 11/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 11/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 11/20 | Train Loss: 0.4213 | Val Loss: 0.8408 | Val Acc: 76.98%


Stage 2 - Epoch 12/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 12/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 12/20 | Train Loss: 0.3908 | Val Loss: 0.9084 | Val Acc: 77.66%


Stage 2 - Epoch 13/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 13/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 13/20 | Train Loss: 0.3882 | Val Loss: 1.0700 | Val Acc: 76.29%


Stage 2 - Epoch 14/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 14/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 14/20 | Train Loss: 0.3488 | Val Loss: 1.1534 | Val Acc: 76.98%


Stage 2 - Epoch 15/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 15/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 15/20 | Train Loss: 0.3664 | Val Loss: 1.2506 | Val Acc: 74.91%


Stage 2 - Epoch 16/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 16/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 16/20 | Train Loss: 0.3767 | Val Loss: 1.3898 | Val Acc: 75.26%


Stage 2 - Epoch 17/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 17/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 17/20 | Train Loss: 0.3568 | Val Loss: 1.4528 | Val Acc: 74.57%


Stage 2 - Epoch 18/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 18/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 18/20 | Train Loss: 0.3631 | Val Loss: 1.4969 | Val Acc: 74.57%


Stage 2 - Epoch 19/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 19/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 19/20 | Train Loss: 0.3645 | Val Loss: 1.5716 | Val Acc: 75.26%


Stage 2 - Epoch 20/20:   0%|          | 0/329 [00:00<?, ?it/s]

Stage 2 - Epoch 20/20 [Val]:   0%|          | 0/37 [00:00<?, ?it/s]

Stage 2 - Epoch 20/20 | Train Loss: 0.3266 | Val Loss: 1.6110 | Val Acc: 74.91%


## Part 3: The Final Verdict - Multi-Domain Evaluation

This is the grand finale. We take our final, two-stage-trained HuBERT model and evaluate its performance on the held-out test sets from all three domains: RAVDESS, CREMA-D, and IEMOCAP. The results will provide the definitive answer to our project's central research question: which architectural paradigm is superior for this cross-domain task?

In [None]:
# ===================================================================
# CELL 6: FINAL MULTI-DOMAIN EVALUATION (Definitive Version)
# ===================================================================
import torch
import torch.nn as nn
import os
import numpy as np
import librosa
from torch.utils.data import Dataset, DataLoader
from transformers import HubertForSequenceClassification, Wav2Vec2FeatureExtractor
from sklearn.metrics import accuracy_score, classification_report
from tqdm.notebook import tqdm

# --- Configuration ---
CHECKPOINT_BEST_PATH = "/content/drive/MyDrive/ser_project/hubert_stage2_final_best.pth"
BATCH_SIZE = 8
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Mappings ---
# Class names in index order, plus the inverse lookup (name -> index).
unified_emotion_labels = ["neutral", "happy", "sad", "angry", "fearful", "disgust"]
emotion_to_idx = dict(zip(unified_emotion_labels, range(len(unified_emotion_labels))))

# --- Load Final Model and Feature Extractor ---
print("\nLoading best fine-tuned Progressive HuBERT model...")
model_name = "facebook/hubert-base-ls960"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

# Build the architecture from the base checkpoint, then replace its weights
# with the fine-tuned state dict stored in the Stage 2 checkpoint.
model = HubertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unified_emotion_labels),
)
best_checkpoint = torch.load(CHECKPOINT_BEST_PATH, map_location=device)
model.load_state_dict(best_checkpoint['model_state_dict'])
# eval() returns the module itself, so the chained call keeps `model` bound to it.
model = model.to(device).eval()

# --- Recreate WavDataset and Collate Function ---
class WavDataset(Dataset):
    """Dataset that lazily loads audio files and pairs each with its label.

    Args:
        file_paths: sequence of audio file paths.
        labels: sequence of labels, where ``labels[i]`` belongs to ``file_paths[i]``.
    """

    def __init__(self, file_paths, labels):
        self.file_paths, self.labels = file_paths, labels

    def __len__(self):
        """Return the number of audio files."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load item ``idx`` at 16 kHz and return ``(waveform, label)``."""
        path = self.file_paths[idx]
        # librosa.load returns (samples, sample_rate); the rate is fixed by sr=16000.
        waveform, _ = librosa.load(path, sr=16000)
        return waveform, self.labels[idx]

def collate_fn(batch):
    """Turn a list of ``(waveform, label)`` pairs into padded model inputs.

    Args:
        batch: iterable of ``(waveform, label)`` pairs as produced by WavDataset.

    Returns:
        Tuple ``(input_values, labels)``: the extractor's padded ``input_values``
        tensor and a ``torch.long`` tensor of the labels.
    """
    waveforms, targets = zip(*batch)
    # Padding and tensor conversion are delegated to the module-level feature_extractor.
    encoded = feature_extractor(
        list(waveforms),
        sampling_rate=16000,
        padding=True,
        return_tensors="pt",
    )
    target_tensor = torch.tensor(targets, dtype=torch.long)
    return encoded["input_values"], target_tensor

# --- Use Test Set variables from Cell 2 ---
print(f"Loaded {len(test_files)} total files for testing.")
test_labels = [emotion_to_idx[name] for name in test_labels_str]

# --- Filter the test set for each dataset ---
# Each corpus is recognised by its folder name inside the file path. Files and
# labels are filtered with the same predicate so they stay aligned.
ravdess_test_files = [path for path in test_files if 'ravdess_data' in path]
crema_d_test_files = [path for path in test_files if 'crema_d_data' in path]
iemocap_test_files = [path for path in test_files if 'iemocap_data' in path]

ravdess_test_labels = [
    label for path, label in zip(test_files, test_labels) if 'ravdess_data' in path
]
crema_d_test_labels = [
    label for path, label in zip(test_files, test_labels) if 'crema_d_data' in path
]
iemocap_test_labels = [
    label for path, label in zip(test_files, test_labels) if 'iemocap_data' in path
]

# --- Evaluation Function (Upgraded to be more robust) ---
def evaluate(files, labels, name):
    """Print accuracy and a per-class report for the loaded model on one test subset.

    Args:
        files: audio file paths of the subset.
        labels: integer class indices aligned with ``files``.
        name: display name used in the progress bar and the printed headers.

    Returns:
        None. Results are printed; an empty ``files`` prints a skip notice instead.
    """
    if not files:
        print(f"\nSkipping evaluation for {name}: No test files found.")
        return

    loader = DataLoader(
        WavDataset(files, labels),
        batch_size=BATCH_SIZE,
        collate_fn=collate_fn,
        num_workers=2,
    )

    all_preds, all_true = [], []
    with torch.no_grad():
        for inputs, labs in tqdm(loader, desc=f"Evaluating on {name}"):
            inputs = inputs.to(device)
            labs = labs.to(device)
            # The predicted class is the index of the largest logit per sample.
            preds = model(inputs).logits.argmax(dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_true.extend(labs.cpu().numpy())

    accuracy = accuracy_score(all_true, all_preds)
    print(f"\n>>> Accuracy on {name}: {accuracy * 100:.2f}%")
    print(f"Classification Report for {name}:")

    # Report only on classes that occur in the targets or the predictions, so
    # that target_names always has the same length as the labels passed in.
    present_labels = np.unique(np.concatenate((all_true, all_preds)))
    target_names_present = [unified_emotion_labels[i] for i in present_labels]

    report = classification_report(
        all_true,
        all_preds,
        labels=present_labels,
        target_names=target_names_present,
        zero_division=0,
    )
    print(report)

# --- Run the Final Evaluations ---
print("\n--- FINAL MULTI-DOMAIN EVALUATION ---")
# One evaluation per corpus, in the order RAVDESS, CREMA-D, IEMOCAP.
for subset_files, subset_labels, subset_name in (
    (ravdess_test_files, ravdess_test_labels, "RAVDESS Test Set"),
    (crema_d_test_files, crema_d_test_labels, "CREMA-D Test Set"),
    (iemocap_test_files, iemocap_test_labels, "IEMOCAP Test Set"),
):
    evaluate(subset_files, subset_labels, subset_name)

Using device: cuda

Loading best fine-tuned Progressive HuBERT model...


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded 1791 total files for testing.

--- FINAL MULTI-DOMAIN EVALUATION ---


Evaluating on RAVDESS Test Set:   0%|          | 0/17 [00:00<?, ?it/s]


>>> Accuracy on RAVDESS Test Set: 45.86%
Classification Report for RAVDESS Test Set:
              precision    recall  f1-score   support

     neutral       0.48      1.00      0.65        13
       happy       0.60      0.22      0.32        27
         sad       0.86      0.24      0.38        25
       angry       0.28      1.00      0.44        21
     fearful       1.00      0.33      0.50        24
     disgust       1.00      0.30      0.47        23

    accuracy                           0.46       133
   macro avg       0.70      0.52      0.46       133
weighted avg       0.73      0.46      0.44       133



Evaluating on CREMA-D Test Set:   0%|          | 0/143 [00:00<?, ?it/s]


>>> Accuracy on CREMA-D Test Set: 44.71%
Classification Report for CREMA-D Test Set:
              precision    recall  f1-score   support

     neutral       0.35      0.96      0.51       167
       happy       0.48      0.07      0.12       199
         sad       0.49      0.41      0.45       205
       angry       0.44      0.95      0.60       184
     fearful       0.89      0.16      0.27       192
     disgust       0.96      0.24      0.38       196

    accuracy                           0.45      1143
   macro avg       0.60      0.47      0.39      1143
weighted avg       0.61      0.45      0.38      1143



Evaluating on IEMOCAP Test Set:   0%|          | 0/65 [00:00<?, ?it/s]


>>> Accuracy on IEMOCAP Test Set: 66.60%
Classification Report for IEMOCAP Test Set:
              precision    recall  f1-score   support

     neutral       0.56      0.84      0.67       164
       happy       0.79      0.49      0.60       134
         sad       0.79      0.57      0.66        97
       angry       0.75      0.75      0.75       113
     fearful       0.00      0.00      0.00         7

    accuracy                           0.67       515
   macro avg       0.58      0.53      0.54       515
weighted avg       0.70      0.67      0.66       515



In [None]:
# List every package installed in this runtime together with its version.
!pip freeze

absl-py==1.4.0
absolufy-imports==0.3.1
accelerate==1.10.1
aiofiles==24.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.12.15
aiosignal==1.4.0
alabaster==1.0.0
albucore==0.0.24
albumentations==2.0.8
ale-py==0.11.2
alembic==1.16.5
altair==5.5.0
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.10.0
anywidget==0.9.18
argon2-cffi==25.1.0
argon2-cffi-bindings==25.1.0
array_record==0.8.1
arrow==1.3.0
arviz==0.22.0
astropy==7.1.0
astropy-iers-data==0.2025.9.1.0.42.11
astunparse==1.6.3
atpublic==5.1
attrs==25.3.0
audioread==3.0.1
Authlib==1.6.3
autograd==1.8.0
babel==2.17.0
backcall==0.2.0
beartype==0.21.0
beautifulsoup4==4.13.5
betterproto==2.0.0b6
bigframes==2.17.0
bigquery-magics==0.10.3
bleach==6.2.0
blinker==1.9.0
blis==1.3.0
blobfile==3.0.0
blosc2==3.7.2
bokeh==3.7.3
Bottleneck==1.4.2
bqplot==0.12.45
branca==0.8.1
Brotli==1.1.0
build==1.3.0
CacheControl==0.14.3
cachetools==5.5.2
catalogue==2.0.10
certifi==2025.8.3
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.3
chex==0.1.90
clar