In [None]:
# Install torchcodec with the %pip magic so the package lands in the environment
# of the running kernel (a plain `!pip` shell call is not guaranteed to target it).
%pip install torchcodec




In [None]:
import kagglehub
import os, math, random, argparse, itertools
from dataclasses import dataclass
from typing import List, Tuple, Dict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torch.utils.data import Dataset, DataLoader

from transformers import WavLMModel, WavLMConfig, AutoConfig, AutoModel
import numpy as np
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
)
from collections import defaultdict
from tqdm import tqdm
import pandas as pd
import glob

In [None]:

# Download the latest version of the CREMA-D dataset through kagglehub.
# `path` keeps the local directory returned by the download call.
path = kagglehub.dataset_download("akaiinu/crema-d")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'crema-d' dataset.
Path to dataset files: /kaggle/input/crema-d


In [None]:
# Build the audio folder from the directory kagglehub returned instead of
# hard-coding one machine's cache location: the dataset root is not the same in
# every session (e.g. a mounted /kaggle/input copy vs. a ~/.cache/kagglehub download).
audio_path = os.path.join(path, "cremad", "AudioWAV")
# Sorted so the listing is stable from run to run.
files = sorted(glob.glob(os.path.join(audio_path, "*.wav")))

In [None]:
# =========================
# Utils
# =========================
SEED = 1337
# Seed every RNG used here: Python's `random`, NumPy, then PyTorch.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Emotion code (third field of a CREMA-D file name) -> integer class index.
EMO_MAP = dict(zip(("ANG", "DIS", "FEA", "HAP", "NEU", "SAD"), range(6)))
# Reverse lookup: integer class index -> emotion code.
IDX2EMO = {class_idx: emo_code for emo_code, class_idx in EMO_MAP.items()}

def set_deterministic():
    """Switch the cuDNN backend flags to their reproducible settings.

    Sets ``torch.backends.cudnn.deterministic = True`` and
    ``torch.backends.cudnn.benchmark = False``. This function does not seed
    any random number generator.
    """
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def parse_label_from_filename(fname: str) -> int:
    """Return the integer emotion label encoded in a CREMA-D file name.

    File names follow ``ID_Sentence_Emotion_Intensity.wav``
    (for example ``1001_DFA_ANG_XX.wav``); the third underscore-separated
    field is the emotion code and is looked up in ``EMO_MAP``.

    Raises:
        IndexError: the base name has fewer than three ``_``-separated fields.
        KeyError: the emotion code is not a key of ``EMO_MAP``.
    """
    base_name = os.path.basename(fname)
    # Only the first three fields matter, so stop splitting after them.
    emotion_code = base_name.split("_", 3)[2]
    return EMO_MAP[emotion_code]

def compute_UA_WA(y_true, y_pred, n_classes=6):
    """Compute unweighted accuracy (UA), weighted accuracy (WA) and the confusion matrix.

    Args:
        y_true: true class indices.
        y_pred: predicted class indices.
        n_classes: number of classes; labels are taken to be ``0 .. n_classes - 1``.

    Returns:
        ``(UA, WA, cm)`` where UA is the mean of the per-class recalls (a class
        with no true samples contributes 0.0), WA is the overall accuracy
        (0.0 when there are no samples at all) and ``cm`` is the confusion matrix
        with true classes on the rows.
    """
    cm = confusion_matrix(y_true, y_pred, labels=list(range(n_classes)))
    per_class_acc = []
    for c in range(n_classes):
        support = cm[c].sum()  # number of true samples of class c
        per_class_acc.append((cm[c, c] / support) if support > 0 else 0.0)
    UA = float(np.mean(per_class_acc))
    n_samples = cm.sum()
    # Guard the overall accuracy the same way as the per-class one: an empty
    # evaluation set would otherwise raise ZeroDivisionError.
    WA = (float(np.trace(cm)) / float(n_samples)) if n_samples > 0 else 0.0
    return UA, WA, cm


In [None]:

# =========================
# Dataset
# =========================
class CREMADataset(Dataset):
    """CREMA-D utterances with a seeded, per-emotion (stratified) train/val/test split.

    Every ``.wav`` file directly inside ``data_root`` is bucketed by the emotion
    parsed from its file name; each bucket is shuffled with ``random.Random(SEED)``
    and cut into test / val / train slices, so building the dataset three times
    with the same arguments yields three disjoint splits.

    NOTE(review): the split is made per file, not per speaker, so the same
    speaker ID can appear in more than one split.

    Args:
        data_root: directory that contains the ``.wav`` files.
        split: one of ``"train"``, ``"val"``, ``"test"``.
        sr: target sampling rate; audio with another rate is resampled on load.
        val_ratio: fraction of each emotion bucket assigned to validation.
        test_ratio: fraction of each emotion bucket assigned to test.
    """

    def __init__(self, data_root: str, split: str, sr: int = 16000, val_ratio: float = 0.1, test_ratio: float = 0.1):
        assert split in ["train","val","test"]
        self.sr = sr
        # Sorted listing so the seeded shuffle below starts from a fixed order.
        all_files = sorted(
            os.path.join(data_root, f)
            for f in os.listdir(data_root)
            if f.lower().endswith(".wav")
        )

        # Bucket the files by emotion label.
        emo_buckets = defaultdict(list)
        for f in all_files:
            try:
                lab = parse_label_from_filename(f)
            except (IndexError, KeyError):
                # The file name does not follow the CREMA-D naming scheme or uses
                # an unknown emotion code: skip it. Only these two errors are
                # expected here; anything else is a real bug and must surface.
                continue
            emo_buckets[lab].append(f)

        train_files, val_files, test_files = [], [], []
        rng = random.Random(SEED)
        for lst in emo_buckets.values():
            rng.shuffle(lst)
            n = len(lst)
            n_test = int(round(test_ratio*n))
            n_val  = int(round(val_ratio*n))
            # Slice order is test, then val, then everything else for training.
            test_files += lst[:n_test]
            val_files  += lst[n_test:n_test+n_val]
            train_files += lst[n_test+n_val:]

        # Keep only the file list of the requested split.
        if split == "train":
            self.files = train_files
        elif split == "val":
            self.files = val_files
        else:
            self.files = test_files

    def __len__(self):
        """Number of utterances in this split."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load one utterance.

        Returns:
            ``(wav, y, path)``: the mono waveform as a 1-D tensor at ``self.sr``
            scaled by its peak absolute value, the integer emotion label, and
            the file path.
        """
        path = self.files[idx]
        y = parse_label_from_filename(path)
        wav, sr = torchaudio.load(path)
        wav = wav.mean(dim=0, keepdim=True)  # average the channels -> mono, shape [1, T]
        if sr != self.sr:
            wav = torchaudio.functional.resample(wav, sr, self.sr)
        wav = wav.squeeze(0)  # drop the channel axis -> [T]
        # Peak-normalise; the epsilon avoids dividing by zero on silent audio.
        wav = wav / (wav.abs().max() + 1e-9)
        return wav, y, path

def collate_batch(batch):
    """Collate ``(wav, label, path)`` samples into one zero-padded batch.

    Args:
        batch: sequence of ``(wav, label, path)`` tuples, ``wav`` being a 1-D tensor.

    Returns:
        ``(audio, labels, lengths, paths)``: ``audio`` is ``[B, T_max]`` with
        shorter waveforms right-padded with zeros, ``labels`` and ``lengths``
        are ``torch.long`` tensors of size ``B`` (``lengths`` holds the
        unpadded lengths), and ``paths`` is the tuple of file paths.
    """
    wavs, labels, paths = zip(*batch)
    lengths = [wav.shape[0] for wav in wavs]
    longest = max(lengths)
    # Right-pad every waveform up to the longest one and stack them along a new batch axis.
    audio = torch.stack([F.pad(wav, (0, longest - n)) for wav, n in zip(wavs, lengths)], dim=0)
    label_tensor = torch.tensor(labels, dtype=torch.long)
    length_tensor = torch.tensor(lengths, dtype=torch.long)
    return audio, label_tensor, length_tensor, paths


In [None]:

# =========================
# SL-WDEE Emotional Encoder
# =========================
class ContextualTransform(nn.Module):
    """
    Stack each frame with its temporal neighbours (frames t-L .. t+R).

    The sequence is edge-padded in time (the first / last frame is repeated),
    so the number of frames is unchanged.

    Input:  [B, T, D] (batch, time, feature)
    Output: [B, T, D*(L+R+1)], neighbours ordered from t-L to t+R
    """
    def __init__(self, left=5, right=5):
        super().__init__()
        self.l = left   # number of context frames to the left
        self.r = right  # number of context frames to the right

    def forward(self, x):
        # x: [B, T, D]
        B, T, D = x.shape
        window = self.l + self.r + 1
        # F.pad works on the last axis, so time is moved last for the padding
        # and moved back afterwards: [B, T+L+R, D].
        padded = F.pad(x.transpose(1, 2), pad=(self.l, self.r), mode="replicate").transpose(1, 2)
        # All length-`window` slices along time, one per output frame: [B, T, D, window].
        windows = padded.unfold(1, window, 1)
        # Put the window axis before the feature axis so that flattening lays the
        # neighbours out one after another: [B, T, window*D].
        return windows.permute(0, 1, 3, 2).reshape(B, T, window * D)

class EmotionalEncoder(nn.Module):
    """
    Emotional encoder:
    LayerNorm -> ContextualTransform -> Linear -> Sigmoid -> Linear -> Sigmoid.

    ``forward`` returns the frame-level embeddings and their mean over time
    (the utterance-level embedding).
    """
    def __init__(self, in_dim, hidden_dim=512, out_dim=256, left=5, right=5):
        super().__init__()
        context_frames = left + right + 1  # frames stacked per time step
        self.norm = nn.LayerNorm(in_dim)
        self.ctx = ContextualTransform(left, right)
        self.fc1 = nn.Linear(in_dim * context_frames, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)

    def forward(self, x):
        # x: [B, T, D]
        stacked = self.ctx(self.norm(x))        # [B, T, D*(left+right+1)]
        hidden = self.fc1(stacked).sigmoid()    # [B, T, hidden_dim]
        frame_emb = self.fc2(hidden).sigmoid()  # [B, T, out_dim]
        # Utterance-level embedding: plain average over the time axis.
        utt_emb = frame_emb.mean(dim=1)         # [B, out_dim]
        return frame_emb, utt_emb


In [None]:
# =========================
# Classifier (utterance level)
# =========================
class MLPClassifier(nn.Module):
    """Small MLP head that maps an utterance-level embedding to class logits.

    Layout: Linear(in_dim, 256) -> ReLU -> Dropout(0.2) ->
    Linear(256, 128) -> ReLU -> Dropout(0.2) -> Linear(128, num_classes).
    """
    def __init__(self, in_dim=256, num_classes=6):
        super().__init__()
        widths = [in_dim, 256, 128]
        layers = []
        # Hidden stages: Linear + ReLU + Dropout for each consecutive pair of widths.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.ReLU(inplace=True), nn.Dropout(0.2)]
        # Output projection to one logit per class.
        layers.append(nn.Linear(widths[-1], num_classes))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # x: [B, in_dim] -> logits [B, num_classes]
        logits = self.net(x)
        return logits


In [None]:
# =========================
# SER Model (WavLM -> SL-WDEE -> Classifier)
# =========================
class SER_WavLM_WDEE(nn.Module):
    """Speech emotion recognition model: WavLM -> EmotionalEncoder -> MLPClassifier.

    Args:
        wavlm_name: Hugging Face model id passed to ``AutoModel.from_pretrained``.
        wdee_hidden: hidden width of the emotional encoder.
        wdee_out: output (embedding) width of the emotional encoder.
        freeze_wavlm: when True, WavLM's parameters get ``requires_grad = False``
            and its forward pass runs without autograd. When False, gradients
            flow through WavLM so it can be fine-tuned.
    """
    def __init__(self, wavlm_name="microsoft/wavlm-base-plus",
                 wdee_hidden=512, wdee_out=256, freeze_wavlm=True):
        super().__init__()
        # Remembered so the forward pass knows whether WavLM needs autograd.
        self.freeze_wavlm = freeze_wavlm
        self.wavlm = AutoModel.from_pretrained(wavlm_name)
        hidden = self.wavlm.config.hidden_size  # feature width produced by WavLM
        self.encoder = EmotionalEncoder(in_dim=hidden, hidden_dim=wdee_hidden, out_dim=wdee_out)
        self.cls = MLPClassifier(in_dim=wdee_out, num_classes=6)
        if freeze_wavlm:
            for p in self.wavlm.parameters():
                p.requires_grad = False

    def _run_wavlm(self, wav_batch, attention_mask):
        # Single call site for the backbone so both grad modes share the same arguments.
        return self.wavlm(input_values=wav_batch, attention_mask=attention_mask, output_hidden_states=False)

    def _extract_wavlm_features(self, wav_batch, lengths):
        """Run WavLM on a padded batch and return its last hidden states.

        Args:
            wav_batch: ``[B, T]`` padded waveforms.
            lengths: ``[B]`` unpadded length of each waveform, in samples.

        Returns:
            ``[B, T', D]`` frame-level features (``T'`` is the number of frames
            WavLM produces).
        """
        _, T = wav_batch.shape
        lengths = lengths.to(wav_batch.device)
        # 1 where a sample belongs to the original signal, 0 where it is padding.
        attention_mask = (torch.arange(T, device=wav_batch.device)[None, :] < lengths[:, None]).long()
        if self.freeze_wavlm:
            # A frozen backbone needs no autograd graph.
            with torch.no_grad():
                outputs = self._run_wavlm(wav_batch, attention_mask)
        else:
            # Trainable backbone: inherit the caller's grad mode, so training
            # gets gradients while an enclosing no_grad (evaluation) still applies.
            outputs = self._run_wavlm(wav_batch, attention_mask)
        return outputs.last_hidden_state

    def forward(self, wav_batch, lengths):
        """Return ``(logits, utt_wdee)``: class logits ``[B, 6]`` and the utterance embedding ``[B, E]``."""
        feats = self._extract_wavlm_features(wav_batch, lengths)   # [B, T', D]
        per_frame_wdee, utt_wdee = self.encoder(feats)              # [B, T', E], [B, E]
        logits = self.cls(utt_wdee)                                 # [B, 6]
        return logits, utt_wdee


In [None]:
# =========================
# Training / Evaluation
# =========================
@dataclass
class TrainConfig:
    """Hyper-parameters and runtime settings for one training run."""
    # Directory that contains the .wav files (required, no default).
    data_root: str
    batch_size: int = 8
    epochs: int = 20
    lr: float = 2e-4
    # Passed to the model; True disables gradients for the WavLM parameters.
    freeze_wavlm: bool = True
    # Default is evaluated once, when the class is defined.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    wavlm_name: str = "microsoft/wavlm-base-plus"
    # Hidden and output widths of the emotional encoder.
    wdee_hidden: int = 512
    wdee_out: int = 256
    # Target sampling rate handed to the dataset.
    sr: int = 16000
    # Fractions of each emotion bucket used for validation / test.
    val_ratio: float = 0.1
    test_ratio: float = 0.1
    # Max gradient norm; the training loop skips clipping when this is None.
    grad_clip: float = 1.0
    mixed_precision: bool = True # Use mixed-precision training for speed (autocast/scaler are only enabled on a CUDA device)

def train_one_epoch(model, loader, optim, scaler, cfg):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: called as ``model(wav, lengths)`` and expected to return ``(logits, _)``.
        loader: yields ``(wav, labels, lengths, paths)`` batches.
        optim: optimizer to step.
        scaler: gradient scaler, used only when ``cfg.mixed_precision`` is True.
        cfg: configuration providing ``device``, ``mixed_precision`` and ``grad_clip``.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen in the epoch.
    """
    model.train()
    # Autocast is only switched on for CUDA devices.
    use_autocast = cfg.device.startswith("cuda")
    total_loss, total, correct = 0.0, 0, 0
    for wav, y, lengths, _ in tqdm(loader, desc="Train", leave=False):
        wav, y = wav.to(cfg.device), y.to(cfg.device)
        optim.zero_grad(set_to_none=True)
        if cfg.mixed_precision:
            # torch.amp.autocast("cuda", ...) replaces the deprecated
            # torch.cuda.amp.autocast(...), which warns on every call.
            with torch.amp.autocast("cuda", enabled=use_autocast):
                logits, _ = model(wav, lengths)
                loss = F.cross_entropy(logits, y)
            scaler.scale(loss).backward()
            # Gradients must be unscaled before their norm is clipped.
            scaler.unscale_(optim)
            if cfg.grad_clip is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
            scaler.step(optim)
            scaler.update()
        else:
            # Plain full-precision step.
            logits, _ = model(wav, lengths)
            loss = F.cross_entropy(logits, y)
            loss.backward()
            if cfg.grad_clip is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
            optim.step()

        # The loss is a batch mean, so weight it by the batch size before summing.
        total_loss += float(loss.item()) * len(y)
        pred = logits.argmax(dim=1)
        correct += int((pred==y).sum().item())
        total += len(y)
    return total_loss/total, correct/total

@torch.no_grad()
def evaluate(model, loader, cfg):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: called as ``model(wav, lengths)`` and expected to return ``(logits, _)``.
        loader: yields ``(wav, labels, lengths, paths)`` batches.
        cfg: configuration providing ``device``.

    Returns:
        A 12-tuple, in this order: mean loss, accuracy, UA, WA, confusion
        matrix, macro F1, weighted F1, macro precision, weighted precision,
        macro recall, weighted recall, classification report (text).
    """
    model.eval()
    class_ids = list(range(6))
    total_loss, total, correct = 0.0, 0, 0
    y_true, y_pred = [], []

    for wav, y, lengths, _ in tqdm(loader, desc="Eval", leave=False):
        wav, y = wav.to(cfg.device), y.to(cfg.device)
        logits, _ = model(wav, lengths)
        loss = F.cross_entropy(logits, y)
        # The loss is a batch mean, so weight it by the batch size before summing.
        total_loss += float(loss.item()) * len(y)

        pred = logits.argmax(dim=1)
        correct += int((pred == y).sum().item())
        total += len(y)

        y_true += y.detach().cpu().tolist()
        y_pred += pred.detach().cpu().tolist()

    # UA, WA and the confusion matrix.
    UA, WA, cm = compute_UA_WA(y_true, y_pred, n_classes=6)

    # zero_division=0 keeps the value scikit-learn already falls back to when a
    # class is never predicted (or never present) but without emitting an
    # UndefinedMetricWarning for every metric on every epoch.
    macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    weighted_f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)

    macro_precision = precision_score(y_true, y_pred, average="macro", zero_division=0)
    weighted_precision = precision_score(y_true, y_pred, average="weighted", zero_division=0)

    macro_recall = recall_score(y_true, y_pred, average="macro", zero_division=0)
    weighted_recall = recall_score(y_true, y_pred, average="weighted", zero_division=0)

    # `labels` pins the report to all six classes; without it the six
    # target_names no longer match when a class is missing from both y_true and
    # y_pred, and classification_report raises.
    cls_report = classification_report(
        y_true, y_pred,
        labels=class_ids,
        target_names=[IDX2EMO[i] for i in class_ids],
        digits=4,
        zero_division=0,
    )

    return (
        total_loss / total,
        correct / total,
        UA,
        WA,
        cm,
        macro_f1,
        weighted_f1,
        macro_precision,
        weighted_precision,
        macro_recall,
        weighted_recall,
        cls_report,
    )

In [None]:
"""Origin """
def main():
    """Baseline experiment: frozen WavLM, one learning rate for all trainable parameters.

    Trains for ``cfg.epochs`` epochs, keeps the weights of the epoch with the
    best validation UA, reloads them and reports the metrics on the test split.
    """
    set_deterministic()  # cuDNN determinism flags (the seeds are set at module level)
    cfg = TrainConfig(
        # Audio folder derived from the directory kagglehub returned, so the run
        # does not depend on where one particular session stored the dataset.
        data_root=os.path.join(path, "cremad", "AudioWAV"),
        epochs=20,
        batch_size=8,
        lr=2e-4,
        freeze_wavlm=True,  # WavLM stays frozen in this experiment
    )
    print(cfg)

    # The three datasets share the split settings so the splits are disjoint.
    split_kwargs = dict(sr=cfg.sr, val_ratio=cfg.val_ratio, test_ratio=cfg.test_ratio)
    train_ds = CREMADataset(cfg.data_root, "train", **split_kwargs)
    val_ds   = CREMADataset(cfg.data_root, "val",   **split_kwargs)
    test_ds  = CREMADataset(cfg.data_root, "test",  **split_kwargs)

    loader_kwargs = dict(batch_size=cfg.batch_size, num_workers=4, collate_fn=collate_batch, pin_memory=True)
    train_loader = DataLoader(train_ds, shuffle=True,  **loader_kwargs)
    val_loader   = DataLoader(val_ds,   shuffle=False, **loader_kwargs)
    test_loader  = DataLoader(test_ds,  shuffle=False, **loader_kwargs)

    model = SER_WavLM_WDEE(
        wavlm_name=cfg.wavlm_name,
        wdee_hidden=cfg.wdee_hidden,
        wdee_out=cfg.wdee_out,
        freeze_wavlm=cfg.freeze_wavlm
    ).to(cfg.device)

    # Only parameters that require gradients are handed to the optimizer.
    optim = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=cfg.lr)
    # torch.amp.GradScaler("cuda", ...) replaces the deprecated torch.cuda.amp.GradScaler(...).
    scaler = torch.amp.GradScaler("cuda", enabled=(cfg.device.startswith("cuda") and cfg.mixed_precision))

    best_val_ua, best_state = -1.0, None
    for epoch in range(1, cfg.epochs+1):
        tr_loss, tr_acc = train_one_epoch(model, train_loader, optim, scaler, cfg)
        # evaluate() returns 12 values; only the first seven are reported here.
        val_loss, val_acc, val_UA, val_WA, _, val_f1_macro, val_f1_weighted, *_ = evaluate(model, val_loader, cfg)

        print(
            f"[Epoch {epoch:02d}] "
            f"Train loss {tr_loss:.4f} acc {tr_acc:.4f} | "
            f"Val loss {val_loss:.4f} acc {val_acc:.4f} "
            f"UA {val_UA:.4f} WA {val_WA:.4f} "
            f"F1(macro) {val_f1_macro:.4f} F1(weighted) {val_f1_weighted:.4f}"
        )
        # Snapshot the weights of the best epoch (by validation UA). copy=True
        # forces a real copy even on CPU, where .cpu() would hand back the live
        # tensors and the "best" snapshot would silently follow later updates.
        if val_UA > best_val_ua:
            best_val_ua = val_UA
            best_state = {k: v.detach().to("cpu", copy=True) for k, v in model.state_dict().items()}

    # Restore the best validation checkpoint (load_state_dict copies the values
    # into the existing parameters, whatever device they live on).
    if best_state is not None:
        model.load_state_dict(best_state)

    # Final evaluation on the held-out test split.
    te_loss, te_acc, te_UA, te_WA, _, te_f1_macro, te_f1_weighted, *_ = evaluate(model, test_loader, cfg)
    print("\n=== TEST RESULTS (CREMA-D) ===")
    # These are test metrics, so they are labelled as such; the validation
    # figure shown is the one that selected the checkpoint.
    print(f"Test loss {te_loss:.4f} acc {te_acc:.4f} "
            f"UA {te_UA:.4f} WA {te_WA:.4f} "
            f"F1(macro) {te_f1_macro:.4f} F1(weighted) {te_f1_weighted:.4f} | "
            f"Best val UA {best_val_ua:.4f}")
    # To inspect per-class errors, keep the fifth value returned by evaluate()
    # (the confusion matrix) and wrap it in a labelled pandas DataFrame.



In [None]:
if __name__ == "__main__":
    main() # Run main() when this code is executed directly

TrainConfig(data_root='/kaggle/input/crema-d/cremad/AudioWAV', batch_size=8, epochs=20, lr=0.0002, freeze_wavlm=True, device='cuda', wavlm_name='microsoft/wavlm-base-plus', wdee_hidden=512, wdee_out=256, sr=16000, val_ratio=0.1, test_ratio=0.1, grad_clip=1.0, mixed_precision=True)


  scaler = torch.cuda.amp.GradScaler(enabled=(cfg.device.startswith("cuda") and cfg.mixed_precision))
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[Epoch 01] Train loss 1.5683 acc 0.3371 | Val loss 1.5021 acc 0.3441 UA 0.3418 WA 0.3441 F1(macro) 0.2764 F1(weighted) 0.2784


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[Epoch 02] Train loss 1.4128 acc 0.4167 | Val loss 1.4205 acc 0.4140 UA 0.4111 WA 0.4140 F1(macro) 0.3577 F1(weighted) 0.3599


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 03] Train loss 1.2804 acc 0.4899 | Val loss 1.2410 acc 0.5175 UA 0.5282 WA 0.5175 F1(macro) 0.4780 F1(weighted) 0.4756


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 04] Train loss 1.1326 acc 0.5590 | Val loss 1.0939 acc 0.5726 UA 0.5783 WA 0.5726 F1(macro) 0.5713 F1(weighted) 0.5672


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 05] Train loss 1.0730 acc 0.5858 | Val loss 1.0442 acc 0.5524 UA 0.5569 WA 0.5524 F1(macro) 0.5600 F1(weighted) 0.5553


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 06] Train loss 1.0120 acc 0.6087 | Val loss 1.1542 acc 0.5914 UA 0.5993 WA 0.5914 F1(macro) 0.5666 F1(weighted) 0.5626


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 07] Train loss 0.9880 acc 0.6229 | Val loss 0.9671 acc 0.6196 UA 0.6254 WA 0.6196 F1(macro) 0.6135 F1(weighted) 0.6099


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 08] Train loss 0.9552 acc 0.6340 | Val loss 0.9515 acc 0.6290 UA 0.6337 WA 0.6290 F1(macro) 0.6204 F1(weighted) 0.6169


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 09] Train loss 0.9491 acc 0.6460 | Val loss 0.9515 acc 0.6358 UA 0.6409 WA 0.6358 F1(macro) 0.6254 F1(weighted) 0.6217


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 10] Train loss 0.9108 acc 0.6612 | Val loss 1.0189 acc 0.6223 UA 0.6267 WA 0.6223 F1(macro) 0.6104 F1(weighted) 0.6066


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 11] Train loss 0.9102 acc 0.6560 | Val loss 1.0751 acc 0.6183 UA 0.6256 WA 0.6183 F1(macro) 0.5943 F1(weighted) 0.5890


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 12] Train loss 0.8955 acc 0.6648 | Val loss 0.9405 acc 0.6425 UA 0.6470 WA 0.6425 F1(macro) 0.6457 F1(weighted) 0.6426


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 13] Train loss 0.8727 acc 0.6725 | Val loss 0.9518 acc 0.6519 UA 0.6588 WA 0.6519 F1(macro) 0.6424 F1(weighted) 0.6395


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 14] Train loss 0.8534 acc 0.6822 | Val loss 0.9264 acc 0.6559 UA 0.6632 WA 0.6559 F1(macro) 0.6449 F1(weighted) 0.6419


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 15] Train loss 0.8393 acc 0.6913 | Val loss 1.0361 acc 0.6116 UA 0.6177 WA 0.6116 F1(macro) 0.5954 F1(weighted) 0.5919


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 16] Train loss 0.8125 acc 0.6990 | Val loss 0.8904 acc 0.6680 UA 0.6722 WA 0.6680 F1(macro) 0.6652 F1(weighted) 0.6613


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 17] Train loss 0.8055 acc 0.6995 | Val loss 0.9016 acc 0.6734 UA 0.6768 WA 0.6734 F1(macro) 0.6699 F1(weighted) 0.6670


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 18] Train loss 0.7962 acc 0.7071 | Val loss 0.8695 acc 0.6720 UA 0.6776 WA 0.6720 F1(macro) 0.6670 F1(weighted) 0.6642


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 19] Train loss 0.7591 acc 0.7210 | Val loss 0.8419 acc 0.6828 UA 0.6864 WA 0.6828 F1(macro) 0.6852 F1(weighted) 0.6819


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 20] Train loss 0.7660 acc 0.7151 | Val loss 0.9208 acc 0.6868 UA 0.6916 WA 0.6868 F1(macro) 0.6780 F1(weighted) 0.6749


                                                     


=== TEST RESULTS (CREMA-D) ===
Train loss 0.8810 acc 0.6640 | Val loss 0.9208 acc 0.6868 UA 0.6689 WA 0.6640 F1(macro) 0.6558 F1(weighted) 0.6527




In [None]:
"""Finetune wavlm with diff optimize for wavlm and encoder/classifier"""
def main():
    """Fine-tuning experiment: separate learning rates for WavLM and for the encoder/classifier.

    Trains for ``cfg.epochs`` epochs, keeps the weights of the epoch with the
    best validation UA, reloads them and reports the metrics on the test split.
    """
    set_deterministic()  # cuDNN determinism flags (the seeds are set at module level)
    cfg = TrainConfig(
        data_root=audio_path,
        epochs=20,
        batch_size=8,
        lr=2e-4,  # not used by the optimizer below, which sets one rate per parameter group
        # The optimizer below has a dedicated WavLM parameter group, which only
        # makes sense when the WavLM weights are trainable; with
        # freeze_wavlm=True that group never receives a gradient.
        freeze_wavlm=False,
    )
    print(cfg)

    # The three datasets share the split settings so the splits are disjoint.
    split_kwargs = dict(sr=cfg.sr, val_ratio=cfg.val_ratio, test_ratio=cfg.test_ratio)
    train_ds = CREMADataset(cfg.data_root, "train", **split_kwargs)
    val_ds   = CREMADataset(cfg.data_root, "val",   **split_kwargs)
    test_ds  = CREMADataset(cfg.data_root, "test",  **split_kwargs)

    loader_kwargs = dict(batch_size=cfg.batch_size, num_workers=4, collate_fn=collate_batch, pin_memory=True)
    train_loader = DataLoader(train_ds, shuffle=True,  **loader_kwargs)
    val_loader   = DataLoader(val_ds,   shuffle=False, **loader_kwargs)
    test_loader  = DataLoader(test_ds,  shuffle=False, **loader_kwargs)

    model = SER_WavLM_WDEE(
        wavlm_name=cfg.wavlm_name,
        wdee_hidden=cfg.wdee_hidden,
        wdee_out=cfg.wdee_out,
        freeze_wavlm=cfg.freeze_wavlm
    ).to(cfg.device)

    # Small learning rate for the pretrained backbone, larger one for the new
    # layers trained from scratch.
    optim = torch.optim.AdamW(
        [
            {'params': model.wavlm.parameters(), 'lr': 1e-5},
            {'params': model.encoder.parameters(), 'lr': 1e-4},
            {'params': model.cls.parameters(), 'lr': 1e-4},
        ],
        weight_decay=1e-4,
    )
    # torch.amp.GradScaler("cuda", ...) replaces the deprecated torch.cuda.amp.GradScaler(...).
    scaler = torch.amp.GradScaler("cuda", enabled=(cfg.device.startswith("cuda") and cfg.mixed_precision))

    best_val_ua, best_state = -1.0, None
    for epoch in range(1, cfg.epochs+1):
        tr_loss, tr_acc = train_one_epoch(model, train_loader, optim, scaler, cfg)
        # evaluate() returns 12 values; only the first seven are reported here.
        val_loss, val_acc, val_UA, val_WA, _, val_f1_macro, val_f1_weighted, *_ = evaluate(model, val_loader, cfg)

        print(
            f"[Epoch {epoch:02d}] "
            f"Train loss {tr_loss:.4f} acc {tr_acc:.4f} | "
            f"Val loss {val_loss:.4f} acc {val_acc:.4f} "
            f"UA {val_UA:.4f} WA {val_WA:.4f} "
            f"F1(macro) {val_f1_macro:.4f} F1(weighted) {val_f1_weighted:.4f}"
        )
        # Snapshot the weights of the best epoch (by validation UA). copy=True
        # forces a real copy even on CPU, where .cpu() would hand back the live
        # tensors and the "best" snapshot would silently follow later updates.
        if val_UA > best_val_ua:
            best_val_ua = val_UA
            best_state = {k: v.detach().to("cpu", copy=True) for k, v in model.state_dict().items()}

    # Restore the best validation checkpoint (load_state_dict copies the values
    # into the existing parameters, whatever device they live on).
    if best_state is not None:
        model.load_state_dict(best_state)

    # Final evaluation on the held-out test split.
    te_loss, te_acc, te_UA, te_WA, _, te_f1_macro, te_f1_weighted, *_ = evaluate(model, test_loader, cfg)
    print("\n=== TEST RESULTS (CREMA-D) ===")
    # These are test metrics, so they are labelled as such; the validation
    # figure shown is the one that selected the checkpoint.
    print(f"Test loss {te_loss:.4f} acc {te_acc:.4f} "
            f"UA {te_UA:.4f} WA {te_WA:.4f} "
            f"F1(macro) {te_f1_macro:.4f} F1(weighted) {te_f1_weighted:.4f} | "
            f"Best val UA {best_val_ua:.4f}")
    # To inspect per-class errors, keep the fifth value returned by evaluate()
    # (the confusion matrix) and wrap it in a labelled pandas DataFrame.



In [None]:
if __name__ == "__main__":
    main() # Run main() when this code is executed directly

TrainConfig(data_root='/root/.cache/kagglehub/datasets/akaiinu/crema-d/versions/1/cremad/AudioWAV', batch_size=8, epochs=20, lr=0.0002, freeze_wavlm=True, device='cuda', wavlm_name='microsoft/wavlm-base-plus', wdee_hidden=512, wdee_out=256, sr=16000, val_ratio=0.1, test_ratio=0.1, grad_clip=1.0, mixed_precision=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  scaler = torch.cuda.amp.GradScaler(enabled=(cfg.device.startswith("cuda") and cfg.mixed_precision))
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[Epoch 01] Train loss 1.5989 acc 0.3275 | Val loss 1.4571 acc 0.4126 UA 0.4029 WA 0.4126 F1(macro) 0.3286 F1(weighted) 0.3366


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[Epoch 02] Train loss 1.4383 acc 0.4043 | Val loss 1.4020 acc 0.4382 UA 0.4395 WA 0.4382 F1(macro) 0.3587 F1(weighted) 0.3576


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 03] Train loss 1.3895 acc 0.4259 | Val loss 1.3437 acc 0.4637 UA 0.4593 WA 0.4637 F1(macro) 0.4148 F1(weighted) 0.4174


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 04] Train loss 1.3331 acc 0.4599 | Val loss 1.3340 acc 0.4637 UA 0.4612 WA 0.4637 F1(macro) 0.4318 F1(weighted) 0.4340


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 05] Train loss 1.2742 acc 0.4881 | Val loss 1.2221 acc 0.5215 UA 0.5231 WA 0.5215 F1(macro) 0.4887 F1(weighted) 0.4891


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 06] Train loss 1.1832 acc 0.5366 | Val loss 1.1592 acc 0.5645 UA 0.5690 WA 0.5645 F1(macro) 0.5396 F1(weighted) 0.5381


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 07] Train loss 1.1055 acc 0.5715 | Val loss 1.2242 acc 0.5296 UA 0.5357 WA 0.5296 F1(macro) 0.5070 F1(weighted) 0.5051


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 08] Train loss 1.0535 acc 0.5972 | Val loss 1.0811 acc 0.6008 UA 0.6089 WA 0.6008 F1(macro) 0.5826 F1(weighted) 0.5811


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 09] Train loss 1.0355 acc 0.6045 | Val loss 1.0509 acc 0.5927 UA 0.5941 WA 0.5927 F1(macro) 0.5883 F1(weighted) 0.5855


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 10] Train loss 0.9961 acc 0.6234 | Val loss 1.0229 acc 0.6169 UA 0.6234 WA 0.6169 F1(macro) 0.6063 F1(weighted) 0.6028


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 11] Train loss 0.9847 acc 0.6297 | Val loss 1.0100 acc 0.6048 UA 0.6111 WA 0.6048 F1(macro) 0.5941 F1(weighted) 0.5909


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 12] Train loss 0.9662 acc 0.6349 | Val loss 1.0487 acc 0.6102 UA 0.6136 WA 0.6102 F1(macro) 0.5977 F1(weighted) 0.5945


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 13] Train loss 0.9463 acc 0.6384 | Val loss 1.0718 acc 0.5981 UA 0.6065 WA 0.5981 F1(macro) 0.5792 F1(weighted) 0.5765


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 14] Train loss 0.9428 acc 0.6443 | Val loss 0.9972 acc 0.6116 UA 0.6166 WA 0.6116 F1(macro) 0.6029 F1(weighted) 0.5998


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 15] Train loss 0.9175 acc 0.6544 | Val loss 0.9401 acc 0.6438 UA 0.6501 WA 0.6438 F1(macro) 0.6418 F1(weighted) 0.6384


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 16] Train loss 0.9199 acc 0.6545 | Val loss 0.9602 acc 0.6438 UA 0.6509 WA 0.6438 F1(macro) 0.6333 F1(weighted) 0.6301


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 17] Train loss 0.9156 acc 0.6577 | Val loss 0.9167 acc 0.6599 UA 0.6656 WA 0.6599 F1(macro) 0.6549 F1(weighted) 0.6522


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 18] Train loss 0.9062 acc 0.6607 | Val loss 0.9767 acc 0.6371 UA 0.6426 WA 0.6371 F1(macro) 0.6292 F1(weighted) 0.6260


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 19] Train loss 0.8857 acc 0.6722 | Val loss 0.9307 acc 0.6492 UA 0.6551 WA 0.6492 F1(macro) 0.6443 F1(weighted) 0.6398


  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):


[Epoch 20] Train loss 0.8779 acc 0.6701 | Val loss 0.9290 acc 0.6640 UA 0.6708 WA 0.6640 F1(macro) 0.6557 F1(weighted) 0.6523


                                                     


=== TEST RESULTS (CREMA-D) ===
Train loss 0.8738 acc 0.6815 | Val loss 0.9290 acc 0.6640 UA 0.6864 WA 0.6815 F1(macro) 0.6776 F1(weighted) 0.6752




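Original (baseline) — test-set results (the first loss/acc pair is printed under the label "Train" by the code but is measured on the test set; the "Val" pair is from the last training epoch):
Test loss 0.8810 acc 0.6640 | Val loss 0.9208 acc 0.6868 | Test UA 0.6689 WA 0.6640 F1(macro) 0.6558 F1(weighted) 0.6527

Separate optimizer settings for WavLM and the encoder/classifier — test-set results (note: this run still used `freeze_wavlm=True`, so WavLM itself was not updated; only the learning rate and weight decay of the encoder/classifier differed from the baseline):
Test loss 0.8738 acc 0.6815 | Val loss 0.9290 acc 0.6640 | Test UA 0.6864 WA 0.6815 F1(macro) 0.6776 F1(weighted) 0.6752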


In [None]:
# =========================
# Utils
# =========================
# NOTE(review): this cell repeats the Utils definitions from earlier in the notebook.
SEED = 1337
# Seed every RNG used here: Python's `random`, NumPy, then PyTorch.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Emotion code (third field of a CREMA-D file name) -> integer class index.
EMO_MAP = dict(zip(("ANG", "DIS", "FEA", "HAP", "NEU", "SAD"), range(6)))
# Reverse lookup: integer class index -> emotion code.
IDX2EMO = {class_idx: emo_code for emo_code, class_idx in EMO_MAP.items()}

def set_deterministic():
    """Switch cuDNN to deterministic behaviour.

    Disables the cuDNN benchmark/autotune mode and requests deterministic
    kernels. This function does not seed any RNG; seeding is done once at
    module level with ``SEED``.
    """
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def parse_label_from_filename(fname: str) -> int:
    """Return the integer emotion label encoded in a CREMA-D file name.

    File names follow ``ID_Sentence_Emotion_Intensity.wav`` (for example
    ``1001_DFA_ANG_XX.wav``). The third underscore-separated field of the
    base name is looked up in ``EMO_MAP``.

    Args:
        fname: File name or path; only the base name is inspected.

    Returns:
        The class index stored in ``EMO_MAP`` for the emotion code.

    Raises:
        IndexError: The base name has fewer than three fields.
        KeyError: The emotion code is not a key of ``EMO_MAP``.
    """
    _, base_name = os.path.split(fname)
    emotion_code = base_name.split("_")[2]
    return EMO_MAP[emotion_code]

def compute_UA_WA(y_true, y_pred, n_classes=6):
    """Compute unweighted accuracy, weighted accuracy and the confusion matrix.

    Args:
        y_true: Ground-truth class indices.
        y_pred: Predicted class indices.
        n_classes: Number of classes; labels ``0 .. n_classes - 1`` are used.

    Returns:
        ``(UA, WA, cm)`` where ``UA`` is the mean of the per-class recalls
        (a class with no true samples contributes 0.0), ``WA`` is the overall
        fraction of correct predictions, and ``cm`` is the confusion matrix
        returned by ``confusion_matrix`` for the fixed label list.
    """
    cm = confusion_matrix(y_true, y_pred, labels=list(range(n_classes)))
    hits = np.diag(cm)            # correctly classified samples per class
    support = cm.sum(axis=1)      # number of true samples per class
    # Guard the denominator so empty classes yield 0.0 instead of a division by zero.
    recalls = np.where(support > 0, hits / np.maximum(support, 1), 0.0)
    UA = float(np.mean(recalls))
    WA = float(hits.sum()) / float(support.sum())
    return UA, WA, cm

# =========================
# Dataset
# =========================
class CREMADataset(Dataset):
    """CREMA-D utterances with a seeded, per-emotion stratified split.

    Every ``.wav`` file directly inside ``data_root`` whose name can be parsed
    by ``parse_label_from_filename`` is assigned to exactly one of the
    train/val/test splits. The assignment is made per emotion class with a
    ``random.Random(SEED)`` shuffle, so constructing the dataset three times
    with the same arguments (once per split) yields disjoint, reproducible
    subsets.

    Args:
        data_root: Directory containing the ``.wav`` files.
        split: One of ``"train"``, ``"val"`` or ``"test"``.
        sr: Target sampling rate of the returned waveforms.
        val_ratio: Fraction of each emotion class assigned to validation.
        test_ratio: Fraction of each emotion class assigned to test.
    """
    def __init__(self, data_root: str, split: str, sr: int = 16000, val_ratio: float = 0.1, test_ratio: float = 0.1):
        assert split in ["train","val","test"]
        self.sr = sr  # target sampling rate
        # Sorted listing keeps the split independent of os.listdir() ordering.
        all_files = sorted(
            os.path.join(data_root, f)
            for f in os.listdir(data_root)
            if f.lower().endswith(".wav")
        )

        # Group files by emotion label so every split keeps the class balance.
        emo_buckets = defaultdict(list)
        for f in all_files:
            try:
                lab = parse_label_from_filename(f)
            except (KeyError, IndexError):
                # File name does not follow the CREMA-D pattern or carries an
                # unknown emotion code: skip it. Any other error is a real
                # problem and is allowed to propagate.
                continue
            emo_buckets[lab].append(f)

        train_files, val_files, test_files = [], [], []
        rng = random.Random(SEED)
        for lab, lst in emo_buckets.items():
            rng.shuffle(lst)  # shuffle inside each emotion bucket
            n = len(lst)
            n_test = int(round(test_ratio*n))
            n_val  = int(round(val_ratio*n))
            # First n_test files -> test, next n_val -> val, remainder -> train.
            test_files += lst[:n_test]
            val_files  += lst[n_test:n_test+n_val]
            train_files += lst[n_test+n_val:]

        # Keep only the file list of the requested split.
        if split=="train": self.files = train_files
        elif split=="val": self.files = val_files
        else: self.files = test_files

        # Kept for backward compatibility only: __getitem__ resamples from the
        # rate reported by torchaudio.load, so this fixed 44.1 kHz transform is
        # not used anywhere inside this class.
        self.resampler = torchaudio.transforms.Resample(orig_freq=44100, new_freq=sr)

    def __len__(self):
        """Return the number of files in this split."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load one utterance.

        Returns:
            ``(wav, y, path)``: the mono waveform as a 1-D tensor at
            ``self.sr`` Hz, peak-normalised into [-1, 1]; the integer emotion
            label; and the source file path.
        """
        path = self.files[idx]
        y = parse_label_from_filename(path)
        wav, sr = torchaudio.load(path)
        wav = wav.mean(dim=0, keepdim=True)  # down-mix all channels to mono
        if sr != self.sr:
            # Resample from the file's actual rate, whatever it is.
            wav = torchaudio.functional.resample(wav, sr, self.sr)
        wav = wav.squeeze(0)  # drop the channel axis -> [T]
        # Peak normalisation; the epsilon avoids dividing by zero on silence.
        wav = wav / (wav.abs().max() + 1e-9)
        return wav, y, path

def collate_batch(batch):
    """Collate ``(wav, label, path)`` samples into one zero-padded batch.

    Args:
        batch: Sequence of ``(wav, label, path)`` tuples where ``wav`` is a
            1-D waveform tensor.

    Returns:
        ``(audio, labels, lengths, paths)``: ``audio`` is ``[B, T_max]`` with
        shorter waveforms right-padded with zeros, ``labels`` and ``lengths``
        are ``torch.long`` tensors (``lengths`` holds the unpadded sample
        counts), and ``paths`` is the tuple of source paths.
    """
    wavs, labels, paths = zip(*batch)
    lengths = [len(w) for w in wavs]
    longest = max(lengths)
    # Right-pad only the waveforms that are shorter than the longest one.
    rows = [
        F.pad(w, (0, longest - n)) if n < longest else w
        for w, n in zip(wavs, lengths)
    ]
    audio = torch.stack(rows, dim=0)  # [B, T_max]
    label_tensor = torch.tensor(labels, dtype=torch.long)
    length_tensor = torch.tensor(lengths, dtype=torch.long)
    return audio, label_tensor, length_tensor, paths

# =========================
# SL-WDEE Emotional Encoder
# =========================
class ContextualTransform(nn.Module):
    """
    Stack each frame with its neighbours from ``t - left`` to ``t + right``.

    The sequence is edge-padded (first/last frame repeated) so the output has
    the same number of frames as the input.

    Input:  [B, T, D] (batch, time, feature)
    Output: [B, T, D*(left+right+1)], ordered from the oldest to the newest
            neighbour along the feature axis.
    """
    def __init__(self, left=5, right=5):
        super().__init__()
        self.l = left   # number of context frames taken from the past
        self.r = right  # number of context frames taken from the future

    def forward(self, x):
        # x: [B, T, D]; unpacking also rejects inputs that are not 3-D.
        _, n_frames, _ = x.shape
        # F.pad works on the last axis, so move time there, replicate the
        # edge frames, then move time back: [B, T+left+right, D].
        padded = F.pad(
            x.transpose(1, 2), pad=(self.l, self.r), mode="replicate"
        ).transpose(1, 2)
        # One shifted view of length T per context position.
        shifted = [
            padded[:, first:first + n_frames, :]
            for first in range(self.l + self.r + 1)
        ]
        return torch.cat(shifted, dim=-1)  # [B, T, D*(left+right+1)]

class EmotionalEncoder(nn.Module):
    """
    Emotional Encoder (SL-WDEE):
    LayerNorm -> ContextualTransform -> Linear -> Sigmoid -> Linear -> Sigmoid.

    Produces a per-frame embedding and an utterance-level embedding obtained
    by averaging the per-frame embedding over time.
    """
    def __init__(self, in_dim, hidden_dim=512, out_dim=256, left=5, right=5):
        super().__init__()
        context_width = left + right + 1  # frames stacked per time step
        self.norm = nn.LayerNorm(in_dim)
        self.ctx = ContextualTransform(left, right)
        self.fc1 = nn.Linear(in_dim * context_width, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)

    def forward(self, x):
        """Encode frame features.

        Args:
            x: Frame-level features, [B, T, in_dim].

        Returns:
            ``(frame_emb, utt_emb)`` with shapes [B, T, out_dim] and
            [B, out_dim].
        """
        stacked = self.ctx(self.norm(x))             # [B, T, in_dim*(left+right+1)]
        hidden = self.fc1(stacked).sigmoid()         # [B, T, hidden_dim]
        frame_emb = self.fc2(hidden).sigmoid()       # [B, T, out_dim]
        # Plain mean over the time axis; no mask is applied here, so every
        # frame in the input contributes equally.
        utt_emb = frame_emb.mean(dim=1)              # [B, out_dim]
        return frame_emb, utt_emb

# =========================
# Classifier (utterance level)
# =========================
class MLPClassifier(nn.Module):
    """Utterance-level MLP head: in_dim -> 256 -> 128 -> num_classes.

    Each hidden layer is followed by ReLU and Dropout(0.2); the final layer
    outputs raw logits.
    """
    def __init__(self, in_dim=256, num_classes=6):
        super().__init__()
        widths = (in_dim, 256, 128)
        stack = []
        # Hidden blocks: Linear -> ReLU -> Dropout, built in input-to-output order.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU(inplace=True))
            stack.append(nn.Dropout(0.2))
        # Output projection to class logits.
        stack.append(nn.Linear(widths[-1], num_classes))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        # x: [B, in_dim] utterance embedding -> [B, num_classes] logits
        logits = self.net(x)
        return logits

# =========================
# SER Model (WavLM -> SL-WDEE -> Classifier)
# =========================
class SER_WavLM_WDEE(nn.Module):
    """Speech-emotion model: pretrained WavLM -> EmotionalEncoder -> MLPClassifier.

    Args:
        wavlm_name: Hugging Face model id or path passed to
            ``AutoModel.from_pretrained``.
        wdee_hidden: Hidden width of the emotional encoder.
        wdee_out: Output width of the emotional encoder (classifier input).
        freeze_wavlm: If true, every WavLM parameter gets
            ``requires_grad = False`` so only the encoder and classifier train.
    """
    def __init__(self, wavlm_name="microsoft/wavlm-base-plus",
                 wdee_hidden=512, wdee_out=256, freeze_wavlm=True):
        super().__init__()
        # Pretrained backbone from Hugging Face Transformers.
        self.wavlm = AutoModel.from_pretrained(wavlm_name)
        hidden = self.wavlm.config.hidden_size  # backbone feature width
        self.encoder = EmotionalEncoder(in_dim=hidden, hidden_dim=wdee_hidden, out_dim=wdee_out)
        self.cls = MLPClassifier(in_dim=wdee_out, num_classes=6)
        if freeze_wavlm:
            for p in self.wavlm.parameters():
                p.requires_grad = False

    def _extract_wavlm_features(self, wav_batch, lengths):
        """Run the WavLM backbone and return its last hidden state.

        Gradients are tracked only when the caller has autograd enabled AND at
        least one backbone parameter is trainable. A frozen backbone therefore
        builds no autograd graph, while ``freeze_wavlm=False`` (or parameters
        unfrozen later) is actually fine-tuned. Previously an unconditional
        ``@torch.no_grad()`` blocked gradients in every case.

        Args:
            wav_batch: [B, T] float waveforms, zero-padded on the right.
            lengths: [B] number of valid samples in each waveform.

        Returns:
            ``last_hidden_state`` of the backbone, [B, T', D].
        """
        _, n_samples = wav_batch.shape
        lengths = lengths.to(wav_batch.device)
        # 1 for real samples, 0 for right padding.
        sample_idx = torch.arange(n_samples, device=wav_batch.device)
        attention_mask = (sample_idx[None, :] < lengths[:, None]).long()

        backbone_trainable = any(p.requires_grad for p in self.wavlm.parameters())
        # NOTE(review): a frozen backbone still follows model.train()/eval(), so
        # any dropout inside it is active during training - confirm intended.
        with torch.set_grad_enabled(torch.is_grad_enabled() and backbone_trainable):
            outputs = self.wavlm(
                input_values=wav_batch,
                attention_mask=attention_mask,
                output_hidden_states=False,
            )
        return outputs.last_hidden_state

    def forward(self, wav_batch, lengths):
        """Classify a batch of waveforms.

        Args:
            wav_batch: [B, T] padded waveforms.
            lengths: [B] unpadded sample counts.

        Returns:
            ``(logits, utt_wdee)``: class logits [B, 6] and the utterance-level
            encoder embedding [B, wdee_out].
        """
        feats = self._extract_wavlm_features(wav_batch, lengths)  # [B, T', D]
        _, utt_wdee = self.encoder(feats)                          # [B, wdee_out]
        logits = self.cls(utt_wdee)                                # [B, 6]
        return logits, utt_wdee

# =========================
# Training / Evaluation
# =========================
@dataclass
class TrainConfig:
    """Hyper-parameters and runtime options for one training run.

    Only ``data_root`` is required; every other field has a default.
    """
    data_root: str  # directory containing the CREMA-D .wav files
    batch_size: int = 8  # samples per batch for the train/val/test loaders
    epochs: int = 20  # number of training epochs
    lr: float = 2e-4  # AdamW learning rate
    freeze_wavlm: bool = True  # forwarded to SER_WavLM_WDEE(freeze_wavlm=...)
    # Evaluated once, when the class body is executed (not per instance).
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    wavlm_name: str = "microsoft/wavlm-base-plus"  # Hugging Face model id of the backbone
    wdee_hidden: int = 512  # hidden width of the emotional encoder
    wdee_out: int = 256  # output width of the emotional encoder
    sr: int = 16000  # target sampling rate handed to CREMADataset
    val_ratio: float = 0.1  # per-class fraction of files used for validation
    test_ratio: float = 0.1  # per-class fraction of files used for testing
    grad_clip: float = 1.0  # max gradient norm for clip_grad_norm_; None disables clipping
    mixed_precision: bool = True # Use the autocast + GradScaler training path (autocast is only enabled on CUDA)

def train_one_epoch(model, loader, optim, scaler, cfg):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Module called as ``model(wav, lengths)`` and returning
            ``(logits, embedding)``.
        loader: Iterable yielding ``(wav, labels, lengths, paths)`` batches.
        optim: Optimizer over the trainable parameters.
        scaler: Gradient scaler used on the mixed-precision path.
        cfg: Config providing ``device``, ``mixed_precision`` and ``grad_clip``.

    Returns:
        ``(mean_loss, accuracy)`` averaged over all samples seen this epoch.
    """
    model.train()
    total_loss, total, correct = 0.0, 0, 0
    # Autocast is only meaningful on CUDA; evaluate the check once per epoch.
    amp_enabled = cfg.device.startswith("cuda")
    for wav, y, lengths, _ in tqdm(loader, desc="Train", leave=False):
        wav, y = wav.to(cfg.device), y.to(cfg.device)
        optim.zero_grad(set_to_none=True)
        if cfg.mixed_precision:
            # torch.autocast replaces the deprecated torch.cuda.amp.autocast,
            # which emitted a FutureWarning on every epoch.
            with torch.autocast(device_type="cuda", enabled=amp_enabled):
                logits, _ = model(wav, lengths)
                loss = F.cross_entropy(logits, y)
            scaler.scale(loss).backward()
            # Gradients must be unscaled before their norm is clipped.
            scaler.unscale_(optim)
            if cfg.grad_clip is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
            scaler.step(optim)
            scaler.update()
        else:
            # Plain full-precision step.
            logits, _ = model(wav, lengths)
            loss = F.cross_entropy(logits, y)
            loss.backward()
            if cfg.grad_clip is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
            optim.step()

        # Weight the batch-mean loss by batch size so the epoch mean is exact.
        total_loss += float(loss.item()) * len(y)
        pred = logits.argmax(dim=1)
        correct += int((pred==y).sum().item())
        total += len(y)
    return total_loss/total, correct/total

@torch.no_grad()
def evaluate(model, loader, cfg):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    All sklearn metrics are computed over the fixed label set
    ``0 .. len(IDX2EMO) - 1``. A class that is missing from both the targets
    and the predictions therefore counts as 0 in the macro averages (matching
    ``compute_UA_WA``) and no longer makes ``classification_report`` raise
    because of a ``target_names`` length mismatch.

    Args:
        model: Module called as ``model(wav, lengths)`` and returning
            ``(logits, embedding)``.
        loader: Iterable yielding ``(wav, labels, lengths, paths)`` batches.
        cfg: Config providing ``device``.

    Returns:
        A 12-tuple ``(loss, acc, UA, WA, cm, macro_f1, weighted_f1,
        macro_precision, weighted_precision, macro_recall, weighted_recall,
        cls_report)``.
    """
    model.eval()
    total_loss, total, correct = 0.0, 0, 0
    y_true, y_pred = [], []

    for wav, y, lengths, _ in tqdm(loader, desc="Eval", leave=False):
        wav, y = wav.to(cfg.device), y.to(cfg.device)
        logits, _ = model(wav, lengths)
        loss = F.cross_entropy(logits, y)
        total_loss += float(loss.item()) * len(y)

        pred = logits.argmax(dim=1)
        correct += int((pred == y).sum().item())
        total += len(y)

        y_true += y.detach().cpu().tolist()
        y_pred += pred.detach().cpu().tolist()

    class_ids = list(range(len(IDX2EMO)))

    # UA, WA and confusion matrix.
    UA, WA, cm = compute_UA_WA(y_true, y_pred, n_classes=len(class_ids))

    # Shared options: fixed label set, and undefined ratios scored as 0
    # without emitting a warning (0 is also what the default would yield).
    metric_kwargs = dict(labels=class_ids, zero_division=0)

    macro_f1 = f1_score(y_true, y_pred, average="macro", **metric_kwargs)
    weighted_f1 = f1_score(y_true, y_pred, average="weighted", **metric_kwargs)

    macro_precision = precision_score(y_true, y_pred, average="macro", **metric_kwargs)
    weighted_precision = precision_score(y_true, y_pred, average="weighted", **metric_kwargs)

    macro_recall = recall_score(y_true, y_pred, average="macro", **metric_kwargs)
    weighted_recall = recall_score(y_true, y_pred, average="weighted", **metric_kwargs)

    # Text table with per-class precision/recall/F1; returned, not printed.
    cls_report = classification_report(
        y_true, y_pred,
        labels=class_ids,
        target_names=[IDX2EMO[i] for i in class_ids],
        digits=4,
        zero_division=0,
    )

    return (
        total_loss / total,
        correct / total,
        UA,
        WA,
        cm,
        macro_f1,
        weighted_f1,
        macro_precision,
        weighted_precision,
        macro_recall,
        weighted_recall,
        cls_report,
    )

def main():
    """Train the WavLM + SL-WDEE model on CREMA-D and report test metrics.

    Steps: build the three dataset splits and loaders, train for
    ``cfg.epochs`` epochs while keeping a snapshot of the weights with the
    best validation UA, reload that snapshot, then evaluate on the test split
    and print the metrics and the confusion matrix.
    """
    set_deterministic()  # deterministic cuDNN; RNGs are seeded at module level
    cfg = TrainConfig(
        data_root="/kaggle/input/crema-d/cremad/AudioWAV",  # directory with the .wav files
        epochs=20,
        batch_size=8,
        lr=2e-4,
        freeze_wavlm=True  # train only the encoder and classifier
    )
    print(cfg)

    # The three datasets share the split parameters, so they are disjoint.
    split_kwargs = dict(sr=cfg.sr, val_ratio=cfg.val_ratio, test_ratio=cfg.test_ratio)
    train_ds = CREMADataset(cfg.data_root, "train", **split_kwargs)
    val_ds   = CREMADataset(cfg.data_root, "val",   **split_kwargs)
    test_ds  = CREMADataset(cfg.data_root, "test",  **split_kwargs)

    loader_kwargs = dict(batch_size=cfg.batch_size, num_workers=4, collate_fn=collate_batch, pin_memory=True)
    train_loader = DataLoader(train_ds, shuffle=True,  **loader_kwargs)
    val_loader   = DataLoader(val_ds,   shuffle=False, **loader_kwargs)
    test_loader  = DataLoader(test_ds,  shuffle=False, **loader_kwargs)

    model = SER_WavLM_WDEE(
        wavlm_name=cfg.wavlm_name,
        wdee_hidden=cfg.wdee_hidden,
        wdee_out=cfg.wdee_out,
        freeze_wavlm=cfg.freeze_wavlm
    ).to(cfg.device)

    # Only parameters that still require gradients are handed to the optimizer.
    optim = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=cfg.lr)
    scaler = torch.cuda.amp.GradScaler(enabled=(cfg.device.startswith("cuda") and cfg.mixed_precision))

    best_val_ua, best_state = -1.0, None
    for epoch in range(1, cfg.epochs+1):
        tr_loss, tr_acc = train_one_epoch(model, train_loader, optim, scaler, cfg)
        # evaluate() returns 12 values; only the first seven are reported here.
        (
            val_loss,
            val_acc,
            val_UA,
            val_WA,
            _,
            val_f1_macro,
            val_f1_weighted,
            *_,
        ) = evaluate(model, val_loader, cfg)

        print(
            f"[Epoch {epoch:02d}] "
            f"Train loss {tr_loss:.4f} acc {tr_acc:.4f} | "
            f"Val loss {val_loss:.4f} acc {val_acc:.4f} "
            f"UA {val_UA:.4f} WA {val_WA:.4f} "
            f"F1(macro) {val_f1_macro:.4f} F1(weighted) {val_f1_weighted:.4f}"
        )
        # Keep the weights with the best validation UA.
        if val_UA > best_val_ua:
            best_val_ua = val_UA
            # copy=True forces a real copy even when the model already lives on
            # the CPU; a bare .cpu() would alias the live weights there and the
            # "snapshot" would keep changing with further training.
            best_state = {
                k: v.detach().to("cpu", copy=True)
                for k, v in model.state_dict().items()
            }

    # Restore the best validation checkpoint before testing.
    if best_state is not None:
        model.load_state_dict({k:v.to(cfg.device) for k,v in best_state.items()})

    # Final evaluation on the held-out test split.
    (
        te_loss,
        te_acc,
        te_UA,
        te_WA,
        te_cm,
        te_f1_macro,
        te_f1_weighted,
        *_,
    ) = evaluate(model, test_loader, cfg)
    print("\n=== TEST RESULTS (CREMA-D) ===")
    # These are TEST metrics of the restored checkpoint. The previous message
    # labelled them "Train" and mixed in the last epoch's validation numbers.
    print(
        f"Test loss {te_loss:.4f} acc {te_acc:.4f} | "
        f"Best val UA {best_val_ua:.4f} "
        f"UA {te_UA:.4f} WA {te_WA:.4f} "
        f"F1(macro) {te_f1_macro:.4f} F1(weighted) {te_f1_weighted:.4f}"
    )
    # Labelled confusion matrix built from the test confusion matrix.
    emo_names = [IDX2EMO[i] for i in range(len(IDX2EMO))]
    df_cm = pd.DataFrame(te_cm, index=emo_names, columns=emo_names)
    print("\nConfusion matrix (rows = true, cols = predicted):")
    print(df_cm)

if __name__ == "__main__":
    main() # Run the full train/evaluate pipeline when this code is executed directly

TrainConfig(data_root='/kaggle/input/crema-d/cremad/AudioWAV', batch_size=8, epochs=20, lr=0.0002, freeze_wavlm=True, device='cuda', wavlm_name='microsoft/wavlm-base-plus', wdee_hidden=512, wdee_out=256, sr=16000, val_ratio=0.1, test_ratio=0.1, grad_clip=1.0, mixed_precision=True)


Some weights of the model checkpoint at microsoft/wavlm-base-plus were not used when initializing WavLMModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMModel were not initialized from the model checkpoint at microsoft/wavlm-base-plus and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

[Epoch 01] Train loss 1.5487 acc 0.3534 | Val loss 1.4426 acc 0.4086 UA 0.4115 WA 0.4086 F1(macro) 0.3375 F1(weighted) 0.3373


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 02] Train loss 1.3898 acc 0.4365 | Val loss 1.3005 acc 0.4785 UA 0.4789 WA 0.4785 F1(macro) 0.4343 F1(weighted) 0.4345


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 03] Train loss 1.2398 acc 0.5091 | Val loss 1.2531 acc 0.5067 UA 0.5069 WA 0.5067 F1(macro) 0.4858 F1(weighted) 0.4840


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 04] Train loss 1.1177 acc 0.5700 | Val loss 1.0815 acc 0.5712 UA 0.5788 WA 0.5712 F1(macro) 0.5599 F1(weighted) 0.5576


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 05] Train loss 1.0559 acc 0.5979 | Val loss 1.2270 acc 0.5417 UA 0.5482 WA 0.5417 F1(macro) 0.5196 F1(weighted) 0.5152


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 06] Train loss 1.0186 acc 0.6065 | Val loss 1.0771 acc 0.5954 UA 0.6048 WA 0.5954 F1(macro) 0.5778 F1(weighted) 0.5759


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 07] Train loss 0.9850 acc 0.6261 | Val loss 1.0966 acc 0.5927 UA 0.6019 WA 0.5927 F1(macro) 0.5710 F1(weighted) 0.5681


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 08] Train loss 0.9742 acc 0.6352 | Val loss 1.1124 acc 0.5874 UA 0.5960 WA 0.5874 F1(macro) 0.5653 F1(weighted) 0.5617


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 09] Train loss 0.9457 acc 0.6443 | Val loss 0.9954 acc 0.6277 UA 0.6350 WA 0.6277 F1(macro) 0.6139 F1(weighted) 0.6101


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 10] Train loss 0.9375 acc 0.6528 | Val loss 0.9735 acc 0.6358 UA 0.6435 WA 0.6358 F1(macro) 0.6229 F1(weighted) 0.6203


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 11] Train loss 0.9119 acc 0.6604 | Val loss 0.9391 acc 0.6586 UA 0.6662 WA 0.6586 F1(macro) 0.6491 F1(weighted) 0.6465


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 12] Train loss 0.8863 acc 0.6680 | Val loss 0.8919 acc 0.6734 UA 0.6783 WA 0.6734 F1(macro) 0.6745 F1(weighted) 0.6706


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 13] Train loss 0.8722 acc 0.6696 | Val loss 0.8813 acc 0.6815 UA 0.6872 WA 0.6815 F1(macro) 0.6811 F1(weighted) 0.6785


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 14] Train loss 0.8451 acc 0.6853 | Val loss 0.9574 acc 0.6505 UA 0.6573 WA 0.6505 F1(macro) 0.6418 F1(weighted) 0.6397


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 15] Train loss 0.8425 acc 0.6935 | Val loss 0.9218 acc 0.6411 UA 0.6446 WA 0.6411 F1(macro) 0.6344 F1(weighted) 0.6318


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 16] Train loss 0.8168 acc 0.6980 | Val loss 0.9202 acc 0.6599 UA 0.6636 WA 0.6599 F1(macro) 0.6636 F1(weighted) 0.6598


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 17] Train loss 0.8017 acc 0.7037 | Val loss 0.9078 acc 0.6613 UA 0.6658 WA 0.6613 F1(macro) 0.6567 F1(weighted) 0.6533


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 18] Train loss 0.7978 acc 0.7031 | Val loss 0.9461 acc 0.6680 UA 0.6743 WA 0.6680 F1(macro) 0.6598 F1(weighted) 0.6565


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 19] Train loss 0.7704 acc 0.7167 | Val loss 0.9670 acc 0.6667 UA 0.6741 WA 0.6667 F1(macro) 0.6565 F1(weighted) 0.6554


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  with torch.cuda.amp.autocast(enabled=(cfg.device.startswith("cuda"))):
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


[Epoch 20] Train loss 0.7772 acc 0.7158 | Val loss 0.8192 acc 0.7003 UA 0.7050 WA 0.7003 F1(macro) 0.6968 F1(weighted) 0.6940


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
                                                     


=== TEST RESULTS (CREMA-D) ===
Train loss 0.7742 acc 0.7137 | Val loss 0.8192 acc 0.7003 UA 0.7172 WA 0.7137 F1(macro) 0.7114 F1(weighted) 0.7098




NameError: name 'cm' is not defined

In [None]:
# -------------------------
# 1. Install & imports
# -------------------------
!pip install -q librosa soundfile tensorflow scikit-learn

In [None]:
# ============================================================
#  CREMA-D SER pipeline (hand-crafted features + CNN/BiLSTM ensemble)
#  with feature caching (save & reload X, y, splits)
# ============================================================

import os
import glob
import numpy as np
import librosa
import librosa.display
import random
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.layers import (Input, Conv1D, MaxPooling1D, Dropout, BatchNormalization,
                                     Flatten, Dense, Bidirectional, LSTM, Average)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# -------------------------
# 2. Config
# -------------------------
# Audio source and segment selection
DATA_DIR = "/root/.cache/kagglehub/datasets/akaiinu/crema-d/versions/1/cremad/AudioWAV"
SAMPLE_RATE = 22050            # sample rate requested when loading audio
DURATION, OFFSET = 2.5, 0.6    # seconds kept (paper: 2.5s) / seconds skipped at the start

# Framing and feature settings
FRAME_LENGTH, HOP_LENGTH = 2048, 512   # window and hop, in samples
N_MFCC = 40

# Augmentation strengths
PITCH_STEPS, NOISE_ALPHA = 0.7, 0.035

# Training settings
RANDOM_SEED, BATCH_SIZE, EPOCHS = 42, 64, 100

# where to store cached features
FEATURE_DIR = "/content/cremad_features"
os.makedirs(FEATURE_DIR, exist_ok=True)
X_PATH, Y_PATH, SPLIT_PATH = (
    os.path.join(FEATURE_DIR, cache_name)
    for cache_name in ("X_cremad.npy", "y_cremad.npy", "splits_cremad.npy")
)

# Seed every random number generator used by the pipeline.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(RANDOM_SEED)

# Emotion code (third field of a CREMA-D file name) -> class index, and back.
EMOTION_MAP = {
    code: idx
    for idx, code in enumerate(("ANG", "DIS", "FEA", "HAP", "NEU", "SAD"))
}
IDX2EMO = {idx: code for code, idx in EMOTION_MAP.items()}

# -------------------------
# 3. Scan CREMA-D file list
#    Expect filenames like: 1001_DFA_ANG_XX.wav
# -------------------------
wav_paths = sorted(glob.glob(os.path.join(DATA_DIR, "*.wav")))
if not wav_paths:
    raise RuntimeError(f"No .wav found in {DATA_DIR}. Check your DATA_DIR path.")

# One record per usable file: its path, numeric speaker ID and emotion code.
records = []
for p in wav_paths:
    fname = os.path.basename(p)
    parts = fname.split("_")
    # Field 0 is the speaker ID, field 2 the emotion code (1001_DFA_ANG_XX.wav).
    try:
        speaker_id, emotion_code = int(parts[0]), parts[2]
    except Exception as e:
        print("Skipping file with unexpected name format:", fname, "error:", e)
        continue

    # Files whose emotion code is not one of the six known classes are dropped.
    if emotion_code in EMOTION_MAP:
        records.append(dict(path=p, speaker=speaker_id, emotion=emotion_code))

print(f"Found {len(records)} labeled files.")

# -------------------------
# 4. Speaker-wise split (speaker-independent)
#    80% train, 10% val, 10% test by speakers
# -------------------------
# Speakers are ordered by ID and cut at the 80% / 90% marks, so every speaker
# lands in exactly one split and the assignment is deterministic.
speakers = sorted({r["speaker"] for r in records})
n_spk = len(speakers)
print("Number of speakers:", n_spk)

train_end, val_end = int(0.8 * n_spk), int(0.9 * n_spk)
train_spk = set(speakers[:train_end])
val_spk = set(speakers[train_end:val_end])
test_spk = set(speakers[val_end:])

# Tag each record with the split of its speaker.
for r in records:
    r["split"] = (
        "train" if r["speaker"] in train_spk
        else "val" if r["speaker"] in val_spk
        else "test"
    )

print("Samples per split:")
for _label, _name in (("  Train:", "train"), ("  Val  :", "val"), ("  Test :", "test")):
    print(_label, sum(r["split"] == _name for r in records))

# -------------------------
# 5. Audio loading & augmentation
# -------------------------
def load_segment(path, sr=SAMPLE_RATE, duration=DURATION, offset=OFFSET):
    """
    Load an audio file and return a fixed-length excerpt of it.

    The file is decoded with ``librosa.load`` at sample rate `sr`. The excerpt
    starts `offset` seconds into the recording and is exactly
    ``int(duration * sr)`` samples long; recordings that end before the
    excerpt does are zero-padded at the end.

    Args:
        path: Path of the audio file to read.
        sr: Sample rate passed to ``librosa.load``.
        duration: Excerpt length in seconds.
        offset: Excerpt start, in seconds from the beginning of the file.

    Returns:
        Array of ``int(duration * sr)`` samples.

    Raises:
        ValueError: If `offset` or `duration` is negative.
    """
    if offset < 0 or duration < 0:
        raise ValueError(
            f"offset and duration must be non-negative, got offset={offset}, duration={duration}"
        )

    y, _ = librosa.load(path, sr=sr)
    start = int(offset * sr)
    end = start + int(duration * sr)

    # Zero-pad the tail so the slice below always covers [start, end).
    # With a non-negative start this makes the result exactly end - start
    # samples long, so no further length correction is needed.
    if len(y) < end:
        y = np.pad(y, (0, end - len(y)))

    return y[start:end]

def add_noise(y, alpha=NOISE_ALPHA):
    """Return `y` plus standard-normal noise scaled by `alpha` (one draw per sample)."""
    return y + alpha * np.random.randn(len(y))

def pitch_shift(y, sr=SAMPLE_RATE, n_steps=PITCH_STEPS):
    """Shift the pitch of `y` by `n_steps` steps via ``librosa.effects.pitch_shift``."""
    shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)
    return shifted

# -------------------------
# 6. Feature extraction (ZCR, RMSE, MFCC, Chroma STFT)
# -------------------------
def extract_single_features(y, sr=SAMPLE_RATE):
    """
    Compute the frame-level feature matrix of one signal.

    Rows are stacked in the order zero-crossing rate, RMS energy, MFCCs
    (N_MFCC rows) and chroma; every feature uses FRAME_LENGTH-sample windows
    and a HOP_LENGTH-sample hop.

    Returns:
        Array of shape (F, T): features along axis 0, frames along axis 1.
    """
    window_kw = dict(frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
    fft_kw = dict(n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)

    # Chroma is computed from the STFT magnitude.
    magnitude = np.abs(librosa.stft(y, **fft_kw))

    per_feature = [
        librosa.feature.zero_crossing_rate(y, **window_kw),           # ZCR
        librosa.feature.rms(y=y, **window_kw),                        # RMSE
        librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC, **fft_kw),    # MFCC
        librosa.feature.chroma_stft(S=magnitude, sr=sr),              # Chroma STFT
    ]

    # Each entry is (n_features, T); stack them along the feature axis.
    return np.concatenate(per_feature, axis=0)

def extract_augmented_features(path):
    """
    Extract features for a file and three augmented variants of it.

    The variants are: the original segment, segment + noise, pitch-shifted
    segment, and pitch-shifted segment + noise. Their feature matrices are
    concatenated along the feature axis.

    Args:
        path: Path of the audio file.

    Returns:
        Array of shape (F_total, T), where F_total is four times the number of
        rows returned by ``extract_single_features``.
    """
    y0 = load_segment(path)

    y_noise = add_noise(y0)
    y_pitch = pitch_shift(y0)
    # Reuse the already pitch-shifted signal instead of running the costly
    # pitch shift a second time; add_noise returns a new array, so y_pitch
    # itself is left untouched.
    y_np = add_noise(y_pitch)

    feats = [extract_single_features(sig) for sig in (y0, y_noise, y_pitch, y_np)]

    # Concatenate along feature axis
    return np.concatenate(feats, axis=0)  # (F_total, T)

# -------------------------
# 7. Build or load dataset: X (N, T, F), y (N,), splits (N,)
#    The cache stores UNSCALED features. Standardization is done in
#    section 9, after the split, so that its statistics come from the
#    training data only.
#    NOTE: caches written by the earlier version of this cell hold globally
#    standardized features under the same file names -- delete FEATURE_DIR
#    once before re-running.
# -------------------------
if os.path.exists(X_PATH) and os.path.exists(Y_PATH) and os.path.exists(SPLIT_PATH):
    print("Loading cached features from disk...")
    X = np.load(X_PATH)
    y = np.load(Y_PATH)
    splits = np.load(SPLIT_PATH)
    print("Loaded:")
    print("  X shape:", X.shape)
    print("  y shape:", y.shape)
else:
    print("Extracting features (this may take a while...)")
    X_list = []
    y_list = []
    split_list = []

    for r in tqdm(records):
        feat_2d = extract_augmented_features(r["path"])  # (F, T)
        # Transpose to (T, F) for Keras Conv1D/LSTM
        feat_2d = feat_2d.T
        X_list.append(feat_2d)
        y_list.append(EMOTION_MAP[r["emotion"]])
        split_list.append(r["split"])

    # Check shape consistency (np.stack below fails if the shapes differ)
    shapes = {x.shape for x in X_list}
    if len(shapes) != 1:
        print("Warning: not all feature shapes are the same:", shapes)
    input_shape = X_list[0].shape  # (T, F)
    print("Input feature shape (T, F):", input_shape)

    X = np.stack(X_list, axis=0)  # (N, T, F)
    y = np.array(y_list)
    splits = np.array(split_list)

    print("X raw shape:", X.shape)
    print("y shape:", y.shape)

    # save cached (unscaled) features
    np.save(X_PATH, X)
    np.save(Y_PATH, y)
    np.save(SPLIT_PATH, splits)
    print("Saved cached features to:", FEATURE_DIR)

# -------------------------
# 8. Train/Val/Test split
# -------------------------
N, T, F = X.shape
print("Final feature shape (N, T, F):", X.shape)

train_mask = splits == "train"
val_mask   = splits == "val"
test_mask  = splits == "test"

X_train, y_train = X[train_mask], y[train_mask]
X_val,   y_val   = X[val_mask],   y[val_mask]
X_test,  y_test  = X[test_mask],  y[test_mask]

# -------------------------
# 9. Feature scaling (StandardScaler on feature dimension)
#    The scaler is fitted on TRAIN frames only and then applied to every
#    split; fitting on all samples would leak validation/test statistics
#    into training.
# -------------------------
scaler = StandardScaler()
scaler.fit(X_train.reshape(-1, F))   # (N_train*T, F)

X_train = scaler.transform(X_train.reshape(-1, F)).reshape(X_train.shape)
X_val   = scaler.transform(X_val.reshape(-1, F)).reshape(X_val.shape)
X_test  = scaler.transform(X_test.reshape(-1, F)).reshape(X_test.shape)

print("Final splits:")
print("  Train:", X_train.shape, y_train.shape)
print("  Val  :", X_val.shape,   y_val.shape)
print("  Test :", X_test.shape,  y_test.shape)

# One-hot labels for the categorical cross-entropy loss.
num_classes = len(EMOTION_MAP)
y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes)
y_val_cat   = tf.keras.utils.to_categorical(y_val,   num_classes)
y_test_cat  = tf.keras.utils.to_categorical(y_test,  num_classes)

# -------------------------
# 10. Define models: CNN & CNN+BiLSTM & Ensemble
# -------------------------
def build_cnn_model(input_shape, num_classes):
    """
    Build the 1-D CNN classifier (not compiled).

    Two Conv1D -> BatchNorm -> MaxPool -> Dropout stages (128 then 256
    filters) feed a flattened Dense(128) layer and a softmax output.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras ``Model`` named "CNN_1D".
    """
    inputs = Input(shape=input_shape)

    x = inputs
    for n_filters in (128, 256):
        x = Conv1D(n_filters, kernel_size=5, padding="same", activation="relu")(x)
        x = BatchNormalization()(x)
        x = MaxPooling1D(pool_size=2)(x)
        x = Dropout(0.2)(x)

    # Classification head
    x = Flatten()(x)
    x = Dense(128, activation="relu")(x)
    x = BatchNormalization()(x)
    outputs = Dense(num_classes, activation="softmax")(x)

    return Model(inputs, outputs, name="CNN_1D")

def build_cnn_bilstm_model(input_shape, num_classes):
    """
    Build the CNN + bidirectional-LSTM classifier (not compiled).

    Two Conv1D -> BatchNorm -> MaxPool -> Dropout stages (128 then 256
    filters) are followed by a Bidirectional LSTM(64) that returns only its
    final output, then Dense(128) and a softmax output.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras ``Model`` named "CNN_BiLSTM".
    """
    inputs = Input(shape=input_shape)

    x = inputs
    for n_filters in (128, 256):
        x = Conv1D(n_filters, kernel_size=5, padding="same", activation="relu")(x)
        x = BatchNormalization()(x)
        x = MaxPooling1D(pool_size=2)(x)
        x = Dropout(0.2)(x)

    # Bi-LSTM over the pooled sequence
    x = Bidirectional(LSTM(64, return_sequences=False))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)

    # Classification head
    x = Dense(128, activation="relu")(x)
    x = BatchNormalization()(x)
    outputs = Dense(num_classes, activation="softmax")(x)

    return Model(inputs, outputs, name="CNN_BiLSTM")

input_shape = (T, F)
cnn_model = build_cnn_model(input_shape, num_classes)
bilstm_model = build_cnn_bilstm_model(input_shape, num_classes)

# Ensemble: average the softmax outputs of both models.
# Both sub-models share one input and are trained jointly through the
# averaged output.
ens_input = Input(shape=input_shape)
cnn_out    = cnn_model(ens_input)
bilstm_out = bilstm_model(ens_input)
ens_out    = Average(name="ensemble_avg")([cnn_out, bilstm_out])
ensemble_model = Model(ens_input, ens_out, name="CNN_Ensemble")

# -------------------------
# 11. Compile & train (with LR scheduler + early stopping)
# -------------------------
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
ensemble_model.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

# Halve the learning rate after 3 epochs without a val_accuracy improvement.
lr_scheduler = ReduceLROnPlateau(
    monitor="val_accuracy",
    mode="max",          # accuracy is maximized; stated explicitly
    factor=0.5,
    patience=3,
    min_lr=1e-5,
    verbose=1
)

# Stop after 5 epochs without improvement and restore the best weights.
early_stop = EarlyStopping(
    monitor="val_accuracy",
    mode="max",
    patience=5,
    restore_best_weights=True,
    verbose=1
)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
ensemble_model.summary()

history = ensemble_model.fit(
    X_train, y_train_cat,
    validation_data=(X_val, y_val_cat),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[lr_scheduler, early_stop],
    verbose=1
)

# -------------------------
# 12. Evaluation on test set
# -------------------------
# Predicted class = index of the largest averaged softmax probability.
y_prob = ensemble_model.predict(X_test)
y_pred = y_prob.argmax(axis=1)

print("\n=== TEST RESULTS ===")
# Per-class precision / recall / F1, labelled with the emotion codes.
print(
    classification_report(
        y_test,
        y_pred,
        target_names=[IDX2EMO[class_idx] for class_idx in range(num_classes)],
    )
)
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))


Found 7442 labeled files.
Number of speakers: 91
Samples per split:
  Train: 5885
  Val  : 737
  Test : 820
Extracting features (this may take a while...)


100%|██████████| 7442/7442 [19:46<00:00,  6.27it/s]


Input feature shape (T, F): (108, 216)
X raw shape: (7442, 108, 216)
y shape: (7442,)
Saved cached features to: /content/cremad_features
Final feature shape (N, T, F): (7442, 108, 216)
Final splits:
  Train: (5885, 108, 216) (5885,)
  Val  : (737, 108, 216) (737,)
  Test : (820, 108, 216) (820,)


None
Epoch 1/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 41ms/step - accuracy: 0.3538 - loss: 1.6881 - val_accuracy: 0.3094 - val_loss: 1.7881 - learning_rate: 0.0010
Epoch 2/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.5239 - loss: 1.2520 - val_accuracy: 0.4627 - val_loss: 1.4112 - learning_rate: 0.0010
Epoch 3/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.5884 - loss: 1.1180 - val_accuracy: 0.5142 - val_loss: 1.3418 - learning_rate: 0.0010
Epoch 4/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.6425 - loss: 0.9750 - val_accuracy: 0.4830 - val_loss: 1.3629 - learning_rate: 0.0010
Epoch 5/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.7210 - loss: 0.8429 - val_accuracy: 0.5034 - val_loss: 1.3661 - learning_rate: 0.0010
Epoch 6/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

In [None]:
# Delete the feature cache directory (and everything in it); pipeline cells
# that find no cache files there fall back to re-extracting features.
!rm -rf /content/cremad_features


In [None]:
# ============================================================
#  CREMA-D SER pipeline (hand-crafted features + CNN/BiLSTM ensemble)
#  Improved version:
#   - Cache RAW features (no scaling)
#   - Scale using ONLY train set (no data leakage)
#   - Smaller, more regularized models to reduce overfitting
# ============================================================

import os
import glob
import numpy as np
import librosa
import librosa.display
import random
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.layers import (Input, Conv1D, MaxPooling1D, Dropout, BatchNormalization,
                                     Flatten, Dense, Bidirectional, LSTM, Average)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.regularizers import l2

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# -------------------------
# 2. Config
# -------------------------
# Audio source and segment selection
DATA_DIR = "/root/.cache/kagglehub/datasets/akaiinu/crema-d/versions/1/cremad/AudioWAV"
SAMPLE_RATE = 22050            # sample rate requested when loading audio
DURATION, OFFSET = 2.5, 0.6    # seconds kept (paper: 2.5s) / seconds skipped at the start

# Framing and feature settings
FRAME_LENGTH, HOP_LENGTH = 2048, 512   # window and hop, in samples
N_MFCC = 40

# Augmentation strengths
PITCH_STEPS, NOISE_ALPHA = 0.7, 0.035

# Training settings
RANDOM_SEED, BATCH_SIZE, EPOCHS = 42, 64, 100

# where to store cached RAW features
FEATURE_DIR = "/content/cremad_features"
os.makedirs(FEATURE_DIR, exist_ok=True)
X_RAW_PATH, Y_PATH, SPLIT_PATH, SCALER_PATH = (
    os.path.join(FEATURE_DIR, cache_name)
    for cache_name in (
        "X_raw_cremad.npy",
        "y_cremad.npy",
        "splits_cremad.npy",
        "scaler_mean_var.npz",   # optional: save scaler stats
    )
)

# Seed every random number generator used by the pipeline.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(RANDOM_SEED)

# Emotion code (third field of a CREMA-D file name) -> class index, and back.
EMOTION_MAP = {
    code: idx
    for idx, code in enumerate(("ANG", "DIS", "FEA", "HAP", "NEU", "SAD"))
}
IDX2EMO = {idx: code for code, idx in EMOTION_MAP.items()}

# -------------------------
# 3. Scan CREMA-D file list
#    Expect filenames like: 1001_DFA_ANG_XX.wav
# -------------------------
wav_paths = sorted(glob.glob(os.path.join(DATA_DIR, "*.wav")))
if not wav_paths:
    raise RuntimeError(f"No .wav found in {DATA_DIR}. Check your DATA_DIR path.")

# One record per usable file: its path, numeric speaker ID and emotion code.
records = []
for p in wav_paths:
    fname = os.path.basename(p)
    parts = fname.split("_")
    # Field 0 is the speaker ID, field 2 the emotion code (1001_DFA_ANG_XX.wav).
    try:
        speaker_id, emotion_code = int(parts[0]), parts[2]
    except Exception as e:
        print("Skipping file with unexpected name format:", fname, "error:", e)
        continue

    # Files whose emotion code is not one of the six known classes are dropped.
    if emotion_code in EMOTION_MAP:
        records.append(dict(path=p, speaker=speaker_id, emotion=emotion_code))

print(f"Found {len(records)} labeled files.")

# -------------------------
# 4. Speaker-wise split (speaker-independent)
#    80% train, 10% val, 10% test by speakers
# -------------------------
# Speakers are ordered by ID and cut at the 80% / 90% marks, so every speaker
# lands in exactly one split and the assignment is deterministic.
speakers = sorted({r["speaker"] for r in records})
n_spk = len(speakers)
print("Number of speakers:", n_spk)

train_end, val_end = int(0.8 * n_spk), int(0.9 * n_spk)
train_spk = set(speakers[:train_end])
val_spk = set(speakers[train_end:val_end])
test_spk = set(speakers[val_end:])

# Tag each record with the split of its speaker.
for r in records:
    r["split"] = (
        "train" if r["speaker"] in train_spk
        else "val" if r["speaker"] in val_spk
        else "test"
    )

print("Samples per split:")
for _label, _name in (("  Train:", "train"), ("  Val  :", "val"), ("  Test :", "test")):
    print(_label, sum(r["split"] == _name for r in records))

# -------------------------
# 5. Audio loading & augmentation
# -------------------------
def load_segment(path, sr=SAMPLE_RATE, duration=DURATION, offset=OFFSET):
    """
    Load an audio file and return a fixed-length excerpt of it.

    The file is decoded with ``librosa.load`` at sample rate `sr`. The excerpt
    starts `offset` seconds into the recording and is exactly
    ``int(duration * sr)`` samples long; recordings that end before the
    excerpt does are zero-padded at the end.

    Args:
        path: Path of the audio file to read.
        sr: Sample rate passed to ``librosa.load``.
        duration: Excerpt length in seconds.
        offset: Excerpt start, in seconds from the beginning of the file.

    Returns:
        Array of ``int(duration * sr)`` samples.

    Raises:
        ValueError: If `offset` or `duration` is negative.
    """
    if offset < 0 or duration < 0:
        raise ValueError(
            f"offset and duration must be non-negative, got offset={offset}, duration={duration}"
        )

    y, _ = librosa.load(path, sr=sr)
    start = int(offset * sr)
    end = start + int(duration * sr)

    # Zero-pad the tail so the slice below always covers [start, end).
    # With a non-negative start this makes the result exactly end - start
    # samples long, so no further length correction is needed.
    if len(y) < end:
        y = np.pad(y, (0, end - len(y)))

    return y[start:end]

def add_noise(y, alpha=NOISE_ALPHA):
    """Return `y` plus standard-normal noise scaled by `alpha` (one draw per sample)."""
    return y + alpha * np.random.randn(len(y))

def pitch_shift(y, sr=SAMPLE_RATE, n_steps=PITCH_STEPS):
    """Shift the pitch of `y` by `n_steps` steps via ``librosa.effects.pitch_shift``."""
    shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)
    return shifted

# -------------------------
# 6. Feature extraction (ZCR, RMSE, MFCC, Chroma STFT)
# -------------------------
def extract_single_features(y, sr=SAMPLE_RATE):
    """
    Compute the frame-level feature matrix of one signal.

    Rows are stacked in the order zero-crossing rate, RMS energy, MFCCs
    (N_MFCC rows) and chroma; every feature uses FRAME_LENGTH-sample windows
    and a HOP_LENGTH-sample hop.

    Returns:
        Array of shape (F, T): features along axis 0, frames along axis 1.
    """
    window_kw = dict(frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
    fft_kw = dict(n_fft=FRAME_LENGTH, hop_length=HOP_LENGTH)

    # Chroma is computed from the STFT magnitude.
    magnitude = np.abs(librosa.stft(y, **fft_kw))

    per_feature = [
        librosa.feature.zero_crossing_rate(y, **window_kw),           # ZCR
        librosa.feature.rms(y=y, **window_kw),                        # RMSE
        librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC, **fft_kw),    # MFCC
        librosa.feature.chroma_stft(S=magnitude, sr=sr),              # Chroma STFT
    ]

    # Each entry is (n_features, T); stack them along the feature axis.
    return np.concatenate(per_feature, axis=0)

def extract_augmented_features(path):
    """
    Extract features for a file and three augmented variants of it.

    The variants are: the original segment, segment + noise, pitch-shifted
    segment, and pitch-shifted segment + noise. Their feature matrices are
    concatenated along the feature axis.

    Args:
        path: Path of the audio file.

    Returns:
        Array of shape (F_total, T), where F_total is four times the number of
        rows returned by ``extract_single_features``.
    """
    y0 = load_segment(path)

    y_noise = add_noise(y0)
    y_pitch = pitch_shift(y0)
    # Reuse the already pitch-shifted signal instead of running the costly
    # pitch shift a second time; add_noise returns a new array, so y_pitch
    # itself is left untouched.
    y_np = add_noise(y_pitch)

    feats = [extract_single_features(sig) for sig in (y0, y_noise, y_pitch, y_np)]

    # Concatenate along feature axis
    return np.concatenate(feats, axis=0)  # (F_total, T)

# -------------------------
# 7. Build or load RAW dataset: X_raw (N, T, F), y (N,), splits (N,)
#    Uses X_RAW_PATH / Y_PATH / SPLIT_PATH exactly as defined in the config
#    section. They must NOT be re-pointed at "X_cremad.npy": that file name
#    is the cache of the earlier pipeline, which stores features already
#    standardized with statistics from all samples, so loading it here
#    would reintroduce the leakage this version is meant to remove.
# -------------------------
if os.path.exists(X_RAW_PATH) and os.path.exists(Y_PATH) and os.path.exists(SPLIT_PATH):
    print("Loading cached RAW features from disk...")
    X_raw = np.load(X_RAW_PATH)
    y = np.load(Y_PATH)
    splits = np.load(SPLIT_PATH)
    print("Loaded RAW:")
    print("  X_raw shape:", X_raw.shape)
    print("  y shape    :", y.shape)
else:
    print("Extracting RAW features (this may take a while...)")
    X_list = []
    y_list = []
    split_list = []

    for r in tqdm(records):
        feat_2d = extract_augmented_features(r["path"])  # (F, T)
        # Transpose to (T, F) for Keras Conv1D/LSTM
        feat_2d = feat_2d.T
        X_list.append(feat_2d)
        y_list.append(EMOTION_MAP[r["emotion"]])
        split_list.append(r["split"])

    # Check shape consistency (np.stack below fails if the shapes differ)
    shapes = {x.shape for x in X_list}
    if len(shapes) != 1:
        print("Warning: not all feature shapes are the same:", shapes)
    input_shape = X_list[0].shape  # (T, F)
    print("Input RAW feature shape (T, F):", input_shape)

    X_raw = np.stack(X_list, axis=0)  # (N, T, F)
    y = np.array(y_list)
    splits = np.array(split_list)

    print("X_raw shape:", X_raw.shape)
    print("y shape    :", y.shape)

    # Save RAW features only (no scaling)
    np.save(X_RAW_PATH, X_raw)
    np.save(Y_PATH, y)
    np.save(SPLIT_PATH, splits)
    print("Saved RAW features to:", FEATURE_DIR)

# -------------------------
# 8. Train/Val/Test split + scaling (ONLY on train)
# -------------------------
N, T, F = X_raw.shape
print("RAW feature shape (N, T, F):", X_raw.shape)

# Boolean masks selecting the samples of each split.
train_mask, val_mask, test_mask = (
    splits == split_name for split_name in ("train", "val", "test")
)

X_train_raw, X_val_raw, X_test_raw = X_raw[train_mask], X_raw[val_mask], X_raw[test_mask]
y_train, y_val, y_test = y[train_mask], y[val_mask], y[test_mask]

print("RAW splits:")
print("  Train:", X_train_raw.shape, y_train.shape)
print("  Val  :", X_val_raw.shape,   y_val.shape)
print("  Test :", X_test_raw.shape,  y_test.shape)

# Fit scaler on TRAIN only (no leakage): every training frame is one row.
X_train_2d = X_train_raw.reshape(-1, F)
scaler = StandardScaler()
scaler.fit(X_train_2d)

# Optionally save scaler stats
np.savez(SCALER_PATH, mean=scaler.mean_, var=scaler.var_)

# Apply scaling: flatten to (n*T, F), transform, restore the original shape.
X_train, X_val, X_test = (
    scaler.transform(part.reshape(-1, F)).reshape(part.shape)
    for part in (X_train_raw, X_val_raw, X_test_raw)
)

print("Scaled splits:")
print("  Train:", X_train.shape)
print("  Val  :", X_val.shape)
print("  Test :", X_test.shape)

# One-hot labels for the categorical cross-entropy loss.
num_classes = len(EMOTION_MAP)
y_train_cat, y_val_cat, y_test_cat = (
    tf.keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_val, y_test)
)

# -------------------------
# 9. Define models: smaller + more regularization
# -------------------------
def build_cnn_model(input_shape, num_classes):
    """
    Build the small, regularized 1-D CNN classifier (not compiled).

    Two Conv1D -> BatchNorm -> MaxPool -> Dropout(0.4) stages (64 then 128
    filters) feed a flattened Dense(128) layer and a softmax output. The
    Conv1D and hidden Dense kernels carry an L2 penalty of 1e-4.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras ``Model`` named "CNN_1D_small".
    """
    inputs = Input(shape=input_shape)

    x = inputs
    for n_filters in (64, 128):
        x = Conv1D(n_filters, kernel_size=5, padding="same", activation="relu",
                   kernel_regularizer=l2(1e-4))(x)
        x = BatchNormalization()(x)
        x = MaxPooling1D(pool_size=2)(x)
        x = Dropout(0.4)(x)

    # Classification head
    x = Flatten()(x)
    x = Dense(128, activation="relu", kernel_regularizer=l2(1e-4))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)
    outputs = Dense(num_classes, activation="softmax")(x)

    return Model(inputs, outputs, name="CNN_1D_small")

def build_cnn_bilstm_model(input_shape, num_classes):
    """
    Build the small, regularized CNN + bidirectional-LSTM classifier (not compiled).

    Two Conv1D -> BatchNorm -> MaxPool -> Dropout(0.4) stages (64 then 128
    filters) are followed by a Bidirectional LSTM(32) that returns only its
    final output, then Dense(128) and a softmax output. The Conv1D, LSTM and
    hidden Dense kernels carry an L2 penalty of 1e-4.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras ``Model`` named "CNN_BiLSTM_small".
    """
    inputs = Input(shape=input_shape)

    x = inputs
    for n_filters in (64, 128):
        x = Conv1D(n_filters, kernel_size=5, padding="same", activation="relu",
                   kernel_regularizer=l2(1e-4))(x)
        x = BatchNormalization()(x)
        x = MaxPooling1D(pool_size=2)(x)
        x = Dropout(0.4)(x)

    # Bi-LSTM over the pooled sequence
    recurrent = LSTM(32, return_sequences=False, kernel_regularizer=l2(1e-4))
    x = Bidirectional(recurrent)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)

    # Classification head
    x = Dense(128, activation="relu", kernel_regularizer=l2(1e-4))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.4)(x)
    outputs = Dense(num_classes, activation="softmax")(x)

    return Model(inputs, outputs, name="CNN_BiLSTM_small")

input_shape = (T, F)
cnn_model = build_cnn_model(input_shape, num_classes)
bilstm_model = build_cnn_bilstm_model(input_shape, num_classes)

# Ensemble: average the softmax outputs of both models.
# Both sub-models share one input and are trained jointly through the
# averaged output.
ens_input   = Input(shape=input_shape)
cnn_out     = cnn_model(ens_input)
bilstm_out  = bilstm_model(ens_input)
ens_out     = Average(name="ensemble_avg")([cnn_out, bilstm_out])
ensemble_model = Model(ens_input, ens_out, name="CNN_Ensemble")

# -------------------------
# 10. Compile & train (with LR scheduler + early stopping + checkpoint)
# -------------------------
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
ensemble_model.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

# Halve the learning rate after 3 epochs without a val_accuracy improvement.
lr_scheduler = ReduceLROnPlateau(
    monitor="val_accuracy",
    mode="max",          # stated explicitly, matching the checkpoint callback
    factor=0.5,
    patience=3,
    min_lr=1e-5,
    verbose=1
)

# Stop after 7 epochs without improvement and restore the best weights.
early_stop = EarlyStopping(
    monitor="val_accuracy",
    mode="max",
    patience=7,
    restore_best_weights=True,
    verbose=1
)

# Save the full model whenever val_accuracy reaches a new maximum.
checkpoint = ModelCheckpoint(
    filepath=os.path.join(FEATURE_DIR, "best_ensemble.h5"),
    monitor="val_accuracy",
    save_best_only=True,
    save_weights_only=False,
    mode="max",
    verbose=1
)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
ensemble_model.summary()

history = ensemble_model.fit(
    X_train, y_train_cat,
    validation_data=(X_val, y_val_cat),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[lr_scheduler, early_stop, checkpoint],
    verbose=1
)

# -------------------------
# 11. Evaluation on test set
# -------------------------
# Predicted class = index of the largest averaged softmax probability.
y_prob = ensemble_model.predict(X_test)
y_pred = y_prob.argmax(axis=1)

print("\n=== TEST RESULTS (speaker-independent) ===")
# Per-class precision / recall / F1, labelled with the emotion codes.
print(
    classification_report(
        y_test,
        y_pred,
        target_names=[IDX2EMO[class_idx] for class_idx in range(num_classes)],
    )
)
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))


Found 7442 labeled files.
Number of speakers: 91
Samples per split:
  Train: 5885
  Val  : 737
  Test : 820
Loading cached RAW features from disk...
Loaded RAW:
  X_raw shape: (7442, 108, 216)
  y shape    : (7442,)
RAW feature shape (N, T, F): (7442, 108, 216)
RAW splits:
  Train: (5885, 108, 216) (5885,)
  Val  : (737, 108, 216) (737,)
  Test : (820, 108, 216) (820,)
Scaled splits:
  Train: (5885, 108, 216)
  Val  : (737, 108, 216)
  Test : (820, 108, 216)


None
Epoch 1/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.2934 - loss: 2.0218
Epoch 1: val_accuracy improved from -inf to 0.37042, saving model to /content/cremad_features/best_ensemble.h5




[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 45ms/step - accuracy: 0.2939 - loss: 2.0205 - val_accuracy: 0.3704 - val_loss: 1.6665 - learning_rate: 0.0010
Epoch 2/100
[1m90/92[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.4253 - loss: 1.6308
Epoch 2: val_accuracy improved from 0.37042 to 0.43691, saving model to /content/cremad_features/best_ensemble.h5




[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.4256 - loss: 1.6301 - val_accuracy: 0.4369 - val_loss: 1.4966 - learning_rate: 0.0010
Epoch 3/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.4620 - loss: 1.5145
Epoch 3: val_accuracy improved from 0.43691 to 0.48982, saving model to /content/cremad_features/best_ensemble.h5




[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.4620 - loss: 1.5146 - val_accuracy: 0.4898 - val_loss: 1.4494 - learning_rate: 0.0010
Epoch 4/100
[1m90/92[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - accuracy: 0.4862 - loss: 1.4616
Epoch 4: val_accuracy did not improve from 0.48982
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.4864 - loss: 1.4611 - val_accuracy: 0.4830 - val_loss: 1.4375 - learning_rate: 0.0010
Epoch 5/100
[1m90/92[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - accuracy: 0.5313 - loss: 1.3724
Epoch 5: val_accuracy did not improve from 0.48982
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.5312 - loss: 1.3725 - val_accuracy: 0.4885 - val_loss: 1.4696 - learning_rate: 0.0010
Epoch 6/100
[1m91/92[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[



[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.5936 - loss: 1.2262 - val_accuracy: 0.5007 - val_loss: 1.4312 - learning_rate: 5.0000e-04
Epoch 9/100
[1m91/92[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 23ms/step - accuracy: 0.6090 - loss: 1.1937
Epoch 9: val_accuracy did not improve from 0.50068
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.6090 - loss: 1.1935 - val_accuracy: 0.4790 - val_loss: 1.4579 - learning_rate: 5.0000e-04
Epoch 10/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.6148 - loss: 1.1756
Epoch 10: val_accuracy did not improve from 0.50068
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 29ms/step - accuracy: 0.6149 - loss: 1.1754 - val_accuracy: 0.5007 - val_loss: 1.4567 - learning_rate: 5.0000e-04
Epoch 11/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━



[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - accuracy: 0.6792 - loss: 1.0348 - val_accuracy: 0.5020 - val_loss: 1.4961 - learning_rate: 1.2500e-04
Epoch 16/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.6945 - loss: 1.0087
Epoch 16: val_accuracy did not improve from 0.50204
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step - accuracy: 0.6945 - loss: 1.0085 - val_accuracy: 0.5007 - val_loss: 1.5027 - learning_rate: 1.2500e-04
Epoch 17/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.6917 - loss: 0.9933
Epoch 17: val_accuracy improved from 0.50204 to 0.50611, saving model to /content/cremad_features/best_ensemble.h5




[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.6917 - loss: 0.9932 - val_accuracy: 0.5061 - val_loss: 1.5028 - learning_rate: 1.2500e-04
Epoch 18/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.6979 - loss: 0.9910
Epoch 18: val_accuracy improved from 0.50611 to 0.50746, saving model to /content/cremad_features/best_ensemble.h5




[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.6979 - loss: 0.9909 - val_accuracy: 0.5075 - val_loss: 1.5039 - learning_rate: 1.2500e-04
Epoch 19/100
[1m91/92[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 32ms/step - accuracy: 0.6999 - loss: 0.9719
Epoch 19: val_accuracy improved from 0.50746 to 0.51560, saving model to /content/cremad_features/best_ensemble.h5




[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 35ms/step - accuracy: 0.7001 - loss: 0.9717 - val_accuracy: 0.5156 - val_loss: 1.5116 - learning_rate: 1.2500e-04
Epoch 20/100
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.7086 - loss: 0.9674
Epoch 20: val_accuracy improved from 0.51560 to 0.52239, saving model to /content/cremad_features/best_ensemble.h5




[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step - accuracy: 0.7087 - loss: 0.9673 - val_accuracy: 0.5224 - val_loss: 1.5125 - learning_rate: 1.2500e-04
Epoch 21/100
[1m91/92[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 23ms/step - accuracy: 0.7113 - loss: 0.9538
Epoch 21: val_accuracy did not improve from 0.52239
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.7113 - loss: 0.9536 - val_accuracy: 0.5007 - val_loss: 1.5302 - learning_rate: 1.2500e-04
Epoch 22/100
[1m91/92[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 23ms/step - accuracy: 0.7235 - loss: 0.9367
Epoch 22: val_accuracy did not improve from 0.52239
[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.7235 - loss: 0.9366 - val_accuracy: 0.5075 - val_loss: 1.5230 - learning_rate: 1.2500e-04
Epoch 23/100
[1m91/92[0m [32m━━━━━━━━━━━━━━