# SincNet EmoDB: 10-Fold Cross-Validation Framework
This notebook implements a clean evaluation pipeline for SincNet-based models on the EmoDB dataset.
It compares:
1. **Basic SincNet**: The original architecture from Ravanelli et al. (2018).
2. **SincNet-LSTM-Attention**: An enhanced architecture for capturing temporal emotion dynamics.

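Evaluation Protocol: **Speaker-Independent 10-Fold Cross-Validation** (leave-one-speaker-out over the 10 EmoDB speakers: in each fold one speaker is held out for testing, the next speaker in the list is used for validation, and the remaining eight are used for training).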

In [1]:
import copy
import math
import os
import re
import warnings

import librosa
import numpy as np
import soundfile as sf
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import Dataset, DataLoader

# Select the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
# The module-level `device` is read later by the cross-validation driver.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
!git clone https://github.com/mathispernin/sincnet_dlts.git
%cd sincnet_dlts

Cloning into 'sincnet_dlts'...
remote: Enumerating objects: 544, done.[K
remote: Counting objects: 100% (544/544), done.[K
remote: Compressing objects: 100% (543/543), done.[K
remote: Total 544 (delta 0), reused 541 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (544/544), 40.63 MiB | 18.44 MiB/s, done.
/kaggle/working/sincnet_dlts


In [3]:
# --- CONFIGURATION ---
# NOTE(review): the visible training code uses its own literal defaults
# (epochs=40, batch size 32, lr 0.001) rather than reading NUM_EPOCHS,
# BATCH_SIZE and LEARNING_RATE - pass them explicitly if they are meant to apply.
DATA_PATH = 'data/'  # Ensure your EmoDB .wav files are in this folder
NUM_EPOCHS = 50
BATCH_SIZE = 32
LEARNING_RATE = 0.001

# EmoDB specific constants
# Two-digit speaker IDs; this is the leading group of every EmoDB file name.
ALL_SPEAKERS = ['03', '08', '09', '10', '11', '12', '13', '14', '15', '16']
# Single-letter emotion code taken from the file name -> integer class index.
# EmoDB uses German initials: W=Wut, L=Langeweile, E=Ekel, A=Angst, F=Freude, T=Trauer, N=Neutral.
EMOTION_MAP = {
    'W': 0, 'L': 1, 'E': 2, 'A': 3, 'F': 4, 'T': 5, 'N': 6
}
# Human-readable class names; position i corresponds to class index i in EMOTION_MAP.
EMOTION_NAMES = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Sadness', 'Neutral']

## Load Data

In [4]:
# --- AUGMENTATION FUNCTIONS ---
def augment_noise(data):
    """Return `data` plus Gaussian white noise of random strength.

    The noise amplitude is 0.005 * u * max(data), with u drawn uniformly
    from [0, 1); one noise sample is generated per element along axis 0.
    """
    peak = np.amax(data)
    amplitude = 0.005 * np.random.uniform() * peak
    white = np.random.normal(size=data.shape[0])
    return data + amplitude * white

def augment_pitch(data, sr=16000):
    """Pitch-shift `data` by a random number of steps drawn uniformly from [-2, 2).

    `sr` is the sampling rate handed to librosa; the step count is passed
    as `n_steps` to `librosa.effects.pitch_shift`.
    """
    random_steps = np.random.uniform(-2, 2)
    shifted = librosa.effects.pitch_shift(y=data, sr=sr, n_steps=random_steps)
    return shifted

def augment_speed(data):
    """Time-stretch `data` by a random rate drawn uniformly from [0.9, 1.1).

    The rate is forwarded to `librosa.effects.time_stretch`, so the returned
    signal generally has a different length than the input.
    """
    rate = np.random.uniform(0.9, 1.1)
    stretched = librosa.effects.time_stretch(y=data, rate=rate)
    return stretched

# --- FILE PARSING UTILS ---
def get_files_for_speakers(speaker_list, data_path):
    """List the EmoDB .wav files in `data_path` recorded by the given speakers.

    Args:
        speaker_list: Collection of two-digit speaker IDs (strings), e.g. ['03', '08'].
        data_path: Directory that contains the EmoDB .wav files.

    Returns:
        File names (not full paths) whose whole name follows the EmoDB scheme
        and whose speaker ID is in `speaker_list`, sorted alphabetically so the
        result does not depend on the file system's listing order. An empty
        list is returned (after printing an error) when `data_path` is missing.
    """
    if not os.path.exists(data_path):
        print(f"ERROR: Path '{data_path}' does not exist!")
        return []

    # EmoDB format (e.g., 03a01Fa.wav): speaker, text code, emotion letter, optional version.
    pattern = re.compile(r'(\d{2})([a-z]\d{2})([A-Z])([a-z]?)\.wav')

    target_files = []
    # sorted(): os.listdir order is arbitrary, which would make runs non-reproducible.
    for name in sorted(os.listdir(data_path)):
        # fullmatch(): reject names that merely start with a valid EmoDB name
        # (e.g. '03a01Fa.wav.bak.wav').
        m = pattern.fullmatch(name)
        if m and m.group(1) in speaker_list:
            target_files.append(name)
    return target_files

In [5]:
class EmoDBDataset(Dataset):
    """Fixed-length raw-waveform dataset over a list of EmoDB .wav files.

    Each item is a `(waveform, label)` pair: `waveform` is a float32 tensor of
    shape (1, max_len) and `label` is the integer class index looked up in
    `EMOTION_MAP` from the emotion letter embedded in the file name.

    Args:
        data_path: Directory containing the .wav files.
        file_list: File names (relative to `data_path`) served by this dataset.
        max_len: Number of samples every returned waveform is padded/cropped to.
        augment: When True, one of {noise, pitch shift, speed change, nothing}
            is applied at random and over-long signals are cropped at a random
            offset; when False the leading `max_len` samples are used.
    """

    # EmoDB file-name scheme; group 3 is the single-letter emotion code.
    _NAME_PATTERN = re.compile(r'(\d{2})([a-z]\d{2})([A-Z])([a-z]?)\.wav')

    def __init__(self, data_path, file_list, max_len=48000, augment=False):
        self.data_path = data_path
        self.files = file_list
        self.max_len = max_len
        self.augment = augment

    def __len__(self):
        return len(self.files)

    def _augment(self, wav, sr, file_name):
        """Apply one randomly chosen augmentation; fall back to `wav` on failure."""
        aug_choice = np.random.randint(0, 4)
        try:
            if aug_choice == 0:
                return augment_noise(wav)
            if aug_choice == 1:
                return augment_pitch(wav, sr)
            if aug_choice == 2:
                return augment_speed(wav)
            # Choice 3 is "no augmentation"
        except Exception as exc:
            # Best effort: keep training on the clean signal, but do not hide the failure.
            warnings.warn(
                f"Augmentation failed for {file_name}: {exc!r}; using the un-augmented signal."
            )
        return wav

    def _fix_length(self, wav):
        """Zero-pad at the end or crop `wav` so that it has exactly `max_len` samples."""
        n_samples = len(wav)
        if n_samples < self.max_len:
            return np.pad(wav, (0, self.max_len - n_samples), 'constant')

        excess = n_samples - self.max_len
        if self.augment and excess > 0:
            # Random crop for training. The upper bound of randint is exclusive,
            # so `excess + 1` is needed to make the last valid offset reachable.
            start = np.random.randint(0, excess + 1)
        else:
            # Deterministic leading crop for validation/testing.
            start = 0
        return wav[start : start + self.max_len]

    def __getitem__(self, idx):
        """Load, optionally augment, and length-normalise the `idx`-th file.

        Raises:
            ValueError: If the file name does not follow the EmoDB scheme, so
                no emotion label can be derived from it.
            KeyError: If the emotion letter is not a key of `EMOTION_MAP`.
        """
        file_name = self.files[idx]
        file_path = os.path.join(self.data_path, file_name)

        # Resolve the label first so a bad file name fails before any audio I/O.
        m = self._NAME_PATTERN.match(file_name)
        if m is None:
            # Silently defaulting to class 0 would corrupt training and evaluation labels.
            raise ValueError(f"Cannot parse an EmoDB emotion code from file name {file_name!r}")
        label = EMOTION_MAP[m.group(3)]

        # Load audio
        wav, sr = sf.read(file_path)
        wav = wav.astype(np.float32)
        if wav.ndim > 1:
            # soundfile returns (frames, channels) for multi-channel files; down-mix to mono.
            wav = wav.mean(axis=1)

        if self.augment:
            wav = self._augment(wav, sr, file_name)

        wav = self._fix_length(wav)

        # Augmentations may return float64; always hand float32 to the model.
        return torch.tensor(wav, dtype=torch.float32).unsqueeze(0), torch.tensor(label)

## Model Architectures

In [6]:
# --- 1. SincConv Layer ---
class SincConv_fast(nn.Module):
    """1-D convolution whose kernels are parametrised, windowed band-pass sinc filters.

    Only the lower cut-off frequency and the bandwidth of every filter are
    learned (`low_hz_`, `band_hz_`); the kernels are rebuilt from them on each
    forward pass. Cut-offs are initialised equally spaced on the mel scale.

    Args:
        out_channels: Number of band-pass filters.
        kernel_size: Filter length in samples; even values are increased by one
            so that the kernel is symmetric around a centre tap.
        sample_rate: Sampling rate of the input waveform in Hz.
        in_channels: Must be 1.
        stride, padding, dilation: Forwarded to `F.conv1d`.
        min_low_hz: Smallest allowed lower cut-off frequency in Hz.
        min_band_hz: Smallest allowed bandwidth in Hz.

    Raises:
        ValueError: If `in_channels` is not 1.
    """

    @staticmethod
    def to_mel(hz):
        """Convert a frequency in Hz to the mel scale."""
        return 2595 * np.log10(1 + hz / 700)

    @staticmethod
    def to_hz(mel):
        """Convert a mel-scale value back to Hz (inverse of `to_mel`)."""
        return 700 * (10 ** (mel / 2595) - 1)

    def __init__(self, out_channels, kernel_size, sample_rate=16000, in_channels=1,
                 stride=1, padding=0, dilation=1, min_low_hz=50, min_band_hz=50):
        super().__init__()

        if in_channels != 1:
            raise ValueError("SincConv only supports one input channel.")

        self.out_channels = out_channels
        # Force odd kernel size
        self.kernel_size = kernel_size + 1 if kernel_size % 2 == 0 else kernel_size

        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.sample_rate = sample_rate
        self.min_low_hz = min_low_hz
        self.min_band_hz = min_band_hz

        # Initialize filterbanks (Mel-scale)
        low_hz = 30
        high_hz = self.sample_rate / 2 - (self.min_low_hz + self.min_band_hz)

        mel = np.linspace(self.to_mel(low_hz), self.to_mel(high_hz), self.out_channels + 1)
        hz = self.to_hz(mel)

        # Learnable parameters, shape (out_channels, 1): lower edges and band widths.
        self.low_hz_ = nn.Parameter(torch.tensor(hz[:-1], dtype=torch.float32).view(-1, 1))
        self.band_hz_ = nn.Parameter(torch.tensor(np.diff(hz), dtype=torch.float32).view(-1, 1))

        # Constant helpers are registered as non-persistent buffers: they follow
        # .to()/.cuda()/.double() together with the parameters, while the
        # state_dict keys stay exactly {'low_hz_', 'band_hz_'}.
        # Hamming window
        self.register_buffer(
            "window_", torch.hamming_window(self.kernel_size, periodic=False), persistent=False
        )
        # 2*pi*t for the (kernel_size - 1) / 2 taps left of the centre, shape (1, half).
        n = (self.kernel_size - 1) / 2.0
        self.register_buffer(
            "n_", 2 * math.pi * torch.arange(-n, 0).view(1, -1) / self.sample_rate,
            persistent=False,
        )

    def forward(self, waveforms):
        """Filter `waveforms` with the current band-pass filter bank.

        Args:
            waveforms: Tensor of shape (batch, 1, n_samples).

        Returns:
            Tensor of shape (batch, out_channels, n_samples_out). The kernels
            used are also stored in `self.filters`, shape (out_channels, kernel_size).
        """
        # abs() plus the minimum offsets keep the cut-offs positive and ordered.
        low = self.min_low_hz + torch.abs(self.low_hz_)
        high = torch.clamp(low + self.min_band_hz + torch.abs(self.band_hz_), self.min_low_hz, self.sample_rate/2)
        band = (high - low)[:, 0]

        f_times_t_low = torch.matmul(low, self.n_)
        f_times_t_high = torch.matmul(high, self.n_)

        # Band-pass = difference of two low-pass sincs; only the left half is
        # computed, the right half is its mirror image.
        band_pass_left = ((torch.sin(f_times_t_high) - torch.sin(f_times_t_low)) / (self.n_ / 2))
        band_pass_center = 2 * band.view(-1, 1)
        band_pass_right = torch.flip(band_pass_left, dims=[1])

        band_pass = torch.cat([band_pass_left, band_pass_center, band_pass_right], dim=1)
        # Normalise so that the centre tap equals 1 before windowing.
        band_pass = band_pass / (2 * band[:, None])

        self.filters = (band_pass * self.window_)

        return F.conv1d(waveforms, self.filters.view(self.out_channels, 1, self.kernel_size),
                        stride=self.stride, padding=self.padding, dilation=self.dilation,
                        bias=None, groups=1)

# --- 2. Attention Layer ---
class Attention(nn.Module):
    """Attention pooling over dimension 1 of a (batch, steps, hidden_dim) tensor.

    A single linear layer scores every step; the scores are soft-maxed across
    the steps and used as weights for a weighted sum of the inputs.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.attention = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return the attention-weighted sum of `x` over dim 1, shape (batch, hidden_dim)."""
        step_weights = torch.softmax(self.attention(x), dim=1)
        return (x * step_weights).sum(dim=1)

In [7]:
class SincNet_Basic(nn.Module):
    """
    Baseline classifier: SincConv front-end, two plain Conv1d stages (each
    followed by BatchNorm, leaky ReLU and max-pooling by 3), global average
    over time, then a two-layer MLP that outputs `num_classes` logits.
    """
    def __init__(self, num_classes=7):
        super().__init__()

        # Learnable band-pass front-end operating on the raw waveform.
        self.sinc_conv = SincConv_fast(out_channels=80, kernel_size=251, sample_rate=16000)
        self.pool = nn.MaxPool1d(3)  # shared by all three convolutional stages
        self.bn0 = nn.BatchNorm1d(80)

        self.conv2 = nn.Conv1d(80, 60, kernel_size=5)
        self.bn2 = nn.BatchNorm1d(60)

        self.conv3 = nn.Conv1d(60, 60, kernel_size=5)
        self.bn3 = nn.BatchNorm1d(60)

        # Classifier head on the time-averaged 60-channel features.
        self.fc1 = nn.Linear(60, 256)
        self.bn_fc1 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Map waveforms of shape (Batch, 1, Seq) to logits of shape (Batch, num_classes)."""
        feats = self.pool(F.leaky_relu(self.bn0(self.sinc_conv(x))))

        for conv, norm in ((self.conv2, self.bn2), (self.conv3, self.bn3)):
            feats = self.pool(F.leaky_relu(norm(conv(feats))))

        pooled = feats.mean(dim=2)  # global average over the time axis

        hidden = F.leaky_relu(self.bn_fc1(self.fc1(pooled)))
        return self.fc2(hidden)

In [8]:
# --- 3. Full Model ---
class SincNetLSTM_Attention(nn.Module):
    """SincConv front-end, two Conv1d stages, a bidirectional LSTM, attention
    pooling over time and a small MLP head producing `num_classes` logits."""

    def __init__(self, num_classes=7):
        super().__init__()

        # Frontend: SincNet
        self.sinc_conv = SincConv_fast(out_channels=80, kernel_size=251, sample_rate=16000)
        self.bn0 = nn.BatchNorm1d(80)
        self.pool0 = nn.MaxPool1d(3)

        # Convolutional stage 1 (length-preserving conv, then pooling by 2)
        self.conv1 = nn.Conv1d(80, 64, kernel_size=5, padding=2)
        self.bn1 = nn.BatchNorm1d(64)
        self.pool1 = nn.MaxPool1d(2)
        self.drop1 = nn.Dropout(0.3)

        # Convolutional stage 2
        self.conv2 = nn.Conv1d(64, 128, kernel_size=5, padding=2)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(2)
        self.drop2 = nn.Dropout(0.3)

        # Backend: bidirectional LSTM, so its output width is 2 * 128 = 256
        self.lstm = nn.LSTM(input_size=128, hidden_size=128, num_layers=1, batch_first=True, bidirectional=True)
        self.drop_lstm = nn.Dropout(0.3)

        # Attention pooling over the LSTM outputs
        self.attention = Attention(hidden_dim=256)

        # Classifier
        self.fc1 = nn.Linear(256, 64)
        self.bn_fc1 = nn.BatchNorm1d(64)
        self.drop_fc = nn.Dropout(0.4)
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        """Map waveforms of shape (Batch, 1, Seq) to logits of shape (Batch, num_classes)."""
        feats = self.pool0(F.relu(self.bn0(self.sinc_conv(x))))
        feats = self.drop1(self.pool1(F.relu(self.bn1(self.conv1(feats)))))
        feats = self.drop2(self.pool2(F.relu(self.bn2(self.conv2(feats)))))

        # The LSTM is batch_first: reorder to (Batch, Time, Feats).
        sequence = feats.permute(0, 2, 1)

        self.lstm.flatten_parameters()
        sequence, _ = self.lstm(sequence)

        pooled = self.attention(self.drop_lstm(sequence))

        hidden = self.drop_fc(F.relu(self.bn_fc1(self.fc1(pooled))))
        return self.fc2(hidden)

## Training

In [9]:
def train_one_fold(model_class, train_loader, val_loader, device, num_epochs=40,
                   lr=0.001, weight_decay=1e-4):
    """Train a fresh model and return the weights that scored best on validation.

    Args:
        model_class: Callable accepting `num_classes=` and returning an `nn.Module`.
        train_loader: Iterable of `(inputs, labels)` training batches.
        val_loader: Iterable of `(inputs, labels)` validation batches; may be empty.
        device: Device on which the model and the batches are placed.
        num_epochs: Number of passes over `train_loader`.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.

    Returns:
        `(best_weights, best_acc)`: a deep-copied state dict and the matching
        validation accuracy in [0, 1]. When no validation sample was ever seen,
        the weights after the last epoch are returned together with 0.0.
    """
    # Instantiate fresh model
    model = model_class(num_classes=7).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    # The learning rate is halved when the mean validation loss stops improving.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=4)
    criterion = nn.CrossEntropyLoss()

    # Sentinel below any reachable accuracy: the first validated epoch is always
    # recorded, so a model that stays at 0% accuracy is still a *trained* model
    # rather than the random initialisation.
    best_acc = float('-inf')
    best_weights = None

    for _ in range(num_epochs):
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()

        model.eval()
        correct, total, val_loss = 0, 0, 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                # Weight the batch-mean loss by batch size to get a per-sample average.
                val_loss += criterion(outputs, labels).item() * inputs.size(0)
                _, pred = torch.max(outputs, 1)
                correct += (pred == labels).sum().item()
                total += labels.size(0)

        if total > 0:
            acc = correct / total
            scheduler.step(val_loss / total)
            if acc > best_acc:
                best_acc = acc
                best_weights = copy.deepcopy(model.state_dict())

    if best_weights is None:
        # No validation data (or zero epochs): model selection is impossible,
        # so hand back the final weights instead of the untrained initial ones.
        best_weights = copy.deepcopy(model.state_dict())
        best_acc = 0.0

    return best_weights, best_acc

In [14]:
def run_cross_validation(model_class, model_name="Model", epochs=40, batch_size=32):
    """Run leave-one-speaker-out cross-validation over `ALL_SPEAKERS`.

    In fold i, speaker i is the test set, the next speaker in the list
    (wrapping around) is the validation set used for model selection, and all
    remaining speakers form the augmented training set.

    Args:
        model_class: Model constructor accepting `num_classes=`.
        model_name: Label used in the printed report.
        epochs: Number of training epochs per fold.
        batch_size: Batch size of the train, validation and test loaders.

    Returns:
        List with the test accuracy of every fold that was actually run; it is
        empty when no fold had both training and test files.
    """
    print(f"\n[START] Cross-Validation for: {model_name}")
    print("="*50)

    fold_accuracies = []
    n_folds = len(ALL_SPEAKERS)

    for i, test_speaker in enumerate(ALL_SPEAKERS):
        # Split Strategy: the speaker following the test speaker is used for validation.
        val_speaker = ALL_SPEAKERS[(i + 1) % n_folds]
        train_speakers = [s for s in ALL_SPEAKERS if s != test_speaker and s != val_speaker]

        # Get Files
        train_files = get_files_for_speakers(train_speakers, DATA_PATH)
        val_files   = get_files_for_speakers([val_speaker], DATA_PATH)
        test_files  = get_files_for_speakers([test_speaker], DATA_PATH)

        if not train_files or not test_files:
            # Report the skip so a missing data folder does not go unnoticed.
            print(f"Fold {i+1}/{n_folds} (Test: {test_speaker}) ... skipped (no train or test files found)")
            continue
        if not val_files:
            print(f"WARNING: no validation files found for speaker {val_speaker}")

        # Create Loaders (drop_last avoids a trailing partial training batch).
        train_dl = DataLoader(EmoDBDataset(DATA_PATH, train_files, augment=True),
                              batch_size=batch_size, shuffle=True, drop_last=True)
        val_dl   = DataLoader(EmoDBDataset(DATA_PATH, val_files), batch_size=batch_size)
        test_dl  = DataLoader(EmoDBDataset(DATA_PATH, test_files), batch_size=batch_size)

        # Train
        print(f"Fold {i+1}/{n_folds} (Test: {test_speaker}) ... ", end="")
        best_weights, val_acc = train_one_fold(model_class, train_dl, val_dl, device, epochs)

        # Test with a fresh model carrying the selected weights.
        model = model_class(num_classes=7).to(device)
        model.load_state_dict(best_weights)
        model.eval()

        preds, truths = [], []
        with torch.no_grad():
            for inputs, labels in test_dl:
                logits = model(inputs.to(device))
                preds.extend(logits.argmax(dim=1).cpu().tolist())
                truths.extend(labels.tolist())

        acc = accuracy_score(truths, preds)
        fold_accuracies.append(acc)
        print(f"Test Acc: {acc*100:.2f}% (Val: {val_acc*100:.2f}%)")

    print("="*50)
    if fold_accuracies:
        avg = np.mean(fold_accuracies)
        std = np.std(fold_accuracies)
        print(f"[{model_name}] Final Accuracy: {avg*100:.2f}% (+/- {std*100:.2f}%)")
    else:
        # np.mean([]) would only produce nan plus a RuntimeWarning.
        print(f"[{model_name}] No folds were evaluated - check DATA_PATH ('{DATA_PATH}').")
    print("="*50)
    return fold_accuracies

In [None]:
# 1. Cross-validate the basic SincNet baseline; returns one test accuracy per fold.
acc_basic = run_cross_validation(SincNet_Basic, "Basic SincNet", epochs=40)

# 2. Cross-validate the advanced SincNet (LSTM + Attention) with the same protocol.
acc_advanced = run_cross_validation(SincNetLSTM_Attention, "SincNet+LSTM+Attn", epochs=40)


[START] Cross-Validation for: Basic SincNet
Fold 1/10 (Test: 03) ... Test Acc: 46.94% (Val: 77.59%)
Fold 2/10 (Test: 08) ... Test Acc: 62.07% (Val: 55.81%)
Fold 3/10 (Test: 09) ... Test Acc: 27.91% (Val: 84.21%)
Fold 4/10 (Test: 10) ... Test Acc: 73.68% (Val: 58.18%)
Fold 5/10 (Test: 11) ... Test Acc: 54.55% (Val: 62.86%)
Fold 6/10 (Test: 12) ... Test Acc: 60.00% (Val: 67.21%)
Fold 7/10 (Test: 13) ... Test Acc: 50.82% (Val: 63.77%)
Fold 8/10 (Test: 14) ... Test Acc: 47.83% (Val: 58.93%)
Fold 9/10 (Test: 15) ... Test Acc: 51.79% (Val: 60.56%)
Fold 10/10 (Test: 16) ... Test Acc: 52.11% (Val: 59.18%)
[Basic SincNet] Final Accuracy: 52.77% (+/- 11.23%)

[START] Cross-Validation for: SincNet+LSTM+Attn
Fold 1/10 (Test: 03) ... Test Acc: 46.94% (Val: 67.24%)
Fold 2/10 (Test: 08) ... Test Acc: 58.62% (Val: 48.84%)
Fold 3/10 (Test: 09) ... Test Acc: 32.56% (Val: 84.21%)
Fold 4/10 (Test: 10) ... 