# Tutorial: Replacing nn.Linear with NdLinear in PyTorch

This tutorial demonstrates how to implement a custom NdLinear layer to replace nn.Linear for 2D tensor inputs. We'll compare performance between a traditional MLP and an NdLinear-based MLP on a speech emotion recognition task using the RAVDESS dataset.

### Setup and Hyperparameters

In [21]:
import os, glob, random
import numpy as np
import librosa
import soundfile
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Set seed for reproducibility: seed Python's, NumPy's and PyTorch's RNGs
# with the same value. `seed` is also reused as the random_state of the
# StratifiedKFold splitter in cross_validate_models.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Hyperparameters
EPOCHS = 100          # training epochs per model per fold (passed in by main)
BATCH_SIZE = 256      # mini-batch size (passed in by main)
LEARNING_RATE = 1e-2  # Adam learning rate used for both models
DROPOUT_RATE = 0.3    # dropout probability used for both models
N_SPLITS = 5          # number of cross-validation folds (passed in by main)

### Data Loading & Feature Extraction

For this tutorial, we’ll use RAVDESS, the Ryerson Audio-Visual Database of Emotional Speech and Song, which is free to download. Its 7356 files were rated by 247 individuals 10 times on emotional validity, intensity, and genuineness. The entire dataset is 24.8GB, recorded from 24 actors. Thanks to Data Flair, a smaller version with a lowered sample rate on all the files is available: https://data-flair.training/blogs/python-mini-project-speech-emotion-recognition/

You can download the audio files here: https://drive.google.com/file/d/1wWsrN2Ep7x6lWqOXfr4rpKGYrJhWc8z7/view


In [22]:
def extract_feature(file_name, mfcc=True, chroma=True, mel=True):
    """
    Extract a time-averaged feature vector from one audio file.

    The enabled feature groups are concatenated in the order MFCC, chroma,
    mel spectrogram; each group is averaged over time frames. With all three
    enabled the result is the 180-dimensional vector used throughout this
    tutorial: 40 MFCC, 12 Chroma, 128 Mel.

    Multi-channel recordings are down-mixed to mono before feature
    extraction: soundfile returns them as (frames, channels), which is not
    the layout librosa expects.

    Args:
        file_name: path of the audio file to read.
        mfcc: include the 40 MFCC means.
        chroma: include the chroma means (computed from the magnitude STFT).
        mel: include the mel-spectrogram band means.

    Returns:
        1-D NumPy array with the concatenated features.

    Raises:
        ValueError: if all three flags are False (nothing to concatenate).
    """
    # Only the raw samples and sample rate are needed from the file, so the
    # handle is released before the (comparatively slow) feature computation.
    with soundfile.SoundFile(file_name) as audio_file:
        X = audio_file.read(dtype="float32")
        sr = audio_file.samplerate

    if X.ndim > 1:
        # (frames, channels) -> (frames,): average the channels to mono.
        X = X.mean(axis=1)

    result = []
    if mfcc:
        result.append(np.mean(librosa.feature.mfcc(y=X, sr=sr, n_mfcc=40).T, axis=0))
    if chroma:
        # The magnitude STFT is only needed for the chroma features.
        stft = np.abs(librosa.stft(X))
        result.append(np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0))
    if mel:
        result.append(np.mean(librosa.feature.melspectrogram(y=X, sr=sr).T, axis=0))
    return np.hstack(result)  # shape = (180,) when all groups are enabled

# Maps the two-digit emotion code found in a RAVDESS file name (the third
# '-'-separated field, see load_data) to a human-readable label.
emotions = {
    '01':'neutral', '02':'calm', '03':'happy', '04':'sad',
    '05':'angry',   '06':'fearful', '07':'disgust', '08':'surprised'
}
# Only files labelled with one of these emotions are kept by load_data;
# the classification task is therefore 4-way.
observed = ['calm', 'happy', 'fearful', 'disgust']

def load_data(path):
    """
    Load .wav files from the RAVDESS dataset folder structure.

    Files are expected at ``<path>/Actor_*/*.wav`` with the emotion code as
    the third '-'-separated field of the file name. Only files whose emotion
    is listed in ``observed`` are loaded.

    Files are visited in sorted path order so that the sample order, and
    therefore any seeded train/test split built on it, is reproducible
    across machines and filesystems.

    Args:
        path: root folder containing the ``Actor_*`` sub-folders.

    Returns: (X, y) as NumPy arrays
        X holds one feature vector per kept file, y the emotion labels.
        Both are empty arrays when no file matches.
    """
    X, y = [], []
    pattern = os.path.join(path, "Actor_*/*.wav")
    # glob.glob returns paths in arbitrary (filesystem-dependent) order.
    for wav_path in sorted(glob.glob(pattern)):
        base = os.path.basename(wav_path)
        # The third position after splitting by '-' is the emotion code
        emotion_code = base.split("-")[2]
        emotion = emotions[emotion_code]
        if emotion in observed:
            feats = extract_feature(wav_path, mfcc=True, chroma=True, mel=True)
            X.append(feats)
            y.append(emotion)
    return np.array(X), np.array(y)

### Implementing the NdLinear Layer

In [23]:
class NdLinear(nn.Module):
    """
    Factorized linear layer for 2D feature maps.

    Instead of one dense map over the flattened D1*D2 features, each axis
    gets its own weight matrix and bias:
      (B, D1, D2) --W1,b1 on axis 1--> (B, H1, D2) --W2,b2 on axis 2--> (B, H1, H2)
    """
    def __init__(self, in_shape, out_shape):
        super().__init__()
        d1, d2 = in_shape
        h1, h2 = out_shape
        # Small Gaussian init for the weights, zero init for the biases.
        self.W1 = nn.Parameter(0.01 * torch.randn(d1, h1))
        self.W2 = nn.Parameter(0.01 * torch.randn(d2, h2))
        self.b1 = nn.Parameter(torch.zeros(h1))
        self.b2 = nn.Parameter(torch.zeros(h2))

    def forward(self, x):
        # x: (B, D1, D2)
        batch, d1, d2 = x.shape
        h1 = self.W1.size(1)
        h2 = self.W2.size(1)

        # Step 1: mix along D1. Move D1 last and fold (B, D2) into one axis
        # so a single 2D matmul handles every column of every sample.
        cols = x.transpose(1, 2).reshape(batch * d2, d1)    # (B*D2, D1)
        cols = torch.mm(cols, self.W1) + self.b1            # (B*D2, H1)

        # Step 2: mix along D2. Restore (B, H1, D2) and fold (B, H1).
        rows = cols.view(batch, d2, h1).transpose(1, 2)     # (B, H1, D2)
        rows = rows.reshape(batch * h1, d2)                 # (B*H1, D2)
        rows = torch.mm(rows, self.W2) + self.b2            # (B*H1, H2)

        return rows.view(batch, h1, h2)

### Reshaping Audio Features for NdLinear

In [24]:
def reshape_and_pad_features(x, target_cols=64):
    """
    Reshape the (180,) vector into (3, target_cols) by splitting it into:
      - first 40 => MFCC
      - next 12 => Chroma
      - last 128 => Mel
    Each group is zero-padded on the right, or truncated, to exactly
    target_cols values, then the three rows are stacked. With the default
    target_cols=64 the result has shape (3,64).

    Every group is fitted the same way, so any non-negative target_cols
    works (previously only the Mel group could be truncated, and values
    below 40 crashed inside np.pad).

    Raises:
        ValueError: if target_cols is negative.
    """
    if target_cols < 0:
        raise ValueError(f"target_cols must be non-negative, got {target_cols}")

    def _fit(group):
        # Pad short groups with trailing zeros; cut long ones.
        if len(group) >= target_cols:
            return group[:target_cols]
        return np.pad(group, (0, target_cols - len(group)), mode='constant')

    mfcc = x[0:40]
    chroma = x[40:52]
    mel = x[52:]
    return np.stack([_fit(mfcc), _fit(chroma), _fit(mel)], axis=0)  # (3, target_cols)

### Model Architectures

In [25]:
class AudioMLP(nn.Module):
    """
    Baseline MLP operating on flat feature vectors:
      Linear(input_dim -> hidden_dim) => ReLU => Dropout => Linear(hidden_dim -> out_dim)
    In this tutorial it is used with input_dim=180 and hidden_dim=300.
    """
    def __init__(self, input_dim, hidden_dim, out_dim, dropout_rate=0.3):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden_dim, out_dim)

    def forward(self, x):
        # x: (B, input_dim) => logits: (B, out_dim)
        hidden = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(hidden)

class AudioNdMLP(nn.Module):
    """
    Single-layer NdLinear-based MLP:
      in_shape -> NdLinear(out_shape=hidden_shape) => ReLU => Flatten => Dropout => FC => out_dim

    Args:
        out_dim: number of output classes (logits).
        dropout_rate: dropout probability applied before the classifier.
        in_shape: (D1, D2) shape of one input sample; defaults to (3, 64).
        hidden_shape: (H1, H2) output shape of the NdLinear layer; defaults
            to (10, 30), i.e. 300 hidden units.

    The classifier's input width is derived from hidden_shape rather than
    hard-coded, so the two can never get out of sync.
    """
    def __init__(self, out_dim, dropout_rate=0.3, in_shape=(3, 64), hidden_shape=(10, 30)):
        super().__init__()
        hidden_rows, hidden_cols = hidden_shape
        self.nd = NdLinear(in_shape=in_shape, out_shape=hidden_shape)
        self.fc = nn.Linear(hidden_rows * hidden_cols, out_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # x: (B, D1, D2)
        x = F.relu(self.nd(x))          # => (B, H1, H2)
        # flatten copes with non-contiguous inputs, unlike view.
        x = x.flatten(start_dim=1)      # => (B, H1*H2)
        x = self.dropout(x)
        return self.fc(x)               # => (B, out_dim)

### Training and Evaluation

In [26]:
def train_one_epoch(model, optimizer, loss_fn, x_train, y_train, batch_size):
    """
    Run one pass over the training set in shuffled mini-batches.

    The model is put into training mode, the samples are visited in a fresh
    random order, and one optimizer step is taken per mini-batch (the last
    batch may be smaller than batch_size).

    Returns the sum of the per-batch loss values (not their mean).
    """
    model.train()
    n_samples = x_train.size(0)
    shuffled = torch.randperm(n_samples)
    running_loss = 0.0
    for start in range(0, n_samples, batch_size):
        batch_idx = shuffled[start:start + batch_size]
        optimizer.zero_grad()
        batch_loss = loss_fn(model(x_train[batch_idx]), y_train[batch_idx])
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss

def evaluate(model, x_test, y_test):
    """
    Return the classification accuracy of model on (x_test, y_test).

    The model is switched to eval mode and the whole test set is scored in
    a single forward pass without gradient tracking; the predicted class is
    the argmax over the logits.
    """
    model.eval()
    with torch.no_grad():
        predicted = model(x_test).argmax(dim=1)
    # accuracy_score works on NumPy arrays, so move both tensors to the CPU.
    return accuracy_score(y_test.cpu().numpy(), predicted.cpu().numpy())

### K-Fold Cross Validation

In [27]:
from sklearn.model_selection import StratifiedKFold

def _fit_and_score(model, x_train, y_train, x_test, y_test, epochs, batch_size):
    """
    Train model with Adam + cross-entropy for `epochs` epochs and return its
    accuracy on the held-out test tensors.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss()
    for _ in range(epochs):
        train_one_epoch(model, optimizer, loss_fn, x_train, y_train, batch_size)
    return evaluate(model, x_test, y_test)


def cross_validate_models(X_raw, y_enc, n_splits=5, epochs=200, batch_size=64):
    """
    Perform cross-validation on both:
      1) Traditional MLP (flat input)
      2) NdLinear-based MLP (single NdLinear layer)

    Both models are trained from scratch on every fold of a stratified,
    shuffled K-fold split (seeded with the module-level `seed`) using the
    module-level LEARNING_RATE and DROPOUT_RATE. Per-fold and average
    accuracies are printed.

    Args:
        X_raw: (N, F) NumPy array of flat feature vectors.
        y_enc: (N,) NumPy array of integer-encoded labels.
        n_splits: number of folds.
        epochs: training epochs per model per fold.
        batch_size: mini-batch size.

    Returns average accuracies across folds as (avg_trad, avg_nd).
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # Fold-invariant quantities, computed once.
    n_classes = len(np.unique(y_enc))
    input_dim = X_raw.shape[1]  # 180 for the features built in this tutorial

    acc_trad_list = []
    acc_nd_list = []

    for fold_idx, (train_idx, test_idx) in enumerate(skf.split(X_raw, y_enc)):
        # Split data for this fold
        X_train, X_test = X_raw[train_idx], X_raw[test_idx]
        y_train_t = torch.tensor(y_enc[train_idx], dtype=torch.long)
        y_test_t = torch.tensor(y_enc[test_idx], dtype=torch.long)

        # 1) Traditional MLP - FLAT input
        model_trad = AudioMLP(input_dim=input_dim, hidden_dim=300, out_dim=n_classes, dropout_rate=DROPOUT_RATE)
        acc_trad = _fit_and_score(
            model_trad,
            torch.tensor(X_train, dtype=torch.float32), y_train_t,
            torch.tensor(X_test, dtype=torch.float32), y_test_t,
            epochs, batch_size,
        )

        # 2) NdLinear-based MLP - reshape input to (3,64)
        X_train_struct = np.array([reshape_and_pad_features(x) for x in X_train])
        X_test_struct = np.array([reshape_and_pad_features(x) for x in X_test])

        model_nd = AudioNdMLP(out_dim=n_classes, dropout_rate=DROPOUT_RATE)
        acc_nd = _fit_and_score(
            model_nd,
            torch.tensor(X_train_struct, dtype=torch.float32), y_train_t,
            torch.tensor(X_test_struct, dtype=torch.float32), y_test_t,
            epochs, batch_size,
        )

        acc_trad_list.append(acc_trad)
        acc_nd_list.append(acc_nd)

        print(f"Fold {fold_idx+1}/{n_splits} => Traditional MLP: {acc_trad:.2%}, NdMLP: {acc_nd:.2%}")

    avg_trad = np.mean(acc_trad_list)
    avg_nd = np.mean(acc_nd_list)
    print("===================================================")
    print(f"Average Traditional MLP Accuracy: {avg_trad:.2%}")
    print(f"Average NdLinear MLP Accuracy: {avg_nd:.2%}")
    return avg_trad, avg_nd

### Main Script

In [28]:
def main():
    """
    Load the RAVDESS features, encode the labels and run the K-fold
    comparison between the traditional MLP and the NdLinear-based MLP.

    Raises:
        FileNotFoundError: if no matching .wav file is found under
            data_path (e.g. the placeholder path was not edited).
    """
    # Path to your RAVDESS folder
    data_path = "/path/to/speech-emotion-recognition-ravdess-data"

    # Load data & encode labels
    X_raw, y_raw = load_data(data_path)
    print("Loaded data shape:", X_raw.shape)
    if len(X_raw) == 0:
        # Fail here with an actionable message instead of deep inside the
        # cross-validation splitter.
        raise FileNotFoundError(
            f"No matching .wav files found under {data_path!r}; "
            "set data_path to the folder containing the Actor_* directories."
        )
    le = LabelEncoder()
    y_enc = le.fit_transform(y_raw)

    # Perform K-Fold Cross Validation
    cross_validate_models(
        X_raw, y_enc,
        n_splits=N_SPLITS,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE
    )

if __name__ == "__main__":
    main()

Loaded data shape: (768, 180)
Fold 1/5 => Traditional MLP: 65.58%, NdMLP: 70.13%
Fold 2/5 => Traditional MLP: 64.94%, NdMLP: 69.48%
Fold 3/5 => Traditional MLP: 71.43%, NdMLP: 73.38%
Fold 4/5 => Traditional MLP: 64.71%, NdMLP: 77.12%
Fold 5/5 => Traditional MLP: 64.05%, NdMLP: 73.86%
Average Traditional MLP Accuracy: 66.14%
Average NdLinear MLP Accuracy: 72.79%


### Summary

We implemented a custom NdLinear layer as a structured alternative to nn.Linear.

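We compared it against a traditional MLP via 5-fold cross-validation; in the run above, the NdLinear model averaged 72.79% accuracy versus 66.14% for the traditional MLP.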

The NdLinear model can outperform traditional MLPs when the structure of the input is meaningful.

