References:
- Google — "Conformer: Convolution-augmented Transformer for Speech Recognition"
- Apple — "Improving Vision-Inspired Keyword Spotting Using Dynamic Module Skipping in Streaming Conformer Encoder"

In [194]:
import os
import time
import psutil
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from model import Conformer
from model import EmotionClassifier
from model import GateLoss

In [195]:
# Set to True to force CPU execution even when a CUDA device is present.
use_cpu = False

# Use the GPU only when one is available and the CPU has not been forced.
device = torch.device(
    "cuda" if torch.cuda.is_available() and not use_cpu else "cpu"
)

print("Device:", device)

Device: cuda


In [196]:
# Parameters

# --- Task and optimisation ---
num_labels = 7
learning_rate = 0.001
epochs = 10
batch_size_train = 16
batch_size_inference = 1

# --- Conformer architecture ---
blocks = 16  # conformer blocks
dropout = 0.1  # affects the dropout all over the model
inputdim = 40  # based on filter banks from the mfbs
modeldim = 40  # hidden number of features the model works with
subsamplereduction = 4  # factor by which the time dimension is reduced at the start of the model
feedforwardexpansion = 4  # increases the hidden dimension in the feed forward modules
feedforwardweight = 0.5  # weight to ff modules
pointwiseexpansion = 2  # don't alter, brought back by glu
kernel_size = 31  # needs to be odd since we are using 'same' padding
attentionheads = 4  # feel free to change

# --- Gating ---
temperature = 0.2  # if temp is low, the probabilities take more extreme values
gatethreshold = 0.5  # if Prob of execute > gatethreshold, execute
gateregularizer = 0.2  # loss function parameter on gate_loss
gate_activation = 0.1  # determines when to turn gates on during training epochs

In [197]:
# Emotion label name -> integer class id; the id is the position in this tuple.
label_to_int = {
    label_name: class_id
    for class_id, label_name in enumerate(
        ("Anger", "Disgust", "Fear", "Happy", "Sad", "Neutral", "Background")
    )
}

def _load_split(data_dir, split, batch_size, shuffle):
    """Load one split from its ``.npy`` files and wrap it for torch.

    Reads ``<data_dir>/<split>_features.npy`` and ``<data_dir>/<split>_labels.npy``,
    maps every label through ``label_to_int`` and returns ``(dataset, loader)``.

    Raises:
        KeyError: If a label is not a key of ``label_to_int``.
    """
    features = np.load(os.path.join(data_dir, f"{split}_features.npy"))
    labels = np.load(os.path.join(data_dir, f"{split}_labels.npy"))

    # convert labels to integers so torch can use them
    label_ids = np.array([label_to_int[label] for label in labels])

    dataset = TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(label_ids, dtype=torch.long),
    )
    return dataset, DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


def load_data(data_dir="data"):
    """Build the training and evaluation data loaders.

    Args:
        data_dir: Directory holding the ``train``, ``testseen`` and ``testunseen``
            ``*_features.npy`` / ``*_labels.npy`` files. Defaults to ``"data"``.

    Returns:
        ``(train_loader, train_loader_test, testseen_loader, testunseen_loader)``.
        ``train_loader`` is shuffled with ``batch_size_train``; the other three are
        unshuffled with ``batch_size_inference``. ``train_loader_test`` iterates the
        same dataset as ``train_loader``.
    """
    train_dataset, train_loader = _load_split(
        data_dir, "train", batch_size_train, shuffle=True
    )
    # Second view of the training data at inference batch size (For Batch Size 1).
    train_loader_test = DataLoader(
        train_dataset, batch_size=batch_size_inference, shuffle=False
    )
    _, testseen_loader = _load_split(
        data_dir, "testseen", batch_size_inference, shuffle=False
    )
    _, testunseen_loader = _load_split(
        data_dir, "testunseen", batch_size_inference, shuffle=False
    )
    return train_loader, train_loader_test, testseen_loader, testunseen_loader

In [198]:
def initialize_model(gates_on):
    """Create the classifier, loss and optimizer from the notebook parameters.

    Args:
        gates_on: Passed to ``Conformer`` as its ``gates_on`` keyword argument.

    Returns:
        ``(model, criterion, optimizer)``: an ``EmotionClassifier`` wrapping a new
        ``Conformer``, a ``GateLoss`` built from ``gateregularizer``, and an Adam
        optimizer over the model's parameters with ``lr=learning_rate``.
    """
    # Handed to Conformer positionally, in exactly this order.
    conformer_args = (
        inputdim,
        modeldim,
        subsamplereduction,
        feedforwardexpansion,
        feedforwardweight,
        pointwiseexpansion,
        kernel_size,
        attentionheads,
        blocks,
        dropout,
        temperature,
        gatethreshold,
    )
    encoder = Conformer(*conformer_args, gates_on=gates_on)
    classifier = EmotionClassifier(encoder, num_labels, modeldim)
    loss_fn = GateLoss(gateregularizer)
    adam = torch.optim.Adam(classifier.parameters(), lr=learning_rate)
    return classifier, loss_fn, adam

In [199]:
def train_model(model, criterion, optimizer, train_loader, epochs, gate_activation):
    """Train ``model`` for ``epochs`` epochs, enabling the gates part-way through.

    Gates are enabled at the start of the epoch whose zero-based index equals
    ``int(epochs * gate_activation)``; if that index is ``>= epochs`` they are
    never enabled by this function. One summary line is printed per epoch.

    Args:
        model: Module with a ``conformer`` attribute providing ``enable_gates()``,
            ``reset_gate_values()`` and ``gate_values``.
        criterion: Called as ``criterion(outputs, labels, gate_values)``; must
            return a tensor supporting ``backward()`` and ``item()``.
        optimizer: Optimizer stepped once per batch.
        train_loader: Iterable of ``(inputs, labels)`` batches.
        epochs: Number of passes over ``train_loader``.
        gate_activation: Fraction of ``epochs`` after which gates are enabled.
    """
    model.train()
    # NOTE(review): the printed On/Off state assumes the model starts with its
    # gates disabled - confirm against how the model was constructed.
    gates_on = False
    gate_activation_epoch = int(epochs * gate_activation)
    max_mem_usage = 0.0
    for epoch in range(epochs):
        start_time = time.time()
        if epoch == gate_activation_epoch:
            model.conformer.enable_gates()
            gates_on = True
            # TODO: freeze every parameter that isn't part of the gate module
        current_loss = 0.0
        num_batches = 0
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            model.conformer.reset_gate_values()
            outputs = model(inputs)
            loss = criterion(outputs, labels, model.conformer.gate_values)
            loss.backward()
            optimizer.step()
            current_loss += loss.item()
            num_batches += 1
            # System-wide used RAM in GB (not just this process); the peak is
            # tracked across all epochs.
            max_mem_usage = max(max_mem_usage, psutil.virtual_memory().used / 1e9)
        epoch_time = time.time() - start_time
        # Count batches while iterating so that an empty loader reports nan
        # instead of raising ZeroDivisionError.
        mean_loss = current_loss / num_batches if num_batches else float("nan")
        print(
            f"Epoch {epoch + 1} Gates {'On' if gates_on else 'Off'}, Loss: {mean_loss:.2f}, Time: {epoch_time:.2f} seconds, Memory Usage: {max_mem_usage:.2f} GB"
        )

In [200]:
# Per-emotion counters keyed by integer class id, updated by evaluate_model.
skips = dict.fromkeys(range(7), 0)  # gate values at or below the gate threshold
module_count = dict.fromkeys(range(7), 0)  # gate values seen in total

# Integer class id -> readable emotion name.
labels_actual = dict(
    enumerate(
        ("Anger", "Disgust", "Fear", "Happy", "Sad", "Neutral", "Background")
    )
)

def evaluate_model(model, loader, name):
    """Evaluate ``model`` on ``loader`` and print accuracy and per-emotion skip rates.

    The module-level ``skips`` and ``module_count`` counters are reset at the
    start of every call, so after the call they describe this ``loader`` only.
    The skip rate is printed as a fraction (skipped / total gate values), not
    multiplied by 100.

    Args:
        model: Module with a ``conformer`` attribute providing
            ``reset_gate_values()`` and ``gate_values``.
        loader: Iterable of ``(features, label)`` batches. Each batch must hold
            exactly one sample because ``label.item()`` is used to pick the
            emotion bucket.
        name: Dataset name used as a prefix in the printed lines.
    """
    model.eval()
    # Without this reset the counters keep totals from earlier calls and the
    # percentages printed for one dataset are contaminated by the previous ones.
    for key in skips:
        skips[key] = 0
    for key in module_count:
        module_count[key] = 0
    correct = 0
    total = 0
    start_time = time.time()

    with torch.no_grad():
        for features, label in loader:  # Batch size always 1
            model.conformer.reset_gate_values()
            features = features.to(device)
            label = label.to(device)
            output = model(features)
            _, predicted = torch.max(output, 1)
            total += label.size(0)
            correct += (predicted == label).sum().item()

            # calculate skips: a gate value at or below the threshold is a skip
            emotion_key = label.item()
            gate_values = model.conformer.gate_values
            skips[emotion_key] += torch.sum(gate_values <= gatethreshold).item()
            module_count[emotion_key] += gate_values.numel()

    for i in range(num_labels):
        if module_count[i]:
            skip_percentage = skips[i] / module_count[i]
            print(f"{name} {labels_actual[i]} skip percentage: {skip_percentage:.2f}")
        else:
            # No gate values were recorded for this emotion in this loader.
            print(f"{name} {labels_actual[i]} skip percentage: n/a")
    # An empty loader reports nan instead of raising ZeroDivisionError.
    accuracy = 100 * correct / total if total else float("nan")
    epoch_time = time.time() - start_time
    print(f"Accuracy on {name} Data: {accuracy}, Time: {epoch_time:.2f} seconds")

TODO: extend `evaluate_model` to also report the number of modules skipped (it currently prints only the per-emotion skip percentage).

In [201]:
def count_parameters(model, long):
    """Print parameter counts for ``model``.

    The first line, labelled "Total Parameters", counts only parameters with
    ``requires_grad`` set. When ``long`` is truthy, one further line is printed
    for every entry of ``model.named_modules()`` with that module's total and
    trainable parameter counts.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    print(f"Total Parameters: {trainable_total}")

    if not long:
        return

    for name, module in model.named_modules():
        # Collect (size, trainable) once, then derive both sums from it.
        sizes = [(p.numel(), p.requires_grad) for p in module.parameters()]
        total_params = sum(size for size, _ in sizes)
        trainable_params = sum(size for size, trainable in sizes if trainable)
        print(
            f"{module.__class__.__name__} {name} - Total Parameters: {total_params}, Trainable Parameters: {trainable_params}"
        )

In [202]:
def save_weights(model, path="gated_conformer_model_weights.pth"):
    """Save ``model.state_dict()`` to ``path`` with ``torch.save``.

    Args:
        model: Module whose state dict is written.
        path: Destination file. Defaults to the checkpoint name this notebook
            has always used, so existing calls keep their behaviour.
    """
    torch.save(model.state_dict(), path)
    print(f"Model weights saved to {path}")

In [203]:
def load_weights(model, path="gated_conformer_model_weights.pth"):
    """Load a state dict from ``path`` into ``model``.

    Args:
        model: Module that receives the weights via ``load_state_dict``.
        path: Checkpoint file. Defaults to the checkpoint name this notebook
            has always used, so existing calls keep their behaviour.
    """
    # Map tensors onto the device the model already lives on. Without
    # map_location, a checkpoint saved from CUDA cannot be deserialized on a
    # machine where CUDA is unavailable.
    first_param = next(model.parameters(), None)
    map_location = first_param.device if first_param is not None else "cpu"
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    model.load_state_dict(torch.load(path, map_location=map_location))
    print(f"Model weights loaded from {path}")

### Testing:

In [204]:
# train_loader, train_loader_test, testseen_loader, testunseen_loader = load_data()
# model, criterion, optimizer = initialize_model(gates_on=True)
# count_parameters(model, long=False)
# with torch.no_grad():
#     for inputs, labels in train_loader:
#         outputs = model(inputs)
#         loss = criterion(outputs, labels, model.conformer.gate_values)

### Train model:

Current approach: train for a fixed number of epochs, with the gates enabled at a configured epoch (`int(epochs * gate_activation)`).

Things we can add:
- grid search over temperature [0.1, 0.5, 1, 5] and gate_activation [0, 0.25, 0.5, 0.75]
- k-fold cross-validation for each epoch
- freezing the non-gate parameters once the gates are enabled
- the full confusion matrix (right now we only report accuracy)

In [205]:
# End-to-end run: load data, build the model, train, save, then evaluate.
train_loader, train_loader_test, testseen_loader, testunseen_loader = load_data()
model, criterion, optimizer = initialize_model(gates_on=False) # gates are enabled during training based on gate_activation
model = model.to(device)
criterion = criterion.to(device)
count_parameters(model, long=False)
train_model(model, criterion, optimizer, train_loader, epochs, gate_activation)
save_weights(model)
evaluate_model(model, train_loader_test, "Train")
evaluate_model(model, testseen_loader, "Test Seen")
evaluate_model(model, testunseen_loader, "Test Unseen")

Total Parameters: 661900


KeyboardInterrupt: 