# KEYWORD SPOTTING

# Installations

In [None]:
%%capture
!git clone https://github.com/speechbrain/speechbrain.git
%cd speechbrain
!pip install -r requirements.txt
!pip install .
%cd ..

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Prepare dataset

In [None]:
%%file prepare_GSC.py

"""
Data preparation for Google Speech Commands v0.02.

Download: http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz

Author
------
David Raby-Pepin 2021

"""

import os
from os import walk
import glob
import shutil
import logging
import torch
import re
import hashlib
import copy
import numpy as np
from speechbrain.utils.data_utils import download_file
from speechbrain.dataio.dataio import read_audio

try:
    import pandas as pd
except ImportError:
    err_msg = (
        "The optional dependency pandas must be installed to run this recipe.\n"
    )
    err_msg += "Install using `pip install pandas`.\n"
    raise ImportError(err_msg)

logger = logging.getLogger(__name__)

GSC_URL = "http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz"

# All words (i.e. classes) of the GSC v2 dataset. prepare_GSC joins each word
# with data_folder to locate that word's audio files, and treats every word
# that is not listed in `words_wanted` as an "unknown" sample.
all_words = [
    "yes",
    "no",
    "up",
    "down",
    "left",
    "right",
    "on",
    "off",
    "stop",
    "go",
    "zero",
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "bed",
    "bird",
    "cat",
    "dog",
    "happy",
    "house",
    "marvin",
    "sheila",
    "tree",
    "wow",
    "backward",
    "forward",
    "follow",
    "learn",
    "visual",
]


def prepare_GSC(
    data_folder,
    save_folder,
    validation_percentage=10,
    testing_percentage=10,
    percentage_unknown=10,
    percentage_silence=10,
    words_wanted=None,
    skip_prep=False,
):
    """
    Prepares the Google Speech Commands V2 dataset.

    Builds one annotation file per split (train.csv, valid.csv, test.csv)
    inside `save_folder`. Samples of the wanted words keep their word as
    label, a random subset of all other words is kept with the label
    "unknown", and (optionally) "silence" samples are cut from the
    background-noise recordings.

    Arguments
    ---------
    data_folder : str
        path to dataset. If not present, it will be downloaded here.
    save_folder: str
        folder where to store the data manifest files. Created if missing.
    validation_percentage: int
        How much of the data set to use for validation.
    testing_percentage: int
        How much of the data set to use for testing.
    percentage_unknown: int.
        How much data outside of the known (i.e wanted) words to preserve; relative to the total number of known words.
    percentage_silence: int
        How many silence samples to generate; relative to the total number of known words.
    words_wanted: list
        The list of commands to use from the dataset. Defaults to the ten
        core commands ("yes", "no", "up", "down", "left", "right", "on",
        "off", "stop", "go") when None.
    skip_prep: bool
        If True, skip data preparation.

    Returns
    -------
    None

    Example
    -------
    >>> data_folder = '/path/to/GSC'
    >>> save_folder = '/path/to/manifests'
    >>> prepare_GSC(data_folder, save_folder)
    """

    if skip_prep:
        return

    # A None sentinel avoids sharing one mutable default list across calls.
    if words_wanted is None:
        words_wanted = [
            "yes",
            "no",
            "up",
            "down",
            "left",
            "right",
            "on",
            "off",
            "stop",
            "go",
        ]
    else:
        words_wanted = list(words_wanted)

    # If the data folders do not exist, we need to extract the data
    # NOTE(review): the GSC archive does not appear to ship a "train-synth"
    # folder, so this branch likely runs on every call — confirm the intended
    # sentinel before changing it.
    if not os.path.isdir(os.path.join(data_folder, "train-synth")):
        # Check for the archive and download it if it doesn't exist
        tar_location = os.path.join(data_folder, "speech_commands_v0.02.tar.gz")
        if not os.path.exists(tar_location):
            download_file(GSC_URL, tar_location, unpack=True)
        else:
            logger.info("Extracting speech_commands_v0.02.tar.gz...")
            shutil.unpack_archive(tar_location, data_folder)

    # Define the words that we do not want to identify
    unknown_words = list(np.setdiff1d(all_words, words_wanted))

    # All metadata fields to appear within our dataset annotation files
    # (i.e. train.csv, valid.csv, test.csv)
    fields = {
        "ID": [],
        "duration": [],
        "start": [],
        "stop": [],
        "wav": [],
        "spk_id": [],
        "command": [],
        "transcript": [],
    }

    splits = {
        "train": copy.deepcopy(fields),
        "valid": copy.deepcopy(fields),
        "test": copy.deepcopy(fields),
    }

    num_known_samples_per_split = {"train": 0, "valid": 0, "test": 0}
    words_wanted_parsed = False
    unknown_keep_probability = None
    commands = words_wanted + unknown_words
    for i, command in enumerate(commands):
        # Wanted words come first in `commands`; once they are all parsed,
        # derive (once) the fraction of unknown samples to retain.
        if i >= len(words_wanted) and not words_wanted_parsed:
            num_known_samples_total = sum(num_known_samples_per_split.values())
            # 105829 is presumably the total number of utterances in
            # GSC v0.02 — TODO confirm.
            num_unknown_samples_total = 105829 - num_known_samples_total
            percentage_applied_to_unknown_samples = (
                percentage_unknown * num_known_samples_total
            ) / num_unknown_samples_total
            unknown_keep_probability = (
                percentage_applied_to_unknown_samples / 100
            )
            words_wanted_parsed = True

        # Hoisted: the label decision is the same for every file of a command
        is_known = command in words_wanted

        # Read all files directly under a specific class (i.e. command);
        # a missing folder simply yields no files.
        _, _, files = next(
            walk(os.path.join(data_folder, command)), (None, None, [])
        )

        # Fill in all fields with metadata for each audio sample file under a specific class
        for filename in files:
            # Once all wanted words are parsed, only retain the required
            # percentage of unknown words. The random draw only happens in
            # that phase, so the RNG stream matches the seed set by the caller.
            if (
                words_wanted_parsed
                and torch.rand(1)[0].tolist() > unknown_keep_probability
            ):
                continue

            # select the required split (i.e. set) for the sample
            split = which_set(
                filename, validation_percentage, testing_percentage
            )
            entry = splits[split]

            # Strip only a literal trailing ".wav" extension
            entry["ID"].append(command + "/" + re.sub(r"\.wav$", "", filename))

            # We know that all recordings are 1 second long (i.e.16000 frames). No need to compute the duration.
            entry["duration"].append(1.0)
            entry["start"].append(0)
            entry["stop"].append(16000)

            entry["wav"].append(os.path.join(data_folder, command, filename))

            # The speaker id is the part of the file name before the first "_"
            entry["spk_id"].append(re.sub(r"_.*", "", filename))

            if is_known:
                entry["command"].append(command)
                num_known_samples_per_split[split] += 1
            else:
                entry["command"].append("unknown")

            entry["transcript"].append(command)

    if percentage_silence > 0:
        generate_silence_data(
            num_known_samples_per_split,
            splits,
            data_folder,
            percentage_silence=percentage_silence,
        )

    # Make sure the destination exists before writing the manifests
    os.makedirs(save_folder, exist_ok=True)
    for split in splits:
        new_filename = os.path.join(save_folder, split) + ".csv"
        new_df = pd.DataFrame(splits[split])
        new_df.to_csv(new_filename, index=False)


MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M


def which_set(filename, validation_percentage, testing_percentage):
    """Picks the data partition a file belongs to, based only on its name.

    The assignment is derived from a SHA-1 hash of the file name, so it
    depends only on the name and on the requested proportions. A file
    therefore keeps its partition when other files are added later, which
    makes it less likely that test samples leak into training across runs.

    Everything from '_nohash_' onwards is dropped before hashing. This lets
    the dataset creator group close variations of a recording: for example
    'bobby_nohash_0.wav' and 'bobby_nohash_1.wav' always end up in the same
    partition.

    Arguments
    ---------
    filename: path
        File path of the data sample (only its base name is used).
    validation_percentage: int
        How much of the data set to use for validation.
    testing_percentage: int
        How much of the data set to use for testing.

    Returns
    -------
    result: str
        one of 'train', 'valid', or 'test'.
    """
    # Only the part of the base name before '_nohash_' takes part in the hash
    grouping_key = re.sub(r"_nohash_.*$", "", os.path.basename(filename))
    digest = hashlib.sha1(grouping_key.encode("utf-8")).hexdigest()

    # Fold the (huge) hash value into a stable pseudo-percentage
    bucket = int(digest, 16) % (MAX_NUM_WAVS_PER_CLASS + 1)
    percentage_hash = bucket * (100.0 / MAX_NUM_WAVS_PER_CLASS)

    # Validation takes the lowest slice, testing the next one, training the rest
    if percentage_hash < validation_percentage:
        return "valid"
    if percentage_hash < (testing_percentage + validation_percentage):
        return "test"
    return "train"


def generate_silence_data(
    num_known_samples_per_split, splits, data_folder, percentage_silence=26
):
    """Generates silence samples.

    One-second (16000-frame) windows are cut at random offsets from the wav
    files found in `<data_folder>/_background_noise_` and appended, in place,
    to every split in `splits` with the command label "silence".

    Arguments
    ---------
    num_known_samples_per_split: dict
        Number of samples of known words for each split (i.e. set), keyed by
        split name.
    splits: dict
        Annotation fields of each split, keyed by split name. Each value maps
        a field name ("ID", "duration", "start", "stop", "wav", "spk_id",
        "command", "transcript") to a list that is extended in place.
    data_folder: str
        path to dataset.
    percentage_silence: int
        How many silence samples to generate; relative to the total number of known words.

    Raises
    ------
    FileNotFoundError
        If no background-noise wav file is found under `data_folder`.
    """
    # Fetch all background noise wav files used to generate silence samples.
    # The listing does not depend on the split, so it is done only once.
    search_path = os.path.join(data_folder, "_background_noise_", "*.wav")
    silence_paths = glob.glob(search_path)
    if not silence_paths:
        # Without this guard the per-file sample count below divides by zero
        raise FileNotFoundError(
            "No background noise files found at " + search_path
        )

    # Only the length of each noise recording is needed; read each file once
    # instead of once per split. Reading does not draw random numbers, so the
    # sequence of random offsets below is unaffected.
    signal_lengths = [read_audio(path).shape[0] for path in silence_paths]

    # IDs are built from the path relative to the "_background_noise_" folder,
    # with the literal trailing ".wav" extension removed.
    id_prefixes = [
        re.sub(
            r"\.wav$",
            "",
            re.sub(r".+?(?=_background_noise_)", "", path),
        )
        for path in silence_paths
    ]

    for split in splits:
        num_silence_samples = int(
            (percentage_silence / 100.0) * num_known_samples_per_split[split]
        )

        # Generate random silence samples
        # Assumes that the pytorch seed has been defined in the HyperPyYaml file
        num_silence_samples_per_path = int(
            num_silence_samples / len(silence_paths)
        )
        entry = splits[split]
        for silence_path, num_frames, id_prefix in zip(
            silence_paths, signal_lengths, id_prefixes
        ):
            # Random start frames chosen so that a 16000-frame window fits
            random_starts = (
                (torch.rand(num_silence_samples_per_path) * (num_frames - 16001))
                .type(torch.int)
                .tolist()
            )

            for i, random_start in enumerate(random_starts):
                entry["ID"].append(
                    id_prefix + "/" + str(random_start) + "_" + str(i)
                )
                entry["duration"].append(1.0)
                entry["start"].append(random_start)
                entry["stop"].append(random_start + 16000)
                entry["wav"].append(silence_path)
                entry["spk_id"].append(None)
                entry["command"].append("silence")
                entry["transcript"].append(None)


Writing prepare_GSC.py


# Code to generate plots

In [None]:
import matplotlib.pyplot as plt

def plot(train_loss, valid_loss, error_rate):
    """
    Plots two graphs side by side: one for train loss and validation loss vs epochs, and the other for error rate vs epochs.

    Each series is drawn against a 1-based epoch axis derived from its own
    length (epoch 1 is the first value).

    Parameters:
    train_loss (list: floats): The training loss values for each epoch.
    valid_loss (list: floats): The validation loss values for each epoch.
    error_rate (list: floats): The error rate values for each epoch, as fractions (they are shown as percentages).

    Returns:
    None
    """

    def epoch_axis(values):
        # x must have one entry per y value; a bare epoch count (a scalar)
        # makes plt.plot fail with a shape mismatch.
        return range(1, len(values) + 1)

    # Convert error rate to percentage
    error_rate = [v * 100 for v in error_rate]

    # Plotting both graphs side by side
    plt.figure(figsize=(14, 5))

    # Plotting the first graph (train loss vs validation loss)
    plt.subplot(1, 2, 1)
    plt.plot(epoch_axis(train_loss), train_loss, marker='o', label='Train Loss')
    plt.plot(epoch_axis(valid_loss), valid_loss, marker='o', label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Train Loss vs Validation Loss')
    plt.legend()

    # Plotting the second graph (error rate)
    plt.subplot(1, 2, 2)
    plt.plot(epoch_axis(error_rate), error_rate, marker='o', color='red')
    plt.xlabel('Epoch')
    plt.ylabel('Error Rate (%)')
    plt.title('Error Rate')

    # Adjust layout to prevent overlap
    plt.tight_layout()

    # Show the plot
    plt.show()

# Example usage
# plot([0.1,0.2,0.4], [0.2,0.3,0.5], [0.11, 0.432, 0.10])


# CONTINUOUS FEATURES: FBANKS

**Continuous features are real-valued features extracted from audio signals. Because they are continuous, they carry fine-grained information that is useful for the downstream task, and they perform very well even with a simple classifier.**

**Why Fbanks?
There are many reasons why Fbanks are used in speech recognition. They are simple and intuitive, and robust enough to still be in use today. They greatly downsample the audio data for efficient processing. Most importantly, they mimic the human ear's perception of audio, targeting the low frequencies and maximising germane information. Hence, they are a good feature extractor for keyword spotting.**

### hparams

In [None]:
%%file hparams_xvector_fbanks.yaml

# ################################
# Model: Classification with xvector
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/xvect_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 20
batch_size: 32
lr: 0.001
lr_final: 0.0001

sample_rate: 16000
shuffle: True


# Feature parameters
n_mels: 40 #27
left_frames: 0
right_frames: 0
deltas: False

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
compute_features: !new:speechbrain.lobes.features.Fbank
    n_mels: !ref <n_mels>
    left_frames: !ref <left_frames>
    right_frames: !ref <right_frames>
    deltas: !ref <deltas>

embedding_model: !new:speechbrain.lobes.models.Xvector.Xvector
    in_channels: !ref <n_mels>
    activation: !name:torch.nn.LeakyReLU
    tdnn_blocks: 5
    tdnn_channels: [512, 512, 512, 512, 1500]
    tdnn_kernel_sizes: [5, 3, 3, 1, 1]
    tdnn_dilations: [1, 2, 3, 1, 1]
    lin_neurons: 512

classifier: !new:speechbrain.lobes.models.Xvector.Classifier
    input_shape: [null, null, 512]
    activation: !name:torch.nn.LeakyReLU
    lin_blocks: 1
    lin_neurons: 512
    out_neurons: !ref <out_n_neurons>

softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

modules:
    compute_features: !ref <compute_features>
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    softmax: !ref <softmax>
    mean_var_norm: !ref <mean_var_norm>


# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

lr_annealing: !new:speechbrain.nnet.schedulers.LinearScheduler
    initial_value: !ref <lr>
    final_value: !ref <lr_final>
    epoch_count: !ref <number_of_epochs>

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        embedding_model: !ref <embedding_model>
        classifier: !ref <classifier>
        normalizer: !ref <mean_var_norm>
        counter: !ref <epoch_counter>

Overwriting hparams_xvector_fbanks.yaml


#### train

In [None]:
%%file train_fbanks.py

#!/usr/bin/python3
"""Recipe for training a classifier using the
Google Speech Commands v0.02 Dataset.

To run this recipe, use the following command:
> python train.py {hyperparameter_file}

Using your own hyperparameter file or one of the following:
    hyperparams/xvect.yaml (xvector system)

Author
    * Mirco Ravanelli 2020
    * Hwidong Na 2020
    * Nauman Dawalatabad 2020
    * Sarthak Yadav 2022
    Script adapted by David Raby-Pepin 2021
"""
import os
import sys
import torch
import torchaudio
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml

import speechbrain.nnet.CNN
from speechbrain.utils.distributed import run_on_main


class SpeakerBrain(sb.core.Brain):
    """Class for GSC training: encoder + command classifier."""

    def compute_forward(self, batch, stage):
        """Computation pipeline based on a encoder + command classifier.
        Data augmentation and environmental corruption are optionally applied to the
        input speech.

        Arguments
        ---------
        batch : batch object providing `.to(device)` and a `sig` field
            `sig` unpacks into the waveforms and their lengths.
        stage : sb.Stage
            Current stage; augmentation is only applied for sb.Stage.TRAIN.

        Returns
        -------
        outputs, lens
            The classifier outputs (passed through `softmax` when that module
            is defined) and the lengths returned by the augmentation step (or
            the original ones when no augmentation is applied).
        """
        batch = batch.to(self.device)
        wavs, lens = batch.sig

        # Add waveform augmentation if specified.
        if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"):
            wavs, lens = self.hparams.wav_augment(wavs, lens)

        if isinstance(
            self.modules.compute_features, speechbrain.lobes.features.Leaf
        ):
            # if leaf, first normalize the wavs before feeding them to leaf
            # no normalization is needed after LEAF
            feats = self.modules.mean_var_norm(wavs, lens)
            feats = self.modules.compute_features(feats)
        else:
            # Feature extraction and normalization
            feats = self.modules.compute_features(wavs)
            feats = self.modules.mean_var_norm(feats, lens)

        # Embeddings + classifier
        embeddings = self.modules.embedding_model(feats)
        outputs = self.modules.classifier(embeddings)

        # Ecapa model uses softmax outside of its classifier
        if "softmax" in self.modules.keys():
            outputs = self.modules.softmax(outputs)

        return outputs, lens

    def compute_objectives(self, predictions, batch, stage):
        """Computes the loss using command-id as label.

        Arguments
        ---------
        predictions : tuple
            The (outputs, lens) pair returned by `compute_forward`.
        batch : batch object
            Provides the utterance ids (`id`) and the `command_encoded` labels.
        stage : sb.Stage
            Current stage; error metrics are only collected outside training.

        Returns
        -------
        loss
            The value returned by `hparams.compute_cost`.
        """
        predictions, lens = predictions
        uttid = batch.id
        command, _ = batch.command_encoded

        # Concatenate labels (due to data augmentation)
        if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"):
            command = self.hparams.wav_augment.replicate_labels(command)

        # compute the cost function
        loss = self.hparams.compute_cost(predictions, command, lens)

        # Per-batch schedulers must only advance on training batches;
        # stepping them during validation/test would move the learning-rate
        # schedule without any optimizer update.
        if stage == sb.Stage.TRAIN and hasattr(
            self.hparams.lr_annealing, "on_batch_end"
        ):
            self.hparams.lr_annealing.on_batch_end(self.optimizer)

        if stage != sb.Stage.TRAIN:
            self.error_metrics.append(uttid, predictions, command, lens)

        return loss

    def on_stage_start(self, stage, epoch=None):
        """Gets called at the beginning of an epoch.

        Creates a fresh error-metric tracker for the validation and test stages.
        """
        if stage != sb.Stage.TRAIN:
            self.error_metrics = self.hparams.error_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """Gets called at the end of an epoch.

        Stores the training stats, and for validation anneals the learning
        rate, logs the epoch and saves a checkpoint keyed on the error rate.
        For the test stage only the stats are logged.
        """
        # Compute/store important stats
        stage_stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            self.train_stats = stage_stats
        else:
            stage_stats["ErrorRate"] = self.error_metrics.summarize("average")

        # Perform end-of-iteration things, like annealing, logging, etc.
        if stage == sb.Stage.VALID:
            old_lr, new_lr = self.hparams.lr_annealing(epoch)
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)

            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch, "lr": old_lr},
                train_stats=self.train_stats,
                valid_stats=stage_stats,
            )
            # Keep only the checkpoint with the lowest validation error rate
            self.checkpointer.save_and_keep_only(
                meta={"ErrorRate": stage_stats["ErrorRate"]},
                min_keys=["ErrorRate"],
            )

        # We also write statistics about test data to stdout and to the logfile.
        if stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                {"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stage_stats,
            )


def dataio_prep(hparams):
    """Creates the datasets and their data processing pipelines.

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. Reads "data_folder", "train_annotation",
        "valid_annotation", "test_annotation" and "save_folder".

    Returns
    -------
    train_data, valid_data, test_data, label_encoder
        The three datasets built from the annotation CSV files and the
        encoder used to turn command labels into indices.
    """
    root = hparams["data_folder"]

    def load_split(annotation_key):
        # Build one dataset from the CSV file referenced by `annotation_key`
        return sb.dataio.dataset.DynamicItemDataset.from_csv(
            csv_path=hparams[annotation_key],
            replacements={"data_root": root},
        )

    # 1. Declarations:
    train_data = load_split("train_annotation")
    valid_data = load_split("valid_annotation")
    test_data = load_split("test_annotation")

    all_splits = [train_data, valid_data, test_data]
    label_encoder = sb.dataio.encoder.CategoricalEncoder()

    # 2. Define audio pipeline:
    @sb.utils.data_pipeline.takes("wav", "start", "stop", "duration")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav, start, stop, duration):
        # Load only the [start, stop) frame window of the file
        first_frame = int(start)
        frame_count = int(stop) - first_frame
        waveform, _ = torchaudio.load(
            wav, num_frames=frame_count, frame_offset=first_frame
        )
        return waveform.transpose(0, 1).squeeze(1)

    sb.dataio.dataset.add_dynamic_item(all_splits, audio_pipeline)

    # 3. Define text pipeline:
    @sb.utils.data_pipeline.takes("command")
    @sb.utils.data_pipeline.provides("command", "command_encoded")
    def label_pipeline(command):
        yield command
        yield label_encoder.encode_sequence_torch([command])

    sb.dataio.dataset.add_dynamic_item(all_splits, label_pipeline)

    # 4. Fit encoder:
    # Load or compute the label encoder (with multi-GPU DDP support)
    encoder_path = os.path.join(hparams["save_folder"], "label_encoder.txt")
    label_encoder.load_or_create(
        path=encoder_path,
        from_didatasets=[train_data],
        output_key="command",
    )

    # 5. Set output:
    sb.dataio.dataset.set_output_keys(
        all_splits, ["id", "sig", "command_encoded"]
    )

    return train_data, valid_data, test_data, label_encoder


if __name__ == "__main__":
    # Let cudnn auto-tune its kernels
    torch.backends.cudnn.benchmark = True

    # Parse the command line: hyperparameter file, run options, overrides
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    # Initialize ddp (useful only for multi-GPU DDP training)
    sb.utils.distributed.ddp_init_group(run_opts)

    # Load hyperparameters file with command-line overrides
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Create experiment directory
    sb.core.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # Dataset prep (parsing GSC and annotation into csv files)
    from prepare_GSC import prepare_GSC

    # The V2 12 task uses the ten core commands (the "unknown" and "silence"
    # classes are added during preparation); the V2 35 task additionally
    # treats every auxiliary word as a known class.
    core_words = [
        "yes",
        "no",
        "up",
        "down",
        "left",
        "right",
        "on",
        "off",
        "stop",
        "go",
    ]
    auxiliary_words = [
        "zero",
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "bed",
        "bird",
        "cat",
        "dog",
        "happy",
        "house",
        "marvin",
        "sheila",
        "tree",
        "wow",
        "backward",
        "forward",
        "follow",
        "learn",
        "visual",
    ]

    number_of_commands = hparams["number_of_commands"]
    if number_of_commands == 12:
        words_wanted = core_words
    elif number_of_commands == 35:
        words_wanted = core_words + auxiliary_words
    else:
        raise ValueError("number_of_commands must be 12 or 35")

    # Data preparation: most arguments are forwarded verbatim from hparams
    prepare_kwargs = {
        key: hparams[key]
        for key in (
            "data_folder",
            "validation_percentage",
            "testing_percentage",
            "percentage_unknown",
            "percentage_silence",
            "skip_prep",
        )
    }
    prepare_kwargs["save_folder"] = hparams["output_folder"]
    prepare_kwargs["words_wanted"] = words_wanted
    run_on_main(prepare_GSC, kwargs=prepare_kwargs)

    # Download/prepare the noise and impulse-response data for augmentation
    for preparation_key in ("prepare_noise_data", "prepare_rir_data"):
        sb.utils.distributed.run_on_main(hparams[preparation_key])

    # Dataset IO prep: creating Dataset objects and the command label encoder
    train_data, valid_data, test_data, label_encoder = dataio_prep(hparams)

    # Brain class initialization
    speaker_brain = SpeakerBrain(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    # Training
    loader_options = hparams["dataloader_options"]
    speaker_brain.fit(
        speaker_brain.hparams.epoch_counter,
        train_data,
        valid_data,
        train_loader_kwargs=loader_options,
        valid_loader_kwargs=loader_options,
    )

    # Load the best checkpoint (lowest ErrorRate) for evaluation
    test_stats = speaker_brain.evaluate(
        test_set=test_data,
        min_key="ErrorRate",
        test_loader_kwargs=loader_options,
    )

Overwriting train_fbanks.py


#### Run

In [None]:
!python train_fbanks.py hparams_xvector_fbanks.yaml --data_folder=/path/to/GSC

  warn(f"Failed to load image Python extension: {e}")
speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: results/xvect_v12/1986
prepare_GSC - Extracting speech_commands_v0.02.tar.gz...
Downloading https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 to /path/to/GSC/noise/data.zip
noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1: 569MB [00:05, 108MB/s]         
Extracting /path/to/GSC/noise/data.zip to /path/to/GSC/noise
Downloading https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1 to /path/to/GSC/rir/data.zip
RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1: 246MB [00:04, 51.6MB/s]          
Extracting /path/to/GSC/rir/data.zip to /path/to/GSC/rir
speechbrain.dataio.encoder - Load called, but CategoricalEncoder is not empty. Loaded data will overwrite everything. This is normal if there is e.g. an unk label defined at init.
speechbrain.core - Info: ckpt_interval_minu

#### Visualization

**Fbanks do a very good job of extracting and downsizing features from the audio signal, and the model neither overfits nor underfits the data. This only strengthens the case for continuous features in speech-related tasks. The error rate dips to very low values and reaches saturation near the end of the 20 epochs. The final test error rate is 2%, which is excellent.**

![image.png](attachment:4b1a1d4f-d152-4f37-baa6-bde4a0ec59fb.png)

# CONTINUOUS SELF-SUPERVISED FEATURES: WAV2VEC, HuBERT AND WAVLM

## WAV2VEC

**What is Wav2Vec and why use it?
Wav2Vec is a self-supervised framework for speech recognition. It performs exceedingly well on speech tasks, especially keyword spotting. Firstly, all patterns are learned and require no handcrafted features such as filterbanks, which makes it robust. Moreover, it is pre-trained on large amounts of unlabelled audio data, which helps it generalize and achieve superior performance in downstream tasks.**


**Fine-tuning the Conv layers has little to no effect. However, fine-tuning the remaining layers of Wav2Vec improves the performance of the system.**

#### train

In [None]:
%%file train_wav2vec.py

#!/usr/bin/python3
"""Recipe for training a classifier using the
Google Speech Commands v0.02 Dataset.

To run this recipe, use the following command:
> python train.py {hyperparameter_file}

Using your own hyperparameter file or one of the following:
    hyperparams/xvect.yaml (xvector system)

Author
    * Mirco Ravanelli 2020
    * Hwidong Na 2020
    * Nauman Dawalatabad 2020
    * Sarthak Yadav 2022
    Script adapted by David Raby-Pepin 2021
"""
import os
import sys
import torch
import torchaudio
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml

import speechbrain.nnet.CNN
from speechbrain.utils.distributed import run_on_main


class SpeakerBrain(sb.core.Brain):
    """Brain for GSC command classification on top of an SSL encoder.

    The forward pass feeds the waveforms to ``modules.ssl_model``, pools the
    encoder output with ``hparams.avg_pool`` and classifies the pooled vector
    with ``modules.output_mlp``. Two optimizers are maintained: one for the
    SSL encoder and one for the classifier (``hparams.model``).
    """

    def compute_forward(self, batch, stage):
        """Compute the command log-probabilities for a batch.

        Arguments
        ---------
        batch : object
            Batch exposing ``.to(device)`` and ``.sig`` as a ``(wavs, lens)``
            pair.
        stage : sb.Stage
            Current stage; not used, the same pipeline runs in every stage.

        Returns
        -------
        tuple
            ``(outputs, lens)`` where ``outputs`` is what ``hparams.log_softmax``
            returns for the classifier output and ``lens`` comes from the batch.
        """
        batch = batch.to(self.device)
        wavs, lens = batch.sig

        # NOTE: waveform augmentation (hparams.wav_augment) is not applied in
        # this recipe. If it is enabled here, the labels must be replicated
        # accordingly in compute_objectives.

        # Encode the waveforms with the SSL model.
        outputs = self.modules.ssl_model(wavs, lens)

        # Pool the encoder output with hparams.avg_pool, then flatten
        # everything after the batch dimension into a single vector.
        outputs = self.hparams.avg_pool(outputs, lens)
        outputs = outputs.view(outputs.shape[0], -1)

        outputs = self.modules.output_mlp(outputs)
        outputs = self.hparams.log_softmax(outputs)
        return outputs, lens

    def compute_objectives(self, predictions, batch, stage):
        """Compute the loss using the encoded command id as label.

        Arguments
        ---------
        predictions : tuple
            The ``(outputs, lens)`` pair returned by ``compute_forward``.
        batch : object
            Batch exposing ``.id`` and ``.command_encoded``.
        stage : sb.Stage
            Current stage; error statistics are only collected outside TRAIN.

        Returns
        -------
        The value returned by ``hparams.compute_cost``.
        """
        predictions, _ = predictions
        uttid = batch.id
        command, _ = batch.command_encoded

        # Drop axis 1 of the labels so they line up with the per-utterance
        # predictions (presumably the labels are (batch, 1) -- each command is
        # encoded as a one-element sequence in dataio_prep).
        command = command.squeeze(1)
        loss = self.hparams.compute_cost(predictions, command)

        # Per-batch schedulers must only advance while training; stepping
        # them on validation/test batches would distort the LR schedule.
        if stage == sb.Stage.TRAIN and hasattr(
            self.hparams.lr_annealing, "on_batch_end"
        ):
            self.hparams.lr_annealing.on_batch_end(self.optimizer)

        if stage != sb.Stage.TRAIN:
            self.error_metrics.append(uttid, predictions, command)

        return loss

    def on_stage_start(self, stage, epoch=None):
        """Create a fresh error tracker at the start of a non-training stage."""
        if stage != sb.Stage.TRAIN:
            self.error_metrics = self.hparams.error_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """Summarize the stage, then anneal, log and checkpoint as needed.

        Arguments
        ---------
        stage : sb.Stage
            The stage that just finished.
        stage_loss : float
            Loss reported for the stage.
        epoch : int
            Current epoch, written to the training log after validation.
        """
        # Compute/store important stats
        stage_stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            # Kept so it can be logged together with the validation stats.
            self.train_stats = stage_stats
        else:
            stage_stats["ErrorRate"] = self.error_metrics.summarize("average")

        # After validation: anneal both learning rates, log, checkpoint.
        if stage == sb.Stage.VALID:
            # Each scheduler is driven by the validation error rate and
            # returns the (old, new) learning-rate pair.
            old_lr, new_lr = self.hparams.lr_annealing(stage_stats["ErrorRate"])
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)

            (
                old_lr_ssl,
                new_lr_ssl,
            ) = self.hparams.lr_annealing_ssl(stage_stats["ErrorRate"])
            sb.nnet.schedulers.update_learning_rate(
                self.ssl_optimizer, new_lr_ssl
            )

            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch, "lr": old_lr, "ssl_lr": old_lr_ssl},
                train_stats=self.train_stats,
                valid_stats=stage_stats,
            )
            # Checkpoints are ranked by validation ErrorRate (lower is better).
            self.checkpointer.save_and_keep_only(
                meta={"ErrorRate": stage_stats["ErrorRate"]},
                min_keys=["ErrorRate"],
            )

        # We also write statistics about test data to stdout and to the logfile.
        if stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                {"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stage_stats,
            )

    def init_optimizers(self):
        """Initialize the SSL-encoder optimizer and the classifier optimizer.

        Both optimizers are registered with the checkpointer (when one is
        set) and stored in ``self.optimizers_dict``.
        """
        self.ssl_optimizer = self.hparams.ssl_opt_class(
            self.modules.ssl_model.parameters()
        )
        self.optimizer = self.hparams.opt_class(self.hparams.model.parameters())

        if self.checkpointer is not None:
            self.checkpointer.add_recoverable("ssl_opt", self.ssl_optimizer)
            self.checkpointer.add_recoverable("optimizer", self.optimizer)

        self.optimizers_dict = {
            "model_optimizer": self.optimizer,
            "ssl_optimizer": self.ssl_optimizer,
        }


def dataio_prep(hparams):
    """Build the train/valid/test datasets and attach their pipelines.

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. The keys read here are ``data_folder``,
        ``train_annotation``, ``valid_annotation``, ``test_annotation`` and
        ``save_folder``.

    Returns
    -------
    tuple
        ``(train_data, valid_data, test_data, label_encoder)``.
    """
    root = hparams["data_folder"]

    # 1. One dataset per split, each read from its CSV annotation file with
    #    the `data_root` replacement pointing at the data folder.
    train_data, valid_data, test_data = (
        sb.dataio.dataset.DynamicItemDataset.from_csv(
            csv_path=hparams[annotation_key],
            replacements={"data_root": root},
        )
        for annotation_key in (
            "train_annotation",
            "valid_annotation",
            "test_annotation",
        )
    )
    datasets = [train_data, valid_data, test_data]

    label_encoder = sb.dataio.encoder.CategoricalEncoder()

    # 2. Audio pipeline: load only the [start, stop) frames of the file.
    @sb.utils.data_pipeline.takes("wav", "start", "stop", "duration")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav, start, stop, duration):
        first_frame = int(start)
        n_frames = int(stop) - first_frame
        waveform, _ = torchaudio.load(
            wav, num_frames=n_frames, frame_offset=first_frame
        )
        # Swap the two axes returned by torchaudio and squeeze axis 1.
        return waveform.transpose(0, 1).squeeze(1)

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Label pipeline: expose the raw command and its encoded form.
    @sb.utils.data_pipeline.takes("command")
    @sb.utils.data_pipeline.provides("command", "command_encoded")
    def label_pipeline(command):
        yield command
        yield label_encoder.encode_sequence_torch([command])

    sb.dataio.dataset.add_dynamic_item(datasets, label_pipeline)

    # 4. Fit the encoder: load it from the save folder if it is already
    #    there, otherwise create it from the training commands.
    encoder_path = os.path.join(hparams["save_folder"], "label_encoder.txt")
    label_encoder.load_or_create(
        path=encoder_path,
        from_didatasets=[train_data],
        output_key="command",
    )

    # 5. Select the keys each batch will provide.
    output_keys = ["id", "sig", "command_encoded"]
    sb.dataio.dataset.set_output_keys(datasets, output_keys)

    return train_data, valid_data, test_data, label_encoder


if __name__ == "__main__":
    # Turn on the built-in cuDNN auto-tuner.
    torch.backends.cudnn.benchmark = True

    # Command line: hyperparameter file, run options and overrides.
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    # Set up the DDP process group (only matters for multi-GPU DDP training).
    sb.utils.distributed.ddp_init_group(run_opts)

    # Read the hyperparameters, applying the command-line overrides.
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Create the experiment directory.
    sb.core.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # Dataset prep (parsing GSC and annotation into csv files)
    from prepare_GSC import prepare_GSC

    # The V2 12 task uses the ten core commands; the V2 35 task uses the
    # core commands followed by the remaining words.
    core_words = [
        "yes",
        "no",
        "up",
        "down",
        "left",
        "right",
        "on",
        "off",
        "stop",
        "go",
    ]
    extra_words = [
        "zero",
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "bed",
        "bird",
        "cat",
        "dog",
        "happy",
        "house",
        "marvin",
        "sheila",
        "tree",
        "wow",
        "backward",
        "forward",
        "follow",
        "learn",
        "visual",
    ]
    n_commands = hparams["number_of_commands"]
    if n_commands not in (12, 35):
        raise ValueError("number_of_commands must be 12 or 35")
    words_wanted = (
        list(core_words) if n_commands == 12 else core_words + extra_words
    )

    # Data preparation: most arguments are passed straight from the hparams.
    prep_kwargs = {
        key: hparams[key]
        for key in (
            "data_folder",
            "validation_percentage",
            "testing_percentage",
            "percentage_unknown",
            "percentage_silence",
            "skip_prep",
        )
    }
    prep_kwargs["save_folder"] = hparams["output_folder"]
    prep_kwargs["words_wanted"] = words_wanted
    run_on_main(prepare_GSC, kwargs=prep_kwargs)

    # Run the noise and RIR dataset preparation functions from the hparams.
    for prep_key in ("prepare_noise_data", "prepare_rir_data"):
        run_on_main(hparams[prep_key])

    # Dataset IO prep: create the Dataset objects and the command encoder.
    train_data, valid_data, test_data, label_encoder = dataio_prep(hparams)

    # Brain class initialization
    speaker_brain = SpeakerBrain(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    # Training; train and validation loaders share the same options.
    loader_options = hparams["dataloader_options"]
    speaker_brain.fit(
        speaker_brain.hparams.epoch_counter,
        train_data,
        valid_data,
        train_loader_kwargs=loader_options,
        valid_loader_kwargs=loader_options,
    )

    # Evaluate on the test set, selecting the checkpoint by minimum ErrorRate.
    test_stats = speaker_brain.evaluate(
        test_set=test_data,
        min_key="ErrorRate",
        test_loader_kwargs=hparams["dataloader_options"],
    )

Overwriting train_wav2vec.py


### hparams - freeze all the layers of Wav2Vec

In [None]:
%%file hparams_wav2vec.yaml

# ################################
# Model: Classification with a wav2vec 2.0 encoder + linear classifier
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/wav2vec_3_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# HuggingFace hub ID of the SSL model; change it to benchmark different models
# Important: we use wav2vec2 base and not the one fine-tuned on the ASR task
# This allows you to have ~4% improvement
sslmodel_hub: facebook/wav2vec2-base
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 20
batch_size: 32
lr: 0.0001
lr_ssl: 0.00001
encoder_dim: 768

sample_rate: 16000
shuffle: True

#freeze all ssl
freeze_ssl: True
# Set to True to freeze the convolutional feature extractor of the SSL model
# We see an improvement of 2% when freezing the CNN layers
freeze_ssl_conv: True

# # Feature parameters
# n_mels: 24
# left_frames: 0
# right_frames: 0
# deltas: False

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
    source: !ref <sslmodel_hub>
    output_norm: True
    freeze: !ref <freeze_ssl>
    freeze_feature_extractor: !ref <freeze_ssl_conv>
    save_path: !ref <sslmodel_folder>

avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: False

output_mlp: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <encoder_dim>
    n_neurons: !ref <out_n_neurons>
    bias: False

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    output_mlp: !ref <output_mlp>

model: !new:torch.nn.ModuleList
    - [!ref <output_mlp>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Writing hparams_wav2vec.yaml


#### Run

In [None]:
!rm -rf /content/results/wav2vec_3_v12/1986/

!python train_wav2vec.py hparams_wav2vec.yaml --data_folder=/path/to/GSC

#### Visualization

![image.png](attachment:99fb5e74-5235-4666-8863-fadf2c3d4576.png)

**The model does not overfit and generalizes well. The error rate decreases gradually with the epoch count, but it starts to plateau around the 20th epoch. The final error rate on the test set is 16.1%.**

### hparams - fine-tune all the layers of Wav2Vec

In [None]:
%%file hparams_wav2vec.yaml

# ################################
# Model: Classification with a wav2vec 2.0 encoder + linear classifier
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/wav2vec_4_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# HuggingFace hub ID of the SSL model; change it to benchmark different models
# Important: we use wav2vec2 base and not the one fine-tuned on the ASR task
# This allows you to have ~4% improvement
sslmodel_hub: facebook/wav2vec2-base
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 20
batch_size: 32
lr: 0.0001
lr_ssl: 0.00001
encoder_dim: 768

sample_rate: 16000
shuffle: True

#freeze all ssl
freeze_ssl: False
# Set to True to freeze the convolutional feature extractor of the SSL model
# We see an improvement of 2% when freezing the CNN layers
freeze_ssl_conv: False

# # Feature parameters
# n_mels: 24
# left_frames: 0
# right_frames: 0
# deltas: False

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
    source: !ref <sslmodel_hub>
    output_norm: True
    freeze: !ref <freeze_ssl>
    freeze_feature_extractor: !ref <freeze_ssl_conv>
    save_path: !ref <sslmodel_folder>

avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: False

output_mlp: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <encoder_dim>
    n_neurons: !ref <out_n_neurons>
    bias: False

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    output_mlp: !ref <output_mlp>

model: !new:torch.nn.ModuleList
    - [!ref <output_mlp>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Overwriting hparams_wav2vec.yaml


#### Run

In [None]:
!rm -rf /content/results/wav2vec_4_v12/1986/

!python train_wav2vec.py hparams_wav2vec.yaml --data_folder=/path/to/GSC

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  warn(f"Failed to load image Python extension: {e}")
speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: results/wav2vec_4_v12/1986
prepare_GSC - Extracting speech_commands_v0.02.tar.gz...
Downloading https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 to /path/to/GSC/noise/data.zip
noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1: 569MB [00:13, 42.5MB/s]        
Extracting /path/to/GSC/noise/data.zip to /path/to/GSC/noise
Downloading https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&d

#### Visualization

**The model seems to overfit slightly on the data even though we use just a simple linear layer as the classifier. This is because we are fine-tuning all the layers of Wav2Vec for the downstream task on a relatively small dataset; with more data, this problem could be mitigated. Ideally, training would have been stopped at epoch 5 or 6, but for clarity and understanding we continue to train for 20 epochs. The error rate does not decline smoothly, but it keeps decreasing nonetheless. Through fine-tuning, the model reaches an error rate of 1.65% on the test set, which demonstrates the effectiveness of Wav2Vec.**

![image.png](attachment:cb89a37c-a031-4db2-b26e-545f9042b119.png)

**It can be observed that fine-tuning Wav2Vec improves performance compared to the model without fine-tuning. Caveat: fine-tuning can cause overfitting and should be done in moderation.**

![image.png](attachment:a1321def-5a52-4c40-824f-73bc40f1bfd5.png)

## HuBERT

**What is HuBERT and why use it?**
**HuBERT is very similar to Wav2Vec in terms of architecture. It performs some additional processing steps before training, such as k-means clustering on fbank features. Like Wav2Vec, it performs very well on speech tasks. It is a strong competitor to Wav2Vec, hence we experiment with it here.**

**Fine-tuning the Conv layers has little to no effect. However, fine-tuning the remaining layers of HuBERT improves the performance of the system.**

#### train

In [None]:
%%file train_hubert.py

#!/usr/bin/python3
"""Recipe for training a classifier using the
Google Speech Commands v0.02 Dataset.

To run this recipe, use the following command:
> python train_hubert.py {hyperparameter_file}

Using your own hyperparameter file or one of the following:
    hyperparams/xvect.yaml (xvector system)

Author
    * Mirco Ravanelli 2020
    * Hwidong Na 2020
    * Nauman Dawalatabad 2020
    * Sarthak Yadav 2022
    Script adapted by David Raby-Pepin 2021
"""
import os
import sys
import torch
import torchaudio
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml

import speechbrain.nnet.CNN
from speechbrain.utils.distributed import run_on_main


class SpeakerBrain(sb.core.Brain):
    """Brain subclass for GSC command classification on top of an SSL encoder.

    The forward pass feeds raw waveforms to ``modules.ssl_model``, pools the
    result with ``hparams.avg_pool`` and classifies the pooled vector with
    ``modules.output_mlp``. Two optimizers are maintained: one for the SSL
    encoder and one for the classifier parameters in ``hparams.model``.
    """

    def compute_forward(self, batch, stage):
        """Computation pipeline based on an SSL encoder + command classifier.

        Waveform augmentation is intentionally NOT applied here, even though
        the hyperparameter file defines ``wav_augment``.

        Arguments
        ---------
        batch : object
            Batch exposing ``sig``, which unpacks into ``(wavs, lens)``.
        stage : sb.Stage
            Current stage; not used by this method.

        Returns
        -------
        tuple
            ``(outputs, lens)`` where ``outputs`` is the result of
            ``hparams.log_softmax`` applied to the classifier output and
            ``lens`` is the length tensor taken from ``batch.sig``.
        """
        batch = batch.to(self.device)
        wavs, lens = batch.sig

        # Encode the raw waveforms with the self-supervised model.
        outputs = self.modules.ssl_model(wavs, lens)

        # Pool over time, then flatten everything but the batch dimension so
        # the linear classifier receives one vector per utterance.
        outputs = self.hparams.avg_pool(outputs, lens)
        outputs = outputs.view(outputs.shape[0], -1)

        outputs = self.modules.output_mlp(outputs)
        outputs = self.hparams.log_softmax(outputs)
        return outputs, lens

    def compute_objectives(self, predictions, batch, stage):
        """Computes the loss using the encoded command as label.

        Arguments
        ---------
        predictions : tuple
            The ``(outputs, lens)`` pair returned by ``compute_forward``.
        batch : object
            Batch exposing ``id`` and ``command_encoded``.
        stage : sb.Stage
            Current stage. Error metrics are only accumulated outside of
            training; per-batch LR scheduling only happens during training.

        Returns
        -------
        The value returned by ``hparams.compute_cost``.
        """
        predictions, _ = predictions
        uttid = batch.id
        command, _ = batch.command_encoded

        # Labels are encoded as one-element sequences; drop that dimension so
        # the targets line up with the per-utterance predictions.
        command = command.squeeze(1)
        loss = self.hparams.compute_cost(predictions, command)

        # A scheduler that steps per batch must only advance while training;
        # stepping it during validation/test would distort the LR schedule.
        if stage == sb.Stage.TRAIN and hasattr(
            self.hparams.lr_annealing, "on_batch_end"
        ):
            self.hparams.lr_annealing.on_batch_end(self.optimizer)

        if stage != sb.Stage.TRAIN:
            self.error_metrics.append(uttid, predictions, command)

        return loss

    def on_stage_start(self, stage, epoch=None):
        """Gets called at the beginning of a stage.

        Creates a fresh error-metric tracker for validation and test stages.
        """
        if stage != sb.Stage.TRAIN:
            self.error_metrics = self.hparams.error_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """Gets called at the end of a stage.

        Stores the training stats, and for validation/test summarizes the
        error rate. After validation it also anneals both learning rates,
        logs the epoch and saves a checkpoint ranked by ``ErrorRate``.
        """
        # Compute/store important stats
        stage_stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            self.train_stats = stage_stats
        else:
            stage_stats["ErrorRate"] = self.error_metrics.summarize("average")

        # Perform end-of-iteration things, like annealing, logging, etc.
        if stage == sb.Stage.VALID:
            # Classifier learning rate, driven by the validation error rate.
            old_lr, new_lr = self.hparams.lr_annealing(stage_stats["ErrorRate"])
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)

            # SSL encoder learning rate, annealed by its own scheduler.
            (
                old_lr_ssl,
                new_lr_ssl,
            ) = self.hparams.lr_annealing_ssl(stage_stats["ErrorRate"])
            sb.nnet.schedulers.update_learning_rate(
                self.ssl_optimizer, new_lr_ssl
            )

            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch, "lr": old_lr, "ssl_lr": old_lr_ssl},
                train_stats=self.train_stats,
                valid_stats=stage_stats,
            )
            self.checkpointer.save_and_keep_only(
                meta={"ErrorRate": stage_stats["ErrorRate"]},
                min_keys=["ErrorRate"],
            )

        # We also write statistics about test data to stdout and to the logfile.
        if stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                {"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stage_stats,
            )

    def init_optimizers(self):
        """Initializes the ssl optimizer and model optimizer.

        Both optimizers are registered with the checkpointer (when one is
        set) and exposed through ``self.optimizers_dict``.
        """
        self.ssl_optimizer = self.hparams.ssl_opt_class(
            self.modules.ssl_model.parameters()
        )
        self.optimizer = self.hparams.opt_class(self.hparams.model.parameters())

        if self.checkpointer is not None:
            self.checkpointer.add_recoverable("ssl_opt", self.ssl_optimizer)
            self.checkpointer.add_recoverable("optimizer", self.optimizer)

        self.optimizers_dict = {
            "model_optimizer": self.optimizer,
            "ssl_optimizer": self.ssl_optimizer,
        }


def dataio_prep(hparams):
    """Builds the train/valid/test datasets and wires up their pipelines.

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. Reads ``data_folder``, ``train_annotation``,
        ``valid_annotation``, ``test_annotation`` and ``save_folder``.

    Returns
    -------
    tuple
        ``(train_data, valid_data, test_data, label_encoder)``.
    """
    root = hparams["data_folder"]

    def _from_csv(annotation_key):
        # Each split comes from its own CSV; "data_root" is substituted with
        # the dataset folder.
        return sb.dataio.dataset.DynamicItemDataset.from_csv(
            csv_path=hparams[annotation_key],
            replacements={"data_root": root},
        )

    # 1. Datasets
    train_data = _from_csv("train_annotation")
    valid_data = _from_csv("valid_annotation")
    test_data = _from_csv("test_annotation")
    all_splits = [train_data, valid_data, test_data]

    label_encoder = sb.dataio.encoder.CategoricalEncoder()

    # 2. Audio pipeline: load only the [start, stop) frame range of the file.
    @sb.utils.data_pipeline.takes("wav", "start", "stop", "duration")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav, start, stop, duration):
        first_frame, last_frame = int(start), int(stop)
        waveform, _sample_rate = torchaudio.load(
            wav,
            num_frames=last_frame - first_frame,
            frame_offset=first_frame,
        )
        # Put time first and drop dim 1 when it has size one
        # (presumably the channel axis of mono audio).
        return waveform.transpose(0, 1).squeeze(1)

    sb.dataio.dataset.add_dynamic_item(all_splits, audio_pipeline)

    # 3. Label pipeline: expose the raw command and its encoded form.
    @sb.utils.data_pipeline.takes("command")
    @sb.utils.data_pipeline.provides("command", "command_encoded")
    def label_pipeline(command):
        yield command
        yield label_encoder.encode_sequence_torch([command])

    sb.dataio.dataset.add_dynamic_item(all_splits, label_pipeline)

    # 4. Load the label encoder from disk, or fit it on the training split.
    encoder_path = os.path.join(hparams["save_folder"], "label_encoder.txt")
    label_encoder.load_or_create(
        path=encoder_path,
        from_didatasets=[train_data],
        output_key="command",
    )

    # 5. Keys delivered in every batch.
    sb.dataio.dataset.set_output_keys(
        all_splits, ["id", "sig", "command_encoded"]
    )

    return train_data, valid_data, test_data, label_encoder


if __name__ == "__main__":
    # Turn on the built-in cuDNN auto-tuner.
    torch.backends.cudnn.benchmark = True

    # Parse the command line: hparams file, runtime options, overrides.
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    # Set up the DDP process group (only matters for multi-GPU DDP runs).
    sb.utils.distributed.ddp_init_group(run_opts)

    # Load the hyperparameters, applying any command-line overrides.
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Create the experiment directory and store the hyperparameters in it.
    sb.core.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # Dataset prep (parsing GSC and annotation into csv files)
    from prepare_GSC import prepare_GSC

    # The ten core commands, listed for both the V2 12 and V2 35 tasks.
    core_commands = [
        "yes",
        "no",
        "up",
        "down",
        "left",
        "right",
        "on",
        "off",
        "stop",
        "go",
    ]
    # Extra words that are only listed for the V2 35 task.
    auxiliary_words = [
        "zero",
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "bed",
        "bird",
        "cat",
        "dog",
        "happy",
        "house",
        "marvin",
        "sheila",
        "tree",
        "wow",
        "backward",
        "forward",
        "follow",
        "learn",
        "visual",
    ]

    n_commands = hparams["number_of_commands"]
    if n_commands == 12:
        words_wanted = list(core_commands)
    elif n_commands == 35:
        words_wanted = core_commands + auxiliary_words
    else:
        raise ValueError("number_of_commands must be 12 or 35")

    # Data preparation: most arguments are passed straight from hparams.
    prep_kwargs = {
        key: hparams[key]
        for key in (
            "data_folder",
            "validation_percentage",
            "testing_percentage",
            "percentage_unknown",
            "percentage_silence",
            "skip_prep",
        )
    }
    prep_kwargs["save_folder"] = hparams["output_folder"]
    prep_kwargs["words_wanted"] = words_wanted
    run_on_main(prepare_GSC, kwargs=prep_kwargs)

    # Prepare the noise and RIR datasets declared in the hparams file.
    for prep_key in ("prepare_noise_data", "prepare_rir_data"):
        sb.utils.distributed.run_on_main(hparams[prep_key])

    # Dataset IO prep: Dataset objects plus the command label encoder.
    train_data, valid_data, test_data, label_encoder = dataio_prep(hparams)

    # Brain class initialization
    speaker_brain = SpeakerBrain(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    loader_opts = hparams["dataloader_options"]

    # Training
    speaker_brain.fit(
        speaker_brain.hparams.epoch_counter,
        train_data,
        valid_data,
        train_loader_kwargs=loader_opts,
        valid_loader_kwargs=loader_opts,
    )

    # Evaluate on the test set using the checkpoint with the lowest ErrorRate.
    test_stats = speaker_brain.evaluate(
        test_set=test_data,
        min_key="ErrorRate",
        test_loader_kwargs=loader_opts,
    )

Overwriting train_hubert.py


### hparams - freeze Conv layers and fine-tune the remaining layers of the self-supervised features

In [None]:
%%file hparams_hubert.yaml

# ################################
# Model: Classification with HuBERT (SSL encoder + linear classifier)
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/hubert_1_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# Hugging Face hub ID of the SSL model; you can change it to benchmark different models
# Important: we use the pretrained HuBERT base model and not one fine-tuned on an ASR task
# This allows you to gain ~4% improvement
sslmodel_hub: facebook/hubert-base-ls960
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 5
batch_size: 32
lr: 0.0001
lr_ssl: 0.00001
lr_final: 0.0001
encoder_dim: 768

sample_rate: 16000
shuffle: True

#freeze all ssl
freeze_ssl: False
#set to true to freeze the CONV part of the ssl model
# We see an improvement of 2% with freezing CNNs
freeze_ssl_conv: True

# # Feature parameters
# n_mels: 24
# left_frames: 0
# right_frames: 0
# deltas: False

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
    source: !ref <sslmodel_hub>
    output_norm: True
    freeze: !ref <freeze_ssl>
    freeze_feature_extractor: !ref <freeze_ssl_conv>
    save_path: !ref <sslmodel_folder>

avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: False

output_mlp: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <encoder_dim>
    n_neurons: !ref <out_n_neurons>
    bias: False

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    output_mlp: !ref <output_mlp>

model: !new:torch.nn.ModuleList
    - [!ref <output_mlp>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Overwriting hparams_hubert.yaml


#### Run

In [None]:
# Remove a results directory left over from an earlier run, then launch training.
# NOTE(review): the path below is the encodec results folder, whereas this run
# writes to results/hubert_1_v12/1986 -- confirm which directory should be cleared.
!rm -rf /content/results/encodec_v12/1986/

# --data_folder fills the `data_folder: !PLACEHOLDER` entry of the hparams file.
!python train_hubert.py hparams_hubert.yaml --data_folder=/path/to/GSC

config.json: 100%|██████████████████████████| 1.39k/1.39k [00:00<00:00, 258kB/s]
pytorch_model.bin: 100%|██████████████████████| 378M/378M [00:03<00:00, 111MB/s]
Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.or

### hparams - fine-tune the Conv layers and freeze the remaining layers of the self-supervised features

In [None]:
%%file hparams_hubert.yaml

# ################################
# Model: Classification with HuBERT (SSL encoder + linear classifier)
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/hubert_2_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# Hugging Face hub ID of the SSL model; you can change it to benchmark different models
# Important: we use the pretrained HuBERT base model and not one fine-tuned on an ASR task
# This allows you to gain ~4% improvement
sslmodel_hub: facebook/hubert-base-ls960
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 5
batch_size: 32
lr: 0.0001
lr_ssl: 0.00001
lr_final: 0.0001
encoder_dim: 768

sample_rate: 16000
shuffle: True

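# Freeze the whole SSL model.
# NOTE(review): with freeze_ssl set to True the SpeechBrain wrapper appears to freeze
# every SSL layer, in which case freeze_ssl_conv below is ignored and the Conv layers
# are not actually fine-tuned in this configuration -- confirm against the HuBERT wrapper.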
freeze_ssl: True
#set to true to freeze the CONV part of the ssl model
# We see an improvement of 2% with freezing CNNs
freeze_ssl_conv: False

# # Feature parameters
# n_mels: 24
# left_frames: 0
# right_frames: 0
# deltas: False

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
    source: !ref <sslmodel_hub>
    output_norm: True
    freeze: !ref <freeze_ssl>
    freeze_feature_extractor: !ref <freeze_ssl_conv>
    save_path: !ref <sslmodel_folder>

avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: False

output_mlp: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <encoder_dim>
    n_neurons: !ref <out_n_neurons>
    bias: False

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    output_mlp: !ref <output_mlp>

model: !new:torch.nn.ModuleList
    - [!ref <output_mlp>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Overwriting hparams_hubert.yaml


#### Run

In [None]:
# Remove a results directory left over from an earlier run, then launch training.
# NOTE(review): the path below is the encodec results folder, whereas this run
# writes to results/hubert_2_v12/1986 -- confirm which directory should be cleared.
!rm -rf /content/results/encodec_v12/1986/

# --data_folder fills the `data_folder: !PLACEHOLDER` entry of the hparams file.
!python train_hubert.py hparams_hubert.yaml --data_folder=/path/to/GSC

config.json: 100%|██████████████████████████| 1.39k/1.39k [00:00<00:00, 284kB/s]
pytorch_model.bin: 100%|██████████████████████| 378M/378M [00:02<00:00, 169MB/s]
Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.or

### hparams - freeze all the layers of the self-supervised features

In [None]:
%%file hparams_hubert.yaml

# ################################
# Model: Classification with HuBERT (SSL encoder + linear classifier)
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/hubert_3_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# Hugging Face hub ID of the SSL model; you can change it to benchmark different models
# Important: we use the pretrained HuBERT base model and not one fine-tuned on an ASR task
# This allows you to gain ~4% improvement
sslmodel_hub: facebook/hubert-base-ls960
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 20
batch_size: 32
lr: 0.0001
lr_ssl: 0.00001
lr_final: 0.0001
encoder_dim: 768

sample_rate: 16000
shuffle: True

#freeze all ssl
freeze_ssl: True
#set to true to freeze the CONV part of the ssl model
# We see an improvement of 2% with freezing CNNs
freeze_ssl_conv: True

# # Feature parameters
# n_mels: 24
# left_frames: 0
# right_frames: 0
# deltas: False

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
    source: !ref <sslmodel_hub>
    output_norm: True
    freeze: !ref <freeze_ssl>
    freeze_feature_extractor: !ref <freeze_ssl_conv>
    save_path: !ref <sslmodel_folder>

avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: False

output_mlp: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <encoder_dim>
    n_neurons: !ref <out_n_neurons>
    bias: False

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    output_mlp: !ref <output_mlp>

model: !new:torch.nn.ModuleList
    - [!ref <output_mlp>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Overwriting hparams_hubert.yaml


#### Run

In [None]:
# !rm -rf /content/results/encodec_v12/1986/

!python train_hubert.py hparams_hubert.yaml --data_folder=/path/to/GSC

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

#### Visualization

![image.png](attachment:0ad5b2bb-c53f-481b-bd57-d8ea413d6c56.png)

**The model does not overfit and generalizes well. The error rate also gradually decreases with the epoch count. However, it starts to plateau around the 20th epoch. The final error rate on the test set is 6.97%.**

### hparams - fine-tune all the layers of the self-supervised features

In [None]:
%%file hparams_hubert.yaml

# ################################
# Model: Classification with HuBERT (self-supervised encoder + linear classifier)
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/hubert_4_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# URL for the ssl model, you can change it to benchmark different models
# Important: we use the pre-trained checkpoint and not one fine-tuned on an ASR task
# This allows you to get ~4% improvement
sslmodel_hub: facebook/hubert-base-ls960
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 20
batch_size: 32
lr: 0.0001
lr_ssl: 0.00001
lr_final: 0.0001
encoder_dim: 768

sample_rate: 16000
shuffle: True

#freeze all ssl
freeze_ssl: False
#set to true to freeze the CONV part of the ssl model
# We see an improvement of 2% with freezing CNNs
freeze_ssl_conv: False

# # Feature parameters
# n_mels: 24
# left_frames: 0
# right_frames: 0
# deltas: False

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.hubert.HuBERT
    source: !ref <sslmodel_hub>
    output_norm: True
    freeze: !ref <freeze_ssl>
    freeze_feature_extractor: !ref <freeze_ssl_conv>
    save_path: !ref <sslmodel_folder>

avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: False

output_mlp: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <encoder_dim>
    n_neurons: !ref <out_n_neurons>
    bias: False

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    output_mlp: !ref <output_mlp>

model: !new:torch.nn.ModuleList
    - [!ref <output_mlp>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Overwriting hparams_hubert.yaml


#### Run

In [None]:
# !rm -rf /content/results/encodec_v12/1986/

!python train_hubert.py hparams_hubert.yaml --data_folder=/path/to/GSC

Some weights of the model checkpoint at facebook/hubert-base-ls960 were not used when initializing HubertModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertModel were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

#### Visualization

**Similar to Wav2Vec, the model seems to overfit slightly on the data even though we just use a simple linear layer as a classifier. This is because we are fine-tuning all the layers of HuBERT on the downstream task on a relatively small dataset. With more data, this problem can be tackled. Again, ideally the training would stop at the 5th epoch. The error rate does not decline smoothly, but it decreases nonetheless. With fine-tuning, HuBERT reaches an error rate of 1.23% on the test set.**

![image.png](attachment:4280191b-ac2a-4d3c-8122-003cb5144555.png)

**It can be observed that fine-tuning HuBERT improves performance compared to the model without fine-tuning. Note, however, that fine-tuning can cause overfitting and should be done in moderation.**

![image.png](attachment:1fd30b7a-5471-424d-b317-1a1ff1821bc6.png)

## WAVLM

**What is WavLM and why use it?**
**Also a successor to Wav2Vec, WavLM is the newest and best-performing self-supervised feature extractor. It has been pre-trained on a larger corpus, and some additions were made to the transformer segment. Presenting these models in order shows the evolution of feature extractors.**

**WavLM specifically is trained for only 5 epochs. This is because it reaches near-optimal accuracy within a few epochs. Beyond that, we face the problem of overfitting.**

#### train

In [None]:
%%file train_wavlm.py

#!/usr/bin/python3
"""Recipe for training a classifier using the
Google Speech Commands v0.02 Dataset.

To run this recipe, use the following command:
> python train_wavlm.py {hyperparameter_file} --data_folder=/path/to/GSC

Using your own hyperparameter file or one of the following:
    hparams_wavlm.yaml (WavLM encoder + linear classifier)

Author
    * Mirco Ravanelli 2020
    * Hwidong Na 2020
    * Nauman Dawalatabad 2020
    * Sarthak Yadav 2022
    Script adapted by David Raby-Pepin 2021
"""
import os
import sys
import torch
import torchaudio
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml

import speechbrain.nnet.CNN
from speechbrain.utils.distributed import run_on_main


class SpeakerBrain(sb.core.Brain):
    """Brain for Google Speech Commands classification.

    The forward pass runs ``modules.ssl_model`` on the raw waveforms, pools
    the result with ``hparams.avg_pool`` and classifies it with
    ``modules.output_mlp``. The encoder and the classifier each get their
    own optimizer and learning-rate scheduler.
    """

    def compute_forward(self, batch, stage):
        """Run encoder -> pooling -> classifier -> log-softmax on a batch.

        NOTE: no waveform augmentation or feature normalization is applied
        here; the waveforms go straight into the self-supervised encoder.

        Arguments
        ---------
        batch : object providing ``.to(device)`` and ``.sig``
            ``batch.sig`` unpacks to ``(wavs, lens)``.
        stage : sb.Stage
            Current stage (unused by this method).

        Returns
        -------
        tuple
            ``(log_probs, lens)``, where ``log_probs`` is the output of
            ``hparams.log_softmax`` and ``lens`` is taken from ``batch.sig``.
        """
        batch = batch.to(self.device)
        wavs, lens = batch.sig

        encoded = self.modules.ssl_model(wavs, lens)

        # Pool the encoder output, then flatten everything after the batch
        # dimension so the linear classifier sees one vector per utterance.
        pooled = self.hparams.avg_pool(encoded, lens)
        pooled = pooled.view(pooled.shape[0], -1)

        logits = self.modules.output_mlp(pooled)
        return self.hparams.log_softmax(logits), lens

    def compute_objectives(self, predictions, batch, stage):
        """Compute the loss, using the encoded command id as the label.

        Outside of the TRAIN stage the predictions are also appended to
        ``self.error_metrics`` so the error rate can be summarized later.
        """
        log_probs, _ = predictions
        utt_ids = batch.id
        targets, _ = batch.command_encoded

        # Drop dimension 1 of the labels before computing the cost.
        targets = targets.squeeze(1)
        loss = self.hparams.compute_cost(log_probs, targets)

        # Only schedulers exposing a per-batch hook are stepped here.
        scheduler = self.hparams.lr_annealing
        if hasattr(scheduler, "on_batch_end"):
            scheduler.on_batch_end(self.optimizer)

        if stage != sb.Stage.TRAIN:
            self.error_metrics.append(utt_ids, log_probs, targets)

        return loss

    def on_stage_start(self, stage, epoch=None):
        """Create a fresh error tracker at the start of VALID/TEST stages."""
        if stage == sb.Stage.TRAIN:
            return
        self.error_metrics = self.hparams.error_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """Store/log stage statistics, anneal learning rates and checkpoint.

        * TRAIN: the loss is remembered in ``self.train_stats``.
        * VALID: both schedulers are called with the validation error rate,
          the optimizers are updated, the stats are logged and a checkpoint
          is saved (``min_keys=["ErrorRate"]``).
        * TEST: the test stats are logged with the current epoch count.
        """
        stats = {"loss": stage_loss}

        if stage == sb.Stage.TRAIN:
            # Kept so it can be logged together with the validation stats.
            self.train_stats = stats
            return

        stats["ErrorRate"] = self.error_metrics.summarize("average")
        error_rate = stats["ErrorRate"]

        if stage == sb.Stage.VALID:
            # Classifier learning rate.
            old_lr, new_lr = self.hparams.lr_annealing(error_rate)
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)

            # Self-supervised encoder learning rate.
            old_lr_ssl, new_lr_ssl = self.hparams.lr_annealing_ssl(error_rate)
            sb.nnet.schedulers.update_learning_rate(
                self.ssl_optimizer, new_lr_ssl
            )

            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch, "lr": old_lr, "ssl_lr": old_lr_ssl},
                train_stats=self.train_stats,
                valid_stats=stats,
            )
            self.checkpointer.save_and_keep_only(
                meta={"ErrorRate": error_rate},
                min_keys=["ErrorRate"],
            )
        elif stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                {"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stats,
            )

    def init_optimizers(self):
        """Create the encoder and classifier optimizers.

        Both optimizers are registered with the checkpointer (when one is
        set) and exposed through ``self.optimizers_dict``.
        NOTE(review): ``optimizers_dict`` is presumably what the base Brain
        uses to step both optimizers; confirm against the installed
        SpeechBrain version.
        """
        ssl_params = self.modules.ssl_model.parameters()
        self.ssl_optimizer = self.hparams.ssl_opt_class(ssl_params)

        classifier_params = self.hparams.model.parameters()
        self.optimizer = self.hparams.opt_class(classifier_params)

        if self.checkpointer is not None:
            self.checkpointer.add_recoverable("ssl_opt", self.ssl_optimizer)
            self.checkpointer.add_recoverable("optimizer", self.optimizer)

        self.optimizers_dict = {
            "model_optimizer": self.optimizer,
            "ssl_optimizer": self.ssl_optimizer,
        }


def dataio_prep(hparams):
    """Build the train/valid/test datasets and attach their pipelines.

    Arguments
    ---------
    hparams : dict
        Must contain ``data_folder``, ``save_folder`` and the three
        ``*_annotation`` CSV paths.

    Returns
    -------
    tuple
        ``(train_data, valid_data, test_data, label_encoder)``.
    """
    data_folder = hparams["data_folder"]

    # 1. Declarations: one dataset per annotation CSV (train, valid, test).
    datasets = [
        sb.dataio.dataset.DynamicItemDataset.from_csv(
            csv_path=hparams[annotation_key],
            replacements={"data_root": data_folder},
        )
        for annotation_key in (
            "train_annotation",
            "valid_annotation",
            "test_annotation",
        )
    ]
    train_data, valid_data, test_data = datasets
    label_encoder = sb.dataio.encoder.CategoricalEncoder()

    # 2. Audio pipeline: load only the [start, stop) frames of each file.
    @sb.utils.data_pipeline.takes("wav", "start", "stop", "duration")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav, start, stop, duration):
        offset = int(start)
        n_frames = int(stop) - offset
        sig, _ = torchaudio.load(wav, num_frames=n_frames, frame_offset=offset)
        # Swap the two axes, then squeeze axis 1
        # (assumes single-channel audio -- TODO confirm).
        return sig.transpose(0, 1).squeeze(1)

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Label pipeline: expose the raw command and its encoded form.
    @sb.utils.data_pipeline.takes("command")
    @sb.utils.data_pipeline.provides("command", "command_encoded")
    def label_pipeline(command):
        yield command
        yield label_encoder.encode_sequence_torch([command])

    sb.dataio.dataset.add_dynamic_item(datasets, label_pipeline)

    # 4. Label encoder: loaded from / created at <save_folder>/label_encoder.txt
    #    using the "command" values of the training split.
    label_encoder.load_or_create(
        path=os.path.join(hparams["save_folder"], "label_encoder.txt"),
        from_didatasets=[train_data],
        output_key="command",
    )

    # 5. Keys exposed in every batch.
    sb.dataio.dataset.set_output_keys(
        datasets, ["id", "sig", "command_encoded"]
    )

    return train_data, valid_data, test_data, label_encoder


if __name__ == "__main__":
    # Enable the built-in cuDNN auto-tuner.
    torch.backends.cudnn.benchmark = True

    # Parse the command line: hparams file, run options and overrides.
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    # Initialize the DDP process group (only relevant for multi-GPU runs).
    sb.utils.distributed.ddp_init_group(run_opts)

    # Load the hyperparameters, applying any command-line overrides.
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Create the experiment directory and save the hyperparameters there.
    sb.core.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # Dataset prep (parsing GSC and annotation into csv files)
    from prepare_GSC import prepare_GSC

    # The V2 12 task uses the ten core commands; the V2 35 task uses the
    # same ten followed by the auxiliary words.
    core_commands = [
        "yes",
        "no",
        "up",
        "down",
        "left",
        "right",
        "on",
        "off",
        "stop",
        "go",
    ]
    auxiliary_words = [
        "zero",
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "bed",
        "bird",
        "cat",
        "dog",
        "happy",
        "house",
        "marvin",
        "sheila",
        "tree",
        "wow",
        "backward",
        "forward",
        "follow",
        "learn",
        "visual",
    ]

    if hparams["number_of_commands"] == 12:
        words_wanted = core_commands
    elif hparams["number_of_commands"] == 35:
        words_wanted = core_commands + auxiliary_words
    else:
        raise ValueError("number_of_commands must be 12 or 35")

    # Data preparation. The CSV annotations are written to output_folder,
    # which is passed as prepare_GSC's save_folder argument.
    prepare_kwargs = {
        "data_folder": hparams["data_folder"],
        "save_folder": hparams["output_folder"],
        "validation_percentage": hparams["validation_percentage"],
        "testing_percentage": hparams["testing_percentage"],
        "percentage_unknown": hparams["percentage_unknown"],
        "percentage_silence": hparams["percentage_silence"],
        "words_wanted": words_wanted,
        "skip_prep": hparams["skip_prep"],
    }
    run_on_main(prepare_GSC, kwargs=prepare_kwargs)

    # Prepare the noise and room-impulse-response datasets.
    for prepare_key in ("prepare_noise_data", "prepare_rir_data"):
        run_on_main(hparams[prepare_key])

    # Dataset IO prep: Dataset objects plus the command label encoder.
    train_data, valid_data, test_data, label_encoder = dataio_prep(hparams)

    # Brain class initialization
    brain = SpeakerBrain(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    # Training
    loader_options = hparams["dataloader_options"]
    brain.fit(
        brain.hparams.epoch_counter,
        train_data,
        valid_data,
        train_loader_kwargs=loader_options,
        valid_loader_kwargs=loader_options,
    )

    # Evaluate on the test set, selecting the checkpoint by lowest ErrorRate.
    test_stats = brain.evaluate(
        test_set=test_data,
        min_key="ErrorRate",
        test_loader_kwargs=hparams["dataloader_options"],
    )

Overwriting train_wavlm.py


### hparams - freeze Conv layers and fine-tune the remaining layers of the self-supervised features

In [None]:
%%file hparams_wavlm.yaml

# ################################
# Model: Classification with WavLM (self-supervised encoder + linear classifier)
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/wavlm_1_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# URL for the ssl model, you can change it to benchmark different models
# Important: we use the pre-trained checkpoint and not one fine-tuned on an ASR task
# This allows you to get ~4% improvement
sslmodel_hub: microsoft/wavlm-large
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 5
batch_size: 32
lr: 0.0001
lr_ssl: 0.00001
lr_final: 0.0001
encoder_dim: 1024

sample_rate: 16000
shuffle: True

#freeze all ssl
freeze_ssl: False
#set to true to freeze the CONV part of the ssl model
# We see an improvement of 2% with freezing CNNs
freeze_ssl_conv: True

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM
    source: !ref <sslmodel_hub>
    output_norm: True
    freeze: !ref <freeze_ssl>
    freeze_feature_extractor: !ref <freeze_ssl_conv>
    save_path: !ref <sslmodel_folder>

avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: False

output_mlp: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <encoder_dim>
    n_neurons: !ref <out_n_neurons>
    bias: False

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    output_mlp: !ref <output_mlp>

model: !new:torch.nn.ModuleList
    - [!ref <output_mlp>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Writing hparams_wavlm.yaml


#### Run

In [None]:
!rm -rf /content/results/encodec_v12/1986/

!python train_wavlm.py hparams_wavlm.yaml --data_folder=/path/to/GSC

config.json: 100%|██████████████████████████| 2.22k/2.22k [00:00<00:00, 392kB/s]
pytorch_model.bin: 100%|███████████████████| 1.26G/1.26G [00:21<00:00, 59.8MB/s]
Some weights of the model checkpoint at microsoft/wavlm-large were not used when initializing WavLMModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMModel were not initialized from the model checkpoint at microsoft/wavlm-large and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'enc

#### Visualization

![image.png](attachment:bf81ee16-e01d-4e4e-929c-4bcae01e0a65.png)

### hparams - fine-tune the Conv layers and freeze the remaining layers of the self-supervised features

In [None]:
%%file hparams_wavlm.yaml

# ################################
# Model: Classification with WavLM (self-supervised) features + linear classifier
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/wavlm_2_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# URL for the ssl model, you can change it to benchmark different models
# Important: use the pretrained SSL checkpoint (here WavLM large), not one fine-tuned on an ASR task
# This allows you to have ~4% improvement
sslmodel_hub: microsoft/wavlm-large
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 5
batch_size: 32
lr: 0.0001
lr_ssl: 0.00001
lr_final: 0.0001
encoder_dim: 1024

sample_rate: 16000
shuffle: True

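# Freeze all ssl layers.
# NOTE(review): with freeze_ssl: True the whole SSL model appears to be frozen
# regardless of freeze_ssl_conv, so the conv layers may not actually be fine-tuned
# in this configuration — confirm against the SpeechBrain WavLM wrapper.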
freeze_ssl: True
#set to true to freeze the CONV part of the ssl model
# We see an improvement of 2% with freezing CNNs
freeze_ssl_conv: False

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM
    source: !ref <sslmodel_hub>
    output_norm: True
    freeze: !ref <freeze_ssl>
    freeze_feature_extractor: !ref <freeze_ssl_conv>
    save_path: !ref <sslmodel_folder>

avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: False

output_mlp: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <encoder_dim>
    n_neurons: !ref <out_n_neurons>
    bias: False

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    output_mlp: !ref <output_mlp>

model: !new:torch.nn.ModuleList
    - [!ref <output_mlp>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Overwriting hparams_wavlm.yaml


#### Run

In [None]:
!rm -rf /content/results/encodec_v12/1986/

!python train_wavlm.py hparams_wavlm.yaml --data_folder=/path/to/GSC

config.json: 100%|██████████████████████████| 2.22k/2.22k [00:00<00:00, 258kB/s]
pytorch_model.bin: 100%|████████████████████| 1.26G/1.26G [00:03<00:00, 329MB/s]
Some weights of the model checkpoint at microsoft/wavlm-large were not used when initializing WavLMModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMModel were not initialized from the model checkpoint at microsoft/wavlm-large and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'enc

### hparams - freeze all the layers of the self-supervised features

In [None]:
%%file hparams_wavlm.yaml

# ################################
# Model: Classification with WavLM (self-supervised) features + linear classifier
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/wavlm_3_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# URL for the ssl model, you can change it to benchmark different models
# Important: use the pretrained SSL checkpoint (here WavLM large), not one fine-tuned on an ASR task
# This allows you to have ~4% improvement
sslmodel_hub: microsoft/wavlm-large
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 5
batch_size: 32
lr: 0.0001
lr_ssl: 0.00001
lr_final: 0.0001
encoder_dim: 1024

sample_rate: 16000
shuffle: True

#freeze all ssl
freeze_ssl: True
#set to true to freeze the CONV part of the ssl model
# We see an improvement of 2% with freezing CNNs
freeze_ssl_conv: True

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM
    source: !ref <sslmodel_hub>
    output_norm: True
    freeze: !ref <freeze_ssl>
    freeze_feature_extractor: !ref <freeze_ssl_conv>
    save_path: !ref <sslmodel_folder>

avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: False

output_mlp: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <encoder_dim>
    n_neurons: !ref <out_n_neurons>
    bias: False

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    output_mlp: !ref <output_mlp>

model: !new:torch.nn.ModuleList
    - [!ref <output_mlp>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Overwriting hparams_wavlm.yaml


#### Run

In [None]:
!rm -rf /content/results/encodec_v12/1986/

!python train_wavlm.py hparams_wavlm.yaml --data_folder=/path/to/GSC

Some weights of the model checkpoint at microsoft/wavlm-large were not used when initializing WavLMModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMModel were not initialized from the model checkpoint at microsoft/wavlm-large and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and i

#### Visualization

**Even when fine-tuning just the output linear layer, it achieves error rates between 3% and 5%. This tells us that WavLM has been trained well and is naturally good at extracting continuous features from audio signals. The final test error rate is 2.5%, which is very close to the fine-tuned version of Wav2Vec.**

![image.png](attachment:36351f0c-e40f-4436-b6b7-0b60438325bc.png)

### hparams - fine-tune all the layers of the self-supervised features

In [None]:
%%file hparams_wavlm.yaml

# ################################
# Model: Classification with WavLM (self-supervised) features + linear classifier
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/wavlm_4_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# URL for the ssl model, you can change it to benchmark different models
# Important: use the pretrained SSL checkpoint (here WavLM large), not one fine-tuned on an ASR task
# This allows you to have ~4% improvement
sslmodel_hub: microsoft/wavlm-large
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 5
batch_size: 32
lr: 0.0001
lr_ssl: 0.00001
lr_final: 0.0001
encoder_dim: 1024

sample_rate: 16000
shuffle: True

#freeze all ssl
freeze_ssl: False
#set to true to freeze the CONV part of the ssl model
# We see an improvement of 2% with freezing CNNs
freeze_ssl_conv: False

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.wavlm.WavLM
    source: !ref <sslmodel_hub>
    output_norm: True
    freeze: !ref <freeze_ssl>
    freeze_feature_extractor: !ref <freeze_ssl_conv>
    save_path: !ref <sslmodel_folder>

avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: False

output_mlp: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <encoder_dim>
    n_neurons: !ref <out_n_neurons>
    bias: False

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    output_mlp: !ref <output_mlp>

model: !new:torch.nn.ModuleList
    - [!ref <output_mlp>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Overwriting hparams_wavlm.yaml


#### Run

In [None]:
!rm -rf /content/results/encodec_v12/1986/

!python train_wavlm.py hparams_wavlm.yaml --data_folder=/path/to/GSC

Some weights of the model checkpoint at microsoft/wavlm-large were not used when initializing WavLMModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMModel were not initialized from the model checkpoint at microsoft/wavlm-large and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and i

#### Visualization

**WavLM performs exceedingly well within a few epochs. The error rate does not decrease smoothly, but it trends downward over time. The final test error rate is 0.76%, which is amazing.**

![image.png](attachment:38cb8211-4778-46c6-9df3-98960d38b3d1.png)

#### Wav2Vec vs WavLM

**Wav2Vec and WavLM show close results. Wav2Vec seems to converge faster but they both end up at the same error rates later on.**

![image.png](attachment:970514cb-5475-4a98-a6b4-60b654dec369.png)

# DISCRETE FEATURES WITH ENCODEC AND DAC

**What are discrete features? Why are they needed?**
**Discrete features in speech recognition contain distinct, categorical data about the audio signals. They give us linguistic information such as phonemes or words. The main reason we want to use discrete features is their compactness: they are more data-efficient and robust. We will mainly use Encodec for the experimentation.**

## Embedding for Encodec discrete features

In [None]:
%%file custom_model.py

import torch

class AttentionMLP(torch.nn.Module):
    """Small MLP that maps each input vector to one score and normalizes
    the scores with a softmax.

    Arguments
    ---------
    input_dim : int
        Size of the last dimension of the input.
    hidden_dim : int
        Width of the hidden linear layer.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        scorer = [
            torch.nn.Linear(input_dim, hidden_dim),
            torch.nn.ReLU(),
            # One score per input vector, no bias term.
            torch.nn.Linear(hidden_dim, 1, bias=False),
        ]
        self.layers = torch.nn.Sequential(*scorer)

    def forward(self, x):
        """Returns the scores of `x`, softmax-normalized along dimension 2."""
        scores = self.layers(x)
        return scores.softmax(dim=2)


class Discrete_EmbeddingLayer(torch.nn.Module):
    """This class handles embedding layers for discrete tokens.

    A single embedding table of ``num_codebooks * vocab_size`` rows is shared
    by all codebooks; the tokens of codebook ``k`` are shifted by
    ``k * vocab_size`` so that every codebook addresses its own slice.

    Arguments
    ---------
    num_codebooks: int ,
        number of codebooks of the tokenizer.
    vocab_size : int,
        size of the dictionary of embeddings
    emb_dim: int ,
        the size of each embedding vector
    pad_index: int (default: 0),
        Accepted for interface compatibility; it is currently not forwarded
        to the underlying ``torch.nn.Embedding``.
    init: boolean (default: False):
        Stored on the instance as ``self.init``. The layer itself is always
        randomly initialized; call ``init_embedding`` to load external weights.
    freeze: boolean (default: False)
       If True, the embedding is frozen. If False, the model will be trained
        alongside with the rest of the pipeline.

    Example
    -------
    >>> from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec
    >>> model_hub = "facebook/encodec_24khz"
    >>> save_path = "savedir"
    >>> model = Encodec(model_hub, save_path)
    >>> audio = torch.randn(4, 1000)
    >>> length = torch.tensor([1.0, .5, .75, 1.0])
    >>> tokens, emb = model.encode(audio, length)
    >>> print(tokens.shape)
    torch.Size([4, 4, 2])
    >>> emb= Discrete_EmbeddingLayer(2, 1024, 1024)
    >>> in_emb = emb(tokens)
    >>> print(in_emb.shape)
    torch.Size([4, 4, 2, 1024])
    """

    def __init__(
        self,
        num_codebooks,
        vocab_size,
        emb_dim,
        pad_index=0,
        init=False,
        freeze=False,
    ):
        super(Discrete_EmbeddingLayer, self).__init__()
        self.vocab_size = vocab_size
        self.num_codebooks = num_codebooks
        self.freeze = freeze
        self.embedding = torch.nn.Embedding(
            num_codebooks * vocab_size, emb_dim
        ).requires_grad_(not self.freeze)
        self.init = init

    def init_embedding(self, weights):
        """Replaces the embedding table with `weights`.

        Arguments
        ---------
        weights : torch.Tensor
            New embedding matrix; it is wrapped in a fresh parameter.
        """
        with torch.no_grad():
            # A new Parameter defaults to requires_grad=True, so the freeze
            # setting has to be re-applied explicitly.
            self.embedding.weight = torch.nn.Parameter(
                weights, requires_grad=not self.freeze
            )

    def forward(self, in_tokens):
        """Computes the embedding for discrete tokens.

        Arguments
        ---------
        in_tokens : torch.Tensor
            A (Batch x Time x num_codebooks) tensor of token IDs. The tensor
            is not modified.

        Returns
        -------
        in_embs : torch.Tensor
            The embeddings of the tokens, with one extra trailing dimension
            of size emb_dim.
        """
        with torch.set_grad_enabled(not self.freeze):
            # Make token IDs unique across codebooks: codebook k is shifted
            # by k * vocab_size.
            codebook_offsets = torch.arange(
                0,
                self.num_codebooks * self.vocab_size,
                self.vocab_size,
                device=in_tokens.device,
            )
            # Out-of-place add: an in-place `+=` would alter the caller's
            # tensor, and a second pass over the same tokens would then be
            # shifted twice.
            shifted_tokens = in_tokens + codebook_offsets
            in_embs = self.embedding(shifted_tokens)
            return in_embs

Writing custom_model.py


## ENCODEC

**Freezing or fine-tuning Encodec results in the same performance. So we shall freeze the Encodec's layers during this notebook.**

## hparams - Xvector

**Intuitively, Xvector will not perform well because it is just a CNN module applied to our extracted features. It learns only the local dependencies and loses out on the global dependencies. We choose this model to set a low baseline and compare how RNNs and Transformers perform against it.**

**Experimented with multiple combinations of convolution layers and kernels. The error rate does not reduce substantially but we can preclude overfitting.**

In [None]:
%%file hparams_encodec.yaml

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/encodec_Xvector_<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# URL for the ssl model, you can change it to benchmark different models
# Important: we use wav2vec2 base and not the fine-tuned one with ASR task
# This allows you to have a ~4% improvement
sslmodel_hub: facebook/encodec_24khz
sslmodel_folder: !ref <save_folder>/ssl_checkpoint


# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 20
batch_size: 32
lr: 0.0001
lr_ssl: 0.00001

sample_rate: 24000
shuffle: True

#freeze all ssl
freeze_ssl: True
#set to true to freeze the CONV part of the ssl model
# We see an improvement of 2% with freezing CNNs
# freeze_ssl_conv: True

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 4
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>


# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.encodec.Encodec
    source: !ref <sslmodel_hub>
    sample_rate: !ref <sample_rate>
    bandwidth: 6.0
    freeze: !ref <freeze_ssl>
    flat_embeddings: False
    renorm_embeddings: True
    save_path: !ref <sslmodel_folder>

num_codebooks: 8
num_clusters: 1024
encoder_dim: 256

discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer
   num_codebooks: !ref <num_codebooks>
   vocab_size: !ref <num_clusters>
   emb_dim: !ref <encoder_dim>

attention_mlp: !new:custom_model.AttentionMLP
    input_dim: !ref <encoder_dim>
    hidden_dim: !ref <encoder_dim>

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: False
# Xvector model parameters

# emb_dim: 64 #128

#Xvector Model
embedding_model: !new:speechbrain.lobes.models.Xvector.Xvector
    in_channels: !ref <encoder_dim>
    activation: !name:torch.nn.LeakyReLU
    tdnn_blocks: 1 #2
    tdnn_channels: [64, 64]
    tdnn_kernel_sizes: [3, 3]
    tdnn_dilations: [ 1, 1]
    lin_neurons: !ref <encoder_dim>

# Classifier applied on top of the embeddings
classifier: !new:speechbrain.lobes.models.Xvector.Classifier
    input_shape: [null, null, !ref <encoder_dim>]
    activation: !name:torch.nn.LeakyReLU
    lin_blocks: 1
    lin_neurons: !ref <encoder_dim>
    out_neurons: !ref <out_n_neurons>


log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]


modules:
    ssl_model: !ref <ssl_model>
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    discrete_embedding_layer: !ref <discrete_embedding_layer>
    attention_mlp: !ref <attention_mlp>
    mean_var_norm: !ref <mean_var_norm>

# The training script builds its main optimizer from
# `hparams.model.parameters()`, so every module that must be trained has to
# be listed here. The token embedding layer and the attention MLP are part of
# the forward pass and would otherwise never be updated.
model: !new:torch.nn.ModuleList
    - [!ref <ssl_model>, !ref <embedding_model>, !ref <classifier>, !ref <discrete_embedding_layer>, !ref <attention_mlp>]


# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing_output: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Writing hparams_encodec.yaml


### train

In [None]:
%%file train_encodec.py
#!/usr/bin/python3
"""Recipe for training a classifier using the
Google Speech Commands v0.02 Dataset.

To run this recipe, use the following command:
> python train.py {hyperparameter_file}

Using your own hyperparameter file or one of the following:
    hyperparams/xvect.yaml (xvector system)

Author
    * Mirco Ravanelli 2020
    * Hwidong Na 2020
    * Nauman Dawalatabad 2020
    * Sarthak Yadav 2022
    Script adapted by David Raby-Pepin 2021
"""
import os
import sys
import torch
import torchaudio
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml
import torchaudio
# import librosa

import speechbrain.nnet.CNN
from speechbrain.utils.distributed import run_on_main


class SpeakerBrain(sb.core.Brain):
    """Brain subclass that trains a command classifier on codec tokens."""

    def compute_forward(self, batch, stage):
        """Runs tokenization, token embedding, attention-weighted pooling,
        normalization, the embedding model and the classifier on one batch.

        Arguments
        ---------
        batch : PaddedBatch
            Batch providing `sig` (signals and their relative lengths).
        stage : sb.Stage
            Current stage; not used by this method.

        Returns
        -------
        predictions
            Output of the classifier module.
        lens
            The relative lengths taken from `batch.sig`.
        """
        batch = batch.to(self.device)
        wavs, lens = batch.sig

        # The codec only serves as a tokenizer here: it is put in eval mode
        # and run without gradient tracking.
        with torch.no_grad():
            self.modules.ssl_model.to(self.device).eval()
            tokens, _ = self.modules.ssl_model.encode(wavs, lens)

        # One embedding per token; presumably (batch, time, codebooks, emb_dim)
        # given the embedding layer's documented input layout - TODO confirm.
        token_embs = self.modules.discrete_embedding_layer(tokens)

        # Weighted sum of the embeddings using the attention weights, which
        # collapses dimension 2 of the embeddings.
        att_weights = self.modules.attention_mlp(token_embs)
        weighted = torch.matmul(att_weights.transpose(2, -1), token_embs)
        pooled_feats = weighted.squeeze(-2)

        normed_feats = self.hparams.mean_var_norm(pooled_feats, lens)
        utt_embeddings = self.modules.embedding_model(normed_feats, lens)
        predictions = self.modules.classifier(utt_embeddings)

        return predictions, lens

    def compute_objectives(self, predictions, batch, stage):
        """Computes the loss with the encoded command as the target.

        Outside of the training stage, the predictions are also recorded in
        the error metric.
        """
        predictions, lens = predictions
        utt_ids = batch.id
        targets, _ = batch.command_encoded

        loss = self.hparams.compute_cost(predictions, targets, lens)

        # Only schedulers that expose a per-batch hook are stepped here.
        scheduler = self.hparams.lr_annealing
        if hasattr(scheduler, "on_batch_end"):
            scheduler.on_batch_end(self.optimizer)

        if stage != sb.Stage.TRAIN:
            self.error_metrics.append(utt_ids, predictions, targets, lens)

        return loss

    def on_stage_start(self, stage, epoch=None):
        """Creates a fresh error metric for every non-training stage."""
        if stage == sb.Stage.TRAIN:
            return
        self.error_metrics = self.hparams.error_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """Stores/logs the stage statistics; on validation it also anneals the
        learning rate and saves a checkpoint."""
        stats = {"loss": stage_loss}

        if stage == sb.Stage.TRAIN:
            # Kept so that it can be logged together with the validation stats.
            self.train_stats = stats
            return

        stats["ErrorRate"] = self.error_metrics.summarize("average")

        if stage == sb.Stage.VALID:
            old_lr, new_lr = self.hparams.lr_annealing(epoch)
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)

            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch, "lr": old_lr},
                train_stats=self.train_stats,
                valid_stats=stats,
            )
            # Checkpoints are ranked by the lowest validation error rate.
            self.checkpointer.save_and_keep_only(
                meta={"ErrorRate": stats["ErrorRate"]},
                min_keys=["ErrorRate"],
            )
        elif stage == sb.Stage.TEST:
            # Test statistics go to stdout and to the logfile.
            self.hparams.train_logger.log_stats(
                {"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stats,
            )

    def init_optimizers(self):
        "Initializes the ssl optimizer and model optimizer"
        ssl_params = self.modules.ssl_model.parameters()
        self.ssl_optimizer = self.hparams.ssl_opt_class(ssl_params)

        model_params = self.hparams.model.parameters()
        self.optimizer = self.hparams.opt_class(model_params)

        if self.checkpointer is not None:
            # Register both optimizers so their state is saved and restored.
            for ckpt_name, optimizer in (
                ("ssl_opt", self.ssl_optimizer),
                ("optimizer", self.optimizer),
            ):
                self.checkpointer.add_recoverable(ckpt_name, optimizer)

        self.optimizers_dict = {
            "model_optimizer": self.optimizer,
            "ssl_optimizer": self.ssl_optimizer,
        }


def dataio_prep(hparams):
    """Creates the datasets and their data processing pipelines.

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. Reads "data_folder", "train_annotation",
        "valid_annotation", "test_annotation" and "save_folder", plus
        "sample_rate" when present.

    Returns
    -------
    train_data, valid_data, test_data : DynamicItemDataset
        Datasets built from the three annotation CSV files.
    label_encoder : CategoricalEncoder
        Encoder mapping command strings to integer labels.
    """

    data_folder = hparams["data_folder"]

    # 1. Declarations:
    train_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["train_annotation"],
        replacements={"data_root": data_folder},
    )

    valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["valid_annotation"],
        replacements={"data_root": data_folder},
    )

    test_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["test_annotation"],
        replacements={"data_root": data_folder},
    )

    datasets = [train_data, valid_data, test_data]
    label_encoder = sb.dataio.encoder.CategoricalEncoder()

    # 2. Define audio pipeline:
    @sb.utils.data_pipeline.takes("wav", "start", "stop", "duration")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(
        wav, start, stop, duration, target_sr=hparams.get("sample_rate", 24000)
    ):
        start = int(start)
        stop = int(stop)
        num_frames = stop - start
        sig, fs = torchaudio.load(
            wav, num_frames=num_frames, frame_offset=start
        )
        # The downstream modules are configured for `target_sr`; bring every
        # file to that rate instead of passing it on at its native rate.
        if fs != target_sr:
            sig = torchaudio.functional.resample(
                sig, orig_freq=fs, new_freq=target_sr
            )
        # (channels, time) -> (time, channels); drops the channel axis when
        # it has size 1.
        sig = sig.transpose(0, 1).squeeze(1)

        return sig

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Define text pipeline:
    @sb.utils.data_pipeline.takes("command")
    @sb.utils.data_pipeline.provides("command", "command_encoded")
    def label_pipeline(command):
        yield command
        command_encoded = label_encoder.encode_sequence_torch([command])
        yield command_encoded

    sb.dataio.dataset.add_dynamic_item(datasets, label_pipeline)

    # 4. Fit encoder:
    # Load or compute the label encoder (with multi-GPU DDP support)
    lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt")
    label_encoder.load_or_create(
        path=lab_enc_file, from_didatasets=[train_data], output_key="command",
    )

    # 5. Set output:
    sb.dataio.dataset.set_output_keys(
        datasets, ["id", "sig", "command_encoded"]
    )

    return train_data, valid_data, test_data, label_encoder


if __name__ == "__main__":

    # Let cudnn auto-tune its kernels
    torch.backends.cudnn.benchmark = True

    # Command line: hyperparameter file, run options and overrides
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    # DDP initialization (only matters for multi-GPU DDP training)
    sb.utils.distributed.ddp_init_group(run_opts)

    # Hyperparameters, with the command-line overrides applied
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Experiment directory
    sb.core.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # Dataset prep (parsing GSC and annotation into csv files)
    from prepare_GSC import prepare_GSC

    # The V2 12 task uses the ten core commands; the V2 35 task uses the core
    # commands followed by the remaining words.
    core_words = [
        "yes",
        "no",
        "up",
        "down",
        "left",
        "right",
        "on",
        "off",
        "stop",
        "go",
    ]
    extra_words = [
        "zero",
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "bed",
        "bird",
        "cat",
        "dog",
        "happy",
        "house",
        "marvin",
        "sheila",
        "tree",
        "wow",
        "backward",
        "forward",
        "follow",
        "learn",
        "visual",
    ]
    n_commands = hparams["number_of_commands"]
    if n_commands == 12:
        words_wanted = core_words
    elif n_commands == 35:
        words_wanted = core_words + extra_words
    else:
        raise ValueError("number_of_commands must be 12 or 35")

    # Data preparation: these settings are forwarded under their own names...
    prep_kwargs = {
        key: hparams[key]
        for key in (
            "validation_percentage",
            "testing_percentage",
            "percentage_unknown",
            "percentage_silence",
            "skip_prep",
        )
    }
    # ...while the annotation files are saved in the experiment output folder.
    prep_kwargs["data_folder"] = hparams["data_folder"]
    prep_kwargs["save_folder"] = hparams["output_folder"]
    prep_kwargs["words_wanted"] = words_wanted
    run_on_main(prepare_GSC, kwargs=prep_kwargs)
    # sb.utils.distributed.run_on_main(hparams["prepare_noise_data"])
    # sb.utils.distributed.run_on_main(hparams["prepare_rir_data"])

    # Dataset objects and label encoding
    train_data, valid_data, test_data, label_encoder = dataio_prep(hparams)

    # Brain class initialization
    speaker_brain = SpeakerBrain(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    # Training; the same loader options are used for training and validation
    loader_options = hparams["dataloader_options"]
    speaker_brain.fit(
        speaker_brain.hparams.epoch_counter,
        train_data,
        valid_data,
        train_loader_kwargs=loader_options,
        valid_loader_kwargs=loader_options,
    )

    # Evaluation with the checkpoint that has the lowest ErrorRate
    test_stats = speaker_brain.evaluate(
        test_set=test_data,
        min_key="ErrorRate",
        test_loader_kwargs=loader_options,
    )

Writing train_encodec.py


# Encodec bandwidth 12.0 -> 16 codebooks
# Encodec bandwidth 6.0 -> 8 codebooks


### Run



In [None]:
# import warnings
# warnings.filterwarnings('ignore')

!rm -rf /content/results/encodec_Xvector_12/1986/

!python train_encodec.py hparams_encodec.yaml --data_folder=/path/to/GSC

config.json: 100% 809/809 [00:00<00:00, 5.23MB/s]
model.safetensors: 100% 93.1M/93.1M [00:02<00:00, 34.2MB/s]
  WeightNorm.apply(module, name, dim)
  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
speechbrain.lobes.models.huggingface_transformers.huggingface - EncodecModel is frozen.
huggingface_Encodec - Encodec is frozen.
speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: results/encodec_Xvector_12/1986
numexpr.utils - NumExpr defaulting to 2 threads.
Downloading http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz to /path/to/GSC/speech_commands_v0.02.tar.gz
speech_commands_v0.02.tar.gz: 2.43GB [01:59, 20.4MB/s]                
Extracting /path/to/GSC/speech_commands_v0.02.tar.gz to /path/to/GSC
speechbrain.dataio.encoder - Load called, but CategoricalEncoder is not empty. Loaded data will overwrite everything. This is normal if there is e.g. an unk label defined at init.
speechbr

#### Visualization

**As said earlier, we see that Xvector overfits on the input data. The error rate also is pretty high since it does not capture long term dependencies. The final test error rate is around 73%. With this, we know that Xvector or CNN by itself is not a good idea for audio signals.**

![image.png](attachment:3e3bb128-44c3-4666-947f-a8164e32da6e.png)

### train with augmentation

In [None]:
%%file train_encodec.py
#!/usr/bin/python3
"""Recipe for training a classifier using the
Google Speech Commands v0.02 Dataset.

To run this recipe, use the following command:
> python train.py {hyperparameter_file}

Using your own hyperparameter file or one of the following:
    hyperparams/xvect.yaml (xvector system)

Author
    * Mirco Ravanelli 2020
    * Hwidong Na 2020
    * Nauman Dawalatabad 2020
    * Sarthak Yadav 2022
    Script adapted by David Raby-Pepin 2021
"""
import os
import sys
import torch
import torchaudio
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml
import torchaudio
# import librosa

import speechbrain.nnet.CNN
from speechbrain.utils.distributed import run_on_main


class SpeakerBrain(sb.core.Brain):
    """Class for GSC training with waveform augmentation."""

    def compute_forward(self, batch, stage):
        """Computation pipeline based on a encoder + command classifier.
        Data augmentation and environmental corruption are applied to the
        input speech.

        Arguments
        ---------
        batch : PaddedBatch
            Batch providing `sig` (signals and their relative lengths).
        stage : sb.Stage
            Current stage; augmentation is only applied during training.

        Returns
        -------
        predictions
            Output of the classifier (after the optional softmax module).
        lens
            Relative lengths, as returned by the augmenter when it is applied.
        """
        batch = batch.to(self.device)
        wavs, lens = batch.sig

        # Add waveform augmentation if specified.
        if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"):
            wavs, lens = self.hparams.wav_augment(wavs, lens)

        if isinstance(
            self.modules.ssl_model, speechbrain.lobes.features.Leaf
        ):
            # if leaf, first normalize the wavs before feeding them to leaf
            # no normalization is needed after LEAF
            feats = self.hparams.mean_var_norm(wavs, lens)
            tokens, _ = self.modules.ssl_model(feats, lens)
            tokens = tokens.float()
        else:
            # Otherwise the model output is cast to float and normalized
            # afterwards.
            tokens, _ = self.modules.ssl_model(wavs, lens)
            tokens = tokens.float()
            tokens = self.hparams.mean_var_norm(tokens, lens)

        embeddings = self.modules.embedding_model(tokens, lens)
        predictions = self.modules.classifier(embeddings)

        # Ecapa model uses softmax outside of its classifier.
        # The softmax has to be applied to (and stored back into) the
        # predictions; reading any other name here would raise a NameError.
        if "softmax" in self.modules.keys():
            predictions = self.modules.softmax(predictions)

        return predictions, lens

    def compute_objectives(self, predictions, batch, stage):
        """Computes the loss using command-id as label.

        During training with augmentation, the labels are replicated through
        the augmenter before the loss is computed. Outside of training, the
        predictions are also recorded in the error metric.
        """
        predictions, lens = predictions
        uttid = batch.id
        command, _ = batch.command_encoded

        # Concatenate labels (due to data augmentation)
        if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"):
            command = self.hparams.wav_augment.replicate_labels(command)

        # compute the cost function
        loss = self.hparams.compute_cost(predictions, command, lens)

        # Only schedulers that expose a per-batch hook are stepped here.
        if hasattr(self.hparams.lr_annealing, "on_batch_end"):
            self.hparams.lr_annealing.on_batch_end(self.optimizer)

        if stage != sb.Stage.TRAIN:
            self.error_metrics.append(uttid, predictions, command, lens)

        return loss

    def on_stage_start(self, stage, epoch=None):
        """Gets called at the beginning of an epoch."""
        if stage != sb.Stage.TRAIN:
            self.error_metrics = self.hparams.error_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """Gets called at the end of an epoch."""
        # Compute/store important stats
        stage_stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            self.train_stats = stage_stats
        else:
            stage_stats["ErrorRate"] = self.error_metrics.summarize("average")

        # Perform end-of-iteration things, like annealing, logging, etc.
        if stage == sb.Stage.VALID:
            old_lr, new_lr = self.hparams.lr_annealing(epoch)
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)

            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch, "lr": old_lr},
                train_stats=self.train_stats,
                valid_stats=stage_stats,
            )
            # Checkpoints are ranked by the lowest validation error rate.
            self.checkpointer.save_and_keep_only(
                meta={"ErrorRate": stage_stats["ErrorRate"]},
                min_keys=["ErrorRate"],
            )

        # We also write statistics about test data to stdout and to the logfile.
        if stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                {"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stage_stats,
            )

    def init_optimizers(self):
        "Initializes the ssl optimizer and model optimizer"
        self.ssl_optimizer = self.hparams.ssl_opt_class(
            self.modules.ssl_model.parameters()
        )
        self.optimizer = self.hparams.opt_class(self.hparams.model.parameters())

        if self.checkpointer is not None:
            # Register both optimizers so their state is saved and restored.
            self.checkpointer.add_recoverable(
                "ssl_opt", self.ssl_optimizer
            )
            self.checkpointer.add_recoverable("optimizer", self.optimizer)

        self.optimizers_dict = {
            "model_optimizer": self.optimizer,
            "ssl_optimizer": self.ssl_optimizer,
        }


def dataio_prep(hparams):
    """Creates the datasets and their data processing pipelines.

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. The keys "data_folder", "train_annotation",
        "valid_annotation", "test_annotation" and "save_folder" are read.

    Returns
    -------
    train_data, valid_data, test_data : DynamicItemDataset
        Datasets built from the three annotation CSV files; each item
        provides "id", "sig" and "command_encoded".
    label_encoder : CategoricalEncoder
        Encoder loaded from (or created at) ``<save_folder>/label_encoder.txt``.
    """

    data_folder = hparams["data_folder"]

    # 1. Declarations:
    train_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["train_annotation"],
        replacements={"data_root": data_folder},
    )

    valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["valid_annotation"],
        replacements={"data_root": data_folder},
    )

    test_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["test_annotation"],
        replacements={"data_root": data_folder},
    )

    datasets = [train_data, valid_data, test_data]
    label_encoder = sb.dataio.encoder.CategoricalEncoder()

    # One Resample transform per (source rate, target rate) pair, so the
    # resampling kernel is built once instead of once per utterance.
    resamplers = {}

    # 2. Define audio pipeline:
    @sb.utils.data_pipeline.takes("wav", "start", "stop", "duration")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav, start, stop, duration, target_sr=24000):
        start = int(start)
        stop = int(stop)
        num_frames = stop - start
        sig, fs = torchaudio.load(
            wav, num_frames=num_frames, frame_offset=start
        )
        # `target_sr` used to be accepted but ignored, so the signal kept the
        # file's native rate. Bring it to the requested rate when they differ.
        if fs != target_sr:
            rates = (fs, target_sr)
            if rates not in resamplers:
                resamplers[rates] = torchaudio.transforms.Resample(
                    fs, target_sr
                )
            sig = resamplers[rates](sig)
        sig = sig.transpose(0, 1).squeeze(1)

        return sig

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Define text pipeline:
    @sb.utils.data_pipeline.takes("command")
    @sb.utils.data_pipeline.provides("command", "command_encoded")
    def label_pipeline(command):
        yield command
        command_encoded = label_encoder.encode_sequence_torch([command])
        yield command_encoded

    sb.dataio.dataset.add_dynamic_item(datasets, label_pipeline)

    # 4. Fit encoder:
    # Load or compute the label encoder (with multi-GPU DDP support)
    lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt")
    label_encoder.load_or_create(
        path=lab_enc_file, from_didatasets=[train_data], output_key="command",
    )

    # 5. Set output:
    sb.dataio.dataset.set_output_keys(
        datasets, ["id", "sig", "command_encoded"]
    )

    return train_data, valid_data, test_data, label_encoder


if __name__ == "__main__":
    # Let cuDNN auto-tune its kernels
    torch.backends.cudnn.benchmark = True

    # Command line: hyperparameter file, run options and overrides
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    # DDP group initialization (only matters for multi-GPU DDP training)
    sb.utils.distributed.ddp_init_group(run_opts)

    # Hyperparameters, with the command-line overrides applied
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Experiment directory
    sb.core.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # GSC parsing / csv annotation helper
    from prepare_GSC import prepare_GSC

    # The V2 12 task uses the ten core commands; the V2 35 task uses
    # every word of the dataset.
    core_words = [
        "yes", "no", "up", "down", "left",
        "right", "on", "off", "stop", "go",
    ]
    auxiliary_words = [
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
        "bed", "bird", "cat", "dog", "happy",
        "house", "marvin", "sheila", "tree", "wow",
        "backward", "forward", "follow", "learn", "visual",
    ]
    n_commands = hparams["number_of_commands"]
    if n_commands == 12:
        words_wanted = list(core_words)
    elif n_commands == 35:
        words_wanted = core_words + auxiliary_words
    else:
        raise ValueError("number_of_commands must be 12 or 35")

    # Data preparation (main process only)
    prep_kwargs = {
        "data_folder": hparams["data_folder"],
        "save_folder": hparams["output_folder"],
        "validation_percentage": hparams["validation_percentage"],
        "testing_percentage": hparams["testing_percentage"],
        "percentage_unknown": hparams["percentage_unknown"],
        "percentage_silence": hparams["percentage_silence"],
        "words_wanted": words_wanted,
        "skip_prep": hparams["skip_prep"],
    }
    run_on_main(prepare_GSC, kwargs=prep_kwargs)

    # Noise and RIR data for augmentation
    for prep_key in ("prepare_noise_data", "prepare_rir_data"):
        sb.utils.distributed.run_on_main(hparams[prep_key])

    # Dataset objects and the command label encoder
    train_data, valid_data, test_data, label_encoder = dataio_prep(hparams)

    # Brain class initialization
    speaker_brain = SpeakerBrain(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    loader_kwargs = hparams["dataloader_options"]

    # Training
    speaker_brain.fit(
        speaker_brain.hparams.epoch_counter,
        train_data,
        valid_data,
        train_loader_kwargs=loader_kwargs,
        valid_loader_kwargs=loader_kwargs,
    )

    # Evaluation with the checkpoint that has the lowest ErrorRate
    test_stats = speaker_brain.evaluate(
        test_set=test_data,
        min_key="ErrorRate",
        test_loader_kwargs=loader_kwargs,
    )

Overwriting train_encodec.py


### Run

In [None]:
!python train_encodec.py hparams_encodec.yaml --data_folder=/path/to/GSC

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
speechbrain.lobes.models.huggingface_transformers.huggingface - EncodecModel is frozen.
huggingface_Encodec - Encodec is frozen.
  warn(f"Failed to load image Python extension: {e}")
speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: results/encodec_aug_Xvector12/1986
prepare_GSC - Extracting speech_commands_v0.02.tar.gz...
/path/to/GSC/noise/data.zip exists. Skipping download
/path/to/GSC/rir/data.zip exists. Skipping download
speechbrain.dataio.encoder - Load called, but CategoricalEncoder is not empty. Loaded data will overwrite everything. This is normal if there is e.g. an unk label defined at init.
speechbrain.core - Info: ckpt_interval_minutes arg from hparam file is used
speechbrain.core - Gradscaler enabled: False. Using precision: fp32.
speechbrain.core - SpeakerBrain Model Statistics:
* Total Number of Trainable Parameters: 29.2k
* Tota

In [None]:
%%file custom_model.py

import torch

class AttentionMLP(torch.nn.Module):
    """Small MLP that turns input vectors into attention weights.

    Each input vector is mapped to one scalar score
    (Linear -> ReLU -> bias-free Linear) and the scores are normalized
    with a softmax over dimension 2.

    Arguments
    ---------
    input_dim : int
        Size of the last dimension of the input.
    hidden_dim : int
        Number of units of the hidden layer.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        scorer = [
            torch.nn.Linear(input_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, 1, bias=False),
        ]
        self.layers = torch.nn.Sequential(*scorer)

    def forward(self, x):
        """Returns the attention weights for ``x``.

        The result has the shape of ``x`` with the last dimension reduced
        to 1, and sums to one along dimension 2.
        """
        scores = self.layers(x)
        return torch.softmax(scores, dim=2)


class Discrete_EmbeddingLayer(torch.nn.Module):
    """This class handles embedding layers for discrete tokens.

    A single embedding table of ``num_codebooks * vocab_size`` rows is shared
    by all codebooks; codebook ``k`` uses the rows
    ``[k * vocab_size, (k + 1) * vocab_size)``.

    Arguments
    ---------
    num_codebooks : int
        Number of codebooks of the tokenizer.
    vocab_size : int
        Size of the dictionary of embeddings (per codebook).
    emb_dim : int
        The size of each embedding vector.
    pad_index : int (default: 0)
        Accepted for interface compatibility; it is currently not used by
        this layer (no padding index is set on the embedding table).
    init : bool (default: False)
        Flag stored on the module as ``self.init``. The layer itself always
        starts from a random table; use ``init_embedding`` to load
        tokenizer embeddings.
    freeze : bool (default: False)
        If True, the embedding is frozen. If False, the model will be trained
        alongside with the rest of the pipeline.

    Example
    -------
    >>> from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec
    >>> model_hub = "facebook/encodec_24khz"
    >>> save_path = "savedir"
    >>> model = Encodec(model_hub, save_path)
    >>> audio = torch.randn(4, 1000)
    >>> length = torch.tensor([1.0, .5, .75, 1.0])
    >>> tokens, emb = model.encode(audio, length)
    >>> print(tokens.shape)
    torch.Size([4, 4, 2])
    >>> emb= Discrete_EmbeddingLayer(2, 1024, 1024)
    >>> in_emb = emb(tokens)
    >>> print(in_emb.shape)
    torch.Size([4, 4, 2, 1024])
    """

    def __init__(
        self,
        num_codebooks,
        vocab_size,
        emb_dim,
        pad_index=0,
        init=False,
        freeze=False,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.num_codebooks = num_codebooks
        self.freeze = freeze
        self.embedding = torch.nn.Embedding(
            num_codebooks * vocab_size, emb_dim
        ).requires_grad_(not self.freeze)
        self.init = init

    def init_embedding(self, weights):
        """Replaces the embedding table with ``weights``.

        Arguments
        ---------
        weights : torch.Tensor
            New embedding table.
        """
        with torch.no_grad():
            # Keep the new table consistent with `freeze`: a bare Parameter
            # would silently become trainable again.
            self.embedding.weight = torch.nn.Parameter(
                weights, requires_grad=not self.freeze
            )

    def forward(self, in_tokens):
        """Computes the embedding for discrete tokens.

        Arguments
        ---------
        in_tokens : torch.Tensor
            A (Batch x Time x num_codebooks) tensor of token indices.
            The tensor is left unmodified.

        Returns
        -------
        in_embs : torch.Tensor
            Embeddings of the tokens; one extra trailing dimension of
            size ``emb_dim`` is added to the input shape.
        """
        with torch.set_grad_enabled(not self.freeze):
            # Offset of each codebook inside the shared embedding table
            codebook_offsets = torch.arange(
                0,
                self.num_codebooks * self.vocab_size,
                self.vocab_size,
                device=in_tokens.device,
            )
            # Out-of-place add: an in-place `+=` would shift the caller's
            # tokens, so a second pass over the same tensor would index the
            # wrong rows (or fall outside the table).
            shifted_tokens = in_tokens + codebook_offsets
            in_embs = self.embedding(shifted_tokens)
            return in_embs

## hparams - LSTM

**What is LSTM? Why use it?**
**An LSTM is a type of RNN used for sequential data. Unlike a traditional RNN, an LSTM copes better with vanishing gradients and can handle long-term dependencies, which makes it an ideal candidate for speech tasks. Many combinations were tried and tested; some of the hyperparameters that were tuned are batch_size: 32, 64; learning rate: 0.0001-0.0005; hidden_size: 64, 128, 256; and number of cells: 2, 3, 4, to name a few. Intuitively, LSTMs should perform well and considerably better than Xvectors.**

In [None]:
%%file hparams_lstm_encodec.yaml

# ################################
# Model: Classification with an LSTM on EnCodec discrete tokens (adapted from the xvector recipe)
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/encodec_rnn_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

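# Hub identifier of the pretrained model (here the EnCodec 24 kHz codec); you can change it to benchmark different models
# NOTE: the two wav2vec2 remarks below are inherited from the original SSL recipe and do not describe EnCodec.
# Important: we use wav2vec2 base and not the one fine-tuned on the ASR task
# This allows you to get a ~4% improvement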
sslmodel_hub: facebook/encodec_24khz
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 20
batch_size: 32 #64 #128
lr: 0.00025 #0.0001
lr_ssl: 0.00001
hidden: 128 #64 #256
linear_dim: 256 # 6750 #4800 #9600 #19200 # #38400 #
emb_size: 512
dropout: 0.5
num_codebooks: 8
num_clusters: 1024
encoder_dim: 128

sample_rate: 24000
shuffle: True

#freeze all ssl
freeze: True

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.encodec.Encodec
    source: !ref <sslmodel_hub>
    bandwidth: 6 #12
    sample_rate: !ref <sample_rate>
    flat_embeddings: False
    freeze: !ref <freeze>
    renorm_embeddings: True
    save_path: !ref <sslmodel_folder>

discrete_embedding_layer: !new:custom_model.Discrete_EmbeddingLayer
   num_codebooks: !ref <num_codebooks>
   vocab_size: !ref <num_clusters>
   emb_dim: !ref <encoder_dim>

attention_mlp: !new:custom_model.AttentionMLP
   input_dim: !ref <encoder_dim>
   hidden_dim: !ref <encoder_dim>


lstm: !new:speechbrain.nnet.RNN.LSTM
        input_size: !ref <encoder_dim>
        bidirectional: False #True
        hidden_size: !ref <hidden>
        num_layers: 2 #3
        dropout: !ref <dropout>
        re_init: True

linear_1: !new:speechbrain.nnet.linear.Linear
        input_size: !ref <linear_dim>
        n_neurons: !ref <out_n_neurons>
        bias: False

# linear_2: !new:speechbrain.nnet.linear.Linear
#         input_size: 100 #64
#         n_neurons: !ref <out_n_neurons>
#         bias: False

avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: True

dropout_layer: !new:speechbrain.nnet.dropout.Dropout2d
          drop_rate: !ref <dropout>

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    lstm: !ref <lstm>
    linear_1: !ref <linear_1>
    # linear_2: !ref <linear_2>
    log_softmax: !ref <log_softmax>
    discrete_embedding_layer: !ref <discrete_embedding_layer>
    attention_mlp: !ref <attention_mlp>

# Modules handed to the model optimizer (init_optimizers uses
# `hparams.model.parameters()`) and saved by the checkpointer under `model`.
# The embedding layer and the attention MLP are part of the forward pass, so
# they must be listed here; otherwise they stay at their random initialization
# and are never checkpointed. They are appended last so the indices of the
# previously listed modules do not change.
model: !new:torch.nn.ModuleList
    - [!ref <ssl_model>, !ref <lstm>, !ref <linear_1>, !ref <log_softmax>, !ref <discrete_embedding_layer>, !ref <attention_mlp>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Writing hparams_lstm_encodec.yaml


### train

In [None]:
%%file train_lstm_encodec.py

#!/usr/bin/python3
"""Recipe for training a classifier using the
Google Speech Commands v0.02 Dataset.


To run this recipe, use the following command:
> python train_lstm_encodec.py {hyperparameter_file}

Using your own hyperparameter file or one of the following:
    hparams_lstm_encodec.yaml (LSTM system on EnCodec tokens)

Author
    * Mirco Ravanelli 2020
    * Hwidong Na 2020
    * Nauman Dawalatabad 2020
    * Sarthak Yadav 2022
    Script adapted by David Raby-Pepin 2021
"""
import os
import sys
import torch
import torchaudio
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml

import speechbrain.nnet.CNN
from speechbrain.utils.distributed import run_on_main


class SpeakerBrain(sb.core.Brain):
    """Brain for GSC command classification on top of a tokenizing ssl model."""

    def compute_forward(self, batch, stage):
        """Runs the encoder + command classifier on a batch.

        The waveforms are tokenized by the ssl model, the tokens are embedded,
        combined with attention weights, passed through the LSTM, pooled and
        classified. Waveform augmentation is not applied in this recipe.

        Arguments
        ---------
        batch : PaddedBatch
            Batch providing ``sig``.
        stage : sb.Stage
            Current stage (unused here).

        Returns
        -------
        outputs : torch.Tensor
            Output of the log-softmax layer.
        lens : torch.Tensor
            Relative lengths of the input signals.
        """
        batch = batch.to(self.device)
        wavs, lens = batch.sig

        # Only the discrete tokens are kept; the embeddings returned by the
        # ssl model are replaced by the ones of the embedding layer.
        tokens, _ = self.modules.ssl_model.encode(wavs, lens)
        embeddings = self.modules.discrete_embedding_layer(tokens)

        # Attention-weighted combination of the embeddings
        att_w = self.modules.attention_mlp(embeddings)
        feats = torch.matmul(att_w.transpose(2, -1), embeddings).squeeze(-2)

        # Recurrent encoder, statistics pooling and classifier
        rnn_out, _ = self.modules.lstm(feats)
        pooled = self.hparams.avg_pool(rnn_out, lens)
        flat = pooled.reshape(pooled.shape[0], -1)
        logits = self.modules.linear_1(flat)

        return self.hparams.log_softmax(logits), lens

    def compute_objectives(self, predictions, batch, stage):
        """Computes the loss using command-id as label.

        Outside of training, the predictions are also appended to the
        error metric created in ``on_stage_start``.
        """
        log_probs, _ = predictions
        uttid = batch.id
        command, _ = batch.command_encoded
        targets = command.squeeze(1)

        loss = self.hparams.compute_cost(log_probs, targets)

        # Schedulers that work per batch expose `on_batch_end`
        scheduler = self.hparams.lr_annealing
        if hasattr(scheduler, "on_batch_end"):
            scheduler.on_batch_end(self.optimizer)

        if stage != sb.Stage.TRAIN:
            self.error_metrics.append(uttid, log_probs, targets)

        return loss

    def on_stage_start(self, stage, epoch=None):
        """Creates a fresh error metric for validation and test stages."""
        if stage == sb.Stage.TRAIN:
            return
        self.error_metrics = self.hparams.error_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """Wraps up a stage: stores stats, anneals both LRs, logs, checkpoints.

        Arguments
        ---------
        stage : sb.Stage
            The stage that just finished (TRAIN, VALID or TEST).
        stage_loss : float
            Average loss of the finished stage.
        epoch : int
            Current epoch, written to the training log.
        """
        summary = {"loss": stage_loss}

        # The training loss is logged later, together with validation stats
        if stage == sb.Stage.TRAIN:
            self.train_stats = summary
            return

        summary["ErrorRate"] = self.error_metrics.summarize("average")

        if stage == sb.Stage.VALID:
            # Both schedulers are driven by the validation error rate
            old_lr, new_lr = self.hparams.lr_annealing(summary["ErrorRate"])
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)

            old_lr_ssl, new_lr_ssl = self.hparams.lr_annealing_ssl(
                summary["ErrorRate"]
            )
            sb.nnet.schedulers.update_learning_rate(
                self.ssl_optimizer, new_lr_ssl
            )

            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch, "lr": old_lr, "ssl_lr": old_lr_ssl},
                train_stats=self.train_stats,
                valid_stats=summary,
            )
            # Keep only the checkpoint with the lowest validation error rate
            self.checkpointer.save_and_keep_only(
                meta={"ErrorRate": summary["ErrorRate"]},
                min_keys=["ErrorRate"],
            )
        elif stage == sb.Stage.TEST:
            # Test statistics go to stdout and to the logfile as well
            self.hparams.train_logger.log_stats(
                {"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=summary,
            )

    def init_optimizers(self):
        """Builds the ssl optimizer and the model optimizer.

        The ssl optimizer is created over ``self.modules.ssl_model`` and the
        model optimizer over ``self.hparams.model``. Both are registered with
        the checkpointer (when there is one) and exposed through
        ``self.optimizers_dict``.
        """
        ssl_params = self.modules.ssl_model.parameters()
        self.ssl_optimizer = self.hparams.ssl_opt_class(ssl_params)

        model_params = self.hparams.model.parameters()
        self.optimizer = self.hparams.opt_class(model_params)

        checkpointer = self.checkpointer
        if checkpointer is not None:
            checkpointer.add_recoverable("ssl_opt", self.ssl_optimizer)
            checkpointer.add_recoverable("optimizer", self.optimizer)

        self.optimizers_dict = dict(
            model_optimizer=self.optimizer,
            ssl_optimizer=self.ssl_optimizer,
        )


def dataio_prep(hparams):
    """Creates the datasets and their data processing pipelines.

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. The keys "data_folder", "train_annotation",
        "valid_annotation", "test_annotation" and "save_folder" are read.
        "sample_rate" is used as the resampling target when present
        (24000 otherwise).

    Returns
    -------
    train_data, valid_data, test_data : DynamicItemDataset
        Datasets built from the three annotation CSV files; each item
        provides "id", "sig" and "command_encoded".
    label_encoder : CategoricalEncoder
        Encoder loaded from (or created at) ``<save_folder>/label_encoder.txt``.
    """

    data_folder = hparams["data_folder"]

    # Rate the signals are resampled to; taken from the hyperparameters so it
    # follows the rate configured for the codec instead of a hard-coded value.
    target_sr = hparams.get("sample_rate", 24000)

    # 1. Declarations:
    train_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["train_annotation"],
        replacements={"data_root": data_folder},
    )

    valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["valid_annotation"],
        replacements={"data_root": data_folder},
    )

    test_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["test_annotation"],
        replacements={"data_root": data_folder},
    )

    datasets = [train_data, valid_data, test_data]
    label_encoder = sb.dataio.encoder.CategoricalEncoder()

    # One Resample transform per source rate, so the resampling kernel is
    # built once instead of once per utterance.
    resamplers = {}

    # 2. Define audio pipeline:
    @sb.utils.data_pipeline.takes("wav", "start", "stop", "duration")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav, start, stop, duration):
        start = int(start)
        stop = int(stop)
        num_frames = stop - start
        sig, fs = torchaudio.load(
            wav, num_frames=num_frames, frame_offset=start
        )
        if fs not in resamplers:
            resamplers[fs] = torchaudio.transforms.Resample(fs, target_sr)
        sig = resamplers[fs](sig)
        sig = sig.transpose(0, 1).squeeze(1)
        return sig

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Define text pipeline:
    @sb.utils.data_pipeline.takes("command")
    @sb.utils.data_pipeline.provides("command", "command_encoded")
    def label_pipeline(command):
        yield command
        command_encoded = label_encoder.encode_sequence_torch([command])
        yield command_encoded

    sb.dataio.dataset.add_dynamic_item(datasets, label_pipeline)

    # 4. Fit encoder:
    # Load or compute the label encoder (with multi-GPU DDP support)
    lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt")
    label_encoder.load_or_create(
        path=lab_enc_file,
        from_didatasets=[train_data],
        output_key="command",
    )

    # 5. Set output:
    sb.dataio.dataset.set_output_keys(
        datasets, ["id", "sig", "command_encoded"]
    )

    return train_data, valid_data, test_data, label_encoder


if __name__ == "__main__":
    # Let cuDNN auto-tune its kernels
    torch.backends.cudnn.benchmark = True

    # Command line: hyperparameter file, run options and overrides
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    # DDP group initialization (only matters for multi-GPU DDP training)
    sb.utils.distributed.ddp_init_group(run_opts)

    # Hyperparameters, with the command-line overrides applied
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Experiment directory
    sb.core.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # GSC parsing / csv annotation helper
    from prepare_GSC import prepare_GSC

    # The V2 12 task uses the ten core commands; the V2 35 task uses
    # every word of the dataset.
    core_words = [
        "yes", "no", "up", "down", "left",
        "right", "on", "off", "stop", "go",
    ]
    auxiliary_words = [
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
        "bed", "bird", "cat", "dog", "happy",
        "house", "marvin", "sheila", "tree", "wow",
        "backward", "forward", "follow", "learn", "visual",
    ]
    n_commands = hparams["number_of_commands"]
    if n_commands == 12:
        words_wanted = list(core_words)
    elif n_commands == 35:
        words_wanted = core_words + auxiliary_words
    else:
        raise ValueError("number_of_commands must be 12 or 35")

    # Data preparation (main process only)
    prep_kwargs = {
        "data_folder": hparams["data_folder"],
        "save_folder": hparams["output_folder"],
        "validation_percentage": hparams["validation_percentage"],
        "testing_percentage": hparams["testing_percentage"],
        "percentage_unknown": hparams["percentage_unknown"],
        "percentage_silence": hparams["percentage_silence"],
        "words_wanted": words_wanted,
        "skip_prep": hparams["skip_prep"],
    }
    run_on_main(prepare_GSC, kwargs=prep_kwargs)

    # Noise and RIR data for augmentation
    for prep_key in ("prepare_noise_data", "prepare_rir_data"):
        sb.utils.distributed.run_on_main(hparams[prep_key])

    # Dataset objects and the command label encoder
    train_data, valid_data, test_data, label_encoder = dataio_prep(hparams)

    # Brain class initialization
    speaker_brain = SpeakerBrain(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    loader_kwargs = hparams["dataloader_options"]

    # Training
    speaker_brain.fit(
        speaker_brain.hparams.epoch_counter,
        train_data,
        valid_data,
        train_loader_kwargs=loader_kwargs,
        valid_loader_kwargs=loader_kwargs,
    )

    # Evaluation with the checkpoint that has the lowest ErrorRate
    test_stats = speaker_brain.evaluate(
        test_set=test_data,
        min_key="ErrorRate",
        test_loader_kwargs=loader_kwargs,
    )

Writing train_lstm_encodec.py


### Run

In [None]:
# import warnings
# warnings.filterwarnings('ignore')

!rm -rf /content/results/encodec_rnn_v12/1986/

!python train_lstm_encodec.py hparams_lstm_encodec.yaml --data_folder=/path/to/GSC

config.json: 100% 809/809 [00:00<00:00, 5.47MB/s]
model.safetensors: 100% 93.1M/93.1M [00:00<00:00, 195MB/s]
  WeightNorm.apply(module, name, dim)
  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
speechbrain.lobes.models.huggingface_transformers.huggingface - EncodecModel is frozen.
huggingface_Encodec - Encodec is frozen.
speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: results/encodec_rnn_v12/1986
numexpr.utils - NumExpr defaulting to 2 threads.
prepare_GSC - Extracting speech_commands_v0.02.tar.gz...
Downloading https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1 to /path/to/GSC/noise/data.zip
noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1: 569MB [00:39, 14.3MB/s]               
Extracting /path/to/GSC/noise/data.zip to /path/to/GSC/noise
Downloading https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743

#### Visualization

**The graphs below show that sequence-based models are an excellent choice, since audio signals are inherently sequential. The model does not overfit the data either, and the error rate decreases gradually along an almost smooth curve. The final test error rate is 56%, which is very good compared to Xvector.**

![image.png](attachment:a9e8ef2e-2f52-41e1-ac33-cac28be1e41c.png)

## hparams - GRU

**What is GRU? Why use it?**
**A GRU is also a type of RNN. However, GRUs are simpler than LSTMs, with fewer gates. They process data sequentially and can be trained faster than LSTMs. They are robust and can be tuned to give the best performance of the lot. As with the LSTM, many parameters were tuned to improve the output of the GRU. GRUs are prone to overfitting, so a large amount of data and a decent learning rate make for a good training recipe.**

In [None]:
%%file hparams_encodec.yaml

# ################################
# Model: Classification with a GRU on EnCodec features (adapted from the xvector recipe)
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/encodec_test_gru_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

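# Hub identifier of the pretrained model (here the EnCodec 24 kHz codec); you can change it to benchmark different models
# NOTE: the two wav2vec2 remarks below are inherited from the original SSL recipe and do not describe EnCodec.
# Important: we use wav2vec2 base and not the one fine-tuned on the ASR task
# This allows you to get a ~4% improvement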
sslmodel_hub: facebook/encodec_24khz
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 20
batch_size: 32 #32 #128
lr: 0.00025 #0.0001
lr_ssl: 0.00001
hidden: 128 #128
linear_dim: 9600 #19200 # #38400
emb_size: 16
dropout: 0.5

sample_rate: 24000
shuffle: True

#freeze all ssl
freeze: True

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.encodec.Encodec
    source: !ref <sslmodel_hub>
    bandwidth: 12
    sample_rate: !ref <sample_rate>
    flat_embeddings: False
    freeze: !ref <freeze>
    renorm_embeddings: True
    save_path: !ref <sslmodel_folder>

gru: !new:speechbrain.nnet.RNN.GRU
        input_size: !ref <emb_size>
        bidirectional: False
        hidden_size: !ref <hidden>
        num_layers: 4
        dropout: !ref <dropout>
        # re_init: True

linear_1: !new:speechbrain.nnet.linear.Linear
        input_size: !ref <linear_dim>
        n_neurons: 1000

linear_2: !new:speechbrain.nnet.linear.Linear
        input_size: 1000
        n_neurons: !ref <out_n_neurons>

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    gru: !ref <gru>
    linear_1: !ref <linear_1>
    linear_2: !ref <linear_2>
    log_softmax: !ref <log_softmax>

# Every trainable module must be listed here: the training script builds the
# model optimizer from `model.parameters()` and the checkpointer saves `model`,
# so a module missing from this list is neither updated nor checkpointed.
# `linear_2` (the output layer used in compute_forward) was previously missing.
# NOTE: checkpoints saved with the old 4-module list will not match this one.
model: !new:torch.nn.ModuleList
    - [!ref <ssl_model>, !ref <gru>, !ref <linear_1>, !ref <linear_2>, !ref <log_softmax>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Overwriting hparams_encodec.yaml


### train

In [None]:
%%file train_encodec.py

#!/usr/bin/python3
"""Recipe for training a classifier using the
Google Speech Commands v0.02 Dataset.

To run this recipe, use the following command:
> python train.py {hyperparameter_file}

Using your own hyperparameter file or one of the following:
    hyperparams/xvect.yaml (xvector system)

Author
    * Mirco Ravanelli 2020
    * Hwidong Na 2020
    * Nauman Dawalatabad 2020
    * Sarthak Yadav 2022
    Script adapted by David Raby-Pepin 2021
"""
import os
import sys
import torch
import torchaudio
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml

import speechbrain.nnet.CNN
from speechbrain.utils.distributed import run_on_main


class SpeakerBrain(sb.core.Brain):
    """Brain for Google Speech Commands classification on EnCodec tokens."""

    def compute_forward(self, batch, stage):
        """Map a batch of waveforms to log-softmax outputs over the commands.

        The waveforms are tokenized by the EnCodec model; the tokens are cast
        to float, normalized, passed through the GRU, flattened and sent
        through the two linear layers. Waveform augmentation is currently
        disabled in this recipe, so ``stage`` is not used here.

        Returns
        -------
        tuple
            ``(outputs, lens)`` where ``lens`` are the lengths that come with
            ``batch.sig``.
        """
        batch = batch.to(self.device)
        wavs, lens = batch.sig

        # Only the tokens are used; the second value returned by ``encode``
        # is ignored.
        tokens, _ = self.modules.ssl_model.encode(wavs, lens)
        feats = self.hparams.mean_var_norm(tokens.float(), lens)

        hidden, _ = self.modules.gru(feats)
        # Collapse every axis after the batch axis before the linear layers.
        flat = hidden.reshape(hidden.shape[0], -1)
        logits = self.modules.linear_2(self.modules.linear_1(flat))

        return self.hparams.log_softmax(logits), lens

    def compute_objectives(self, predictions, batch, stage):
        """Compute the loss, using the encoded command id as the target.

        Outside of the training stage the predictions are also appended to
        the error-rate metric.
        """
        log_probs, _ = predictions
        utt_ids = batch.id
        targets, _ = batch.command_encoded
        targets = targets.squeeze(1)

        loss = self.hparams.compute_cost(log_probs, targets)

        # Some schedulers expose a per-batch hook; call it when it exists.
        scheduler = self.hparams.lr_annealing
        if hasattr(scheduler, "on_batch_end"):
            scheduler.on_batch_end(self.optimizer)

        if stage != sb.Stage.TRAIN:
            self.error_metrics.append(utt_ids, log_probs, targets)

        return loss

    def on_stage_start(self, stage, epoch=None):
        """Create a fresh error-rate tracker for non-training stages."""
        if stage != sb.Stage.TRAIN:
            self.error_metrics = self.hparams.error_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """Gather the stage statistics, then anneal, log and checkpoint."""
        stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            self.train_stats = stats
        else:
            stats["ErrorRate"] = self.error_metrics.summarize("average")

        if stage == sb.Stage.VALID:
            error_rate = stats["ErrorRate"]

            # Both schedulers are driven by the validation error rate.
            old_lr, new_lr = self.hparams.lr_annealing(error_rate)
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)

            old_lr_ssl, new_lr_ssl = self.hparams.lr_annealing_ssl(error_rate)
            sb.nnet.schedulers.update_learning_rate(
                self.ssl_optimizer, new_lr_ssl
            )

            self.hparams.train_logger.log_stats(
                stats_meta={
                    "epoch": epoch,
                    "lr": old_lr,
                    "ssl_lr": old_lr_ssl,
                },
                train_stats=self.train_stats,
                valid_stats=stats,
            )
            # Checkpoints are ranked by ErrorRate (passed as a min key).
            self.checkpointer.save_and_keep_only(
                meta={"ErrorRate": error_rate},
                min_keys=["ErrorRate"],
            )

        # Test statistics are written through the same train logger.
        if stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                {"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stats,
            )

    def init_optimizers(self):
        """Create the SSL-model optimizer and the model optimizer.

        The model optimizer only receives the parameters reachable from
        ``hparams.model``. Both optimizers are registered with the
        checkpointer when one is available.
        """
        ssl_params = self.modules.ssl_model.parameters()
        self.ssl_optimizer = self.hparams.ssl_opt_class(ssl_params)

        model_params = self.hparams.model.parameters()
        self.optimizer = self.hparams.opt_class(model_params)

        if self.checkpointer is not None:
            self.checkpointer.add_recoverable("ssl_opt", self.ssl_optimizer)
            self.checkpointer.add_recoverable("optimizer", self.optimizer)

        self.optimizers_dict = {
            "model_optimizer": self.optimizer,
            "ssl_optimizer": self.ssl_optimizer,
        }


def dataio_prep(hparams):
    """Create the train/valid/test datasets and their processing pipelines.

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. The keys read here are ``data_folder``,
        ``train_annotation``, ``valid_annotation``, ``test_annotation``,
        ``save_folder`` and, optionally, ``sample_rate`` (the rate the audio
        is resampled to; falls back to 24000 when absent).

    Returns
    -------
    tuple
        ``(train_data, valid_data, test_data, label_encoder)``.
    """
    data_folder = hparams["data_folder"]
    # Resample to the rate declared in the hyperparameters instead of a
    # hard-coded value, so the audio always matches the configured model.
    target_sample_rate = hparams.get("sample_rate", 24000)

    # 1. Declarations:
    def _load_split(annotation_key):
        # Build one dataset from the CSV referenced by ``annotation_key``.
        return sb.dataio.dataset.DynamicItemDataset.from_csv(
            csv_path=hparams[annotation_key],
            replacements={"data_root": data_folder},
        )

    train_data = _load_split("train_annotation")
    valid_data = _load_split("valid_annotation")
    test_data = _load_split("test_annotation")

    datasets = [train_data, valid_data, test_data]
    label_encoder = sb.dataio.encoder.CategoricalEncoder()

    # 2. Define audio pipeline:
    @sb.utils.data_pipeline.takes("wav", "start", "stop", "duration")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav, start, stop, duration):
        # Load only the [start, stop) segment of the file.
        start = int(start)
        stop = int(stop)
        num_frames = stop - start
        sig, fs = torchaudio.load(
            wav, num_frames=num_frames, frame_offset=start
        )
        sig = torchaudio.transforms.Resample(fs, target_sample_rate)(sig)
        sig = sig.transpose(0, 1).squeeze(1)
        return sig

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Define text pipeline:
    @sb.utils.data_pipeline.takes("command")
    @sb.utils.data_pipeline.provides("command", "command_encoded")
    def label_pipeline(command):
        yield command
        command_encoded = label_encoder.encode_sequence_torch([command])
        yield command_encoded

    sb.dataio.dataset.add_dynamic_item(datasets, label_pipeline)

    # 4. Fit encoder:
    # Load or compute the label encoder (with multi-GPU DDP support)
    lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt")
    label_encoder.load_or_create(
        path=lab_enc_file,
        from_didatasets=[train_data],
        output_key="command",
    )

    # 5. Set output:
    sb.dataio.dataset.set_output_keys(
        datasets, ["id", "sig", "command_encoded"]
    )

    return train_data, valid_data, test_data, label_encoder


if __name__ == "__main__":
    # Enable the built-in cudnn auto-tuner.
    torch.backends.cudnn.benchmark = True

    # Parse the command line: hyperparameter file, run options, overrides.
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    # Initialize ddp (only relevant for multi-GPU DDP training).
    sb.utils.distributed.ddp_init_group(run_opts)

    # Load the hyperparameters, applying the command-line overrides.
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Create the experiment directory.
    sb.core.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # Dataset prep (parsing GSC and annotation into csv files)
    from prepare_GSC import prepare_GSC

    # The V2 12 task uses the ten core commands; the V2 35 task uses the same
    # ten followed by the remaining words.
    core_words = [
        "yes",
        "no",
        "up",
        "down",
        "left",
        "right",
        "on",
        "off",
        "stop",
        "go",
    ]
    extra_words = [
        "zero",
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "bed",
        "bird",
        "cat",
        "dog",
        "happy",
        "house",
        "marvin",
        "sheila",
        "tree",
        "wow",
        "backward",
        "forward",
        "follow",
        "learn",
        "visual",
    ]

    n_commands = hparams["number_of_commands"]
    if n_commands == 12:
        words_wanted = core_words
    elif n_commands == 35:
        words_wanted = core_words + extra_words
    else:
        raise ValueError("number_of_commands must be 12 or 35")

    # Data preparation
    prep_kwargs = {
        "data_folder": hparams["data_folder"],
        "save_folder": hparams["output_folder"],
        "validation_percentage": hparams["validation_percentage"],
        "testing_percentage": hparams["testing_percentage"],
        "percentage_unknown": hparams["percentage_unknown"],
        "percentage_silence": hparams["percentage_silence"],
        "words_wanted": words_wanted,
        "skip_prep": hparams["skip_prep"],
    }
    run_on_main(prepare_GSC, kwargs=prep_kwargs)
    for prep_key in ("prepare_noise_data", "prepare_rir_data"):
        run_on_main(hparams[prep_key])

    # Dataset IO prep: dataset objects plus the command label encoder.
    train_data, valid_data, test_data, label_encoder = dataio_prep(hparams)

    # Brain class initialization
    speaker_brain = SpeakerBrain(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    loader_options = hparams["dataloader_options"]

    # Training
    speaker_brain.fit(
        speaker_brain.hparams.epoch_counter,
        train_data,
        valid_data,
        train_loader_kwargs=loader_options,
        valid_loader_kwargs=loader_options,
    )

    # Evaluate on the test set, selecting the checkpoint by minimum ErrorRate.
    test_stats = speaker_brain.evaluate(
        test_set=test_data,
        min_key="ErrorRate",
        test_loader_kwargs=loader_options,
    )

Overwriting train_encodec.py


### Run

In [None]:
# import warnings
# warnings.filterwarnings('ignore')

# !rm -rf /content/results/encodec_gru_v12/1986/

!python train_encodec.py hparams_encodec.yaml --data_folder=/path/to/GSC

config.json: 100%|██████████████████████████████| 809/809 [00:00<00:00, 140kB/s]
model.safetensors: 100%|████████████████████| 93.1M/93.1M [00:00<00:00, 401MB/s]
  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
speechbrain.lobes.models.huggingface_transformers.huggingface - EncodecModel is frozen.
huggingface_Encodec - Encodec is frozen.
  warn(f"Failed to load image Python extension: {e}")
speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: results/encodec_final_final_gru_v12/1986
prepare_GSC - Extracting speech_commands_v0.02.tar.gz...
/path/to/GSC/noise/data.zip exists. Skipping download
/path/to/GSC/rir/data.zip exists. Skipping download
speechbrain.dataio.encoder - Load called, but CategoricalEncoder is not empty. Loaded data will overwrite everything. This is normal if there is e.g. an unk label defined at init.
speechbrain.core - Info: ckpt_interval_minutes arg from hparam file is used
spe

#### Visualization

**In the loss-vs-epoch graph, the GRU starts overfitting the data towards the end. Nonetheless, its performance is similar to the LSTM's, and it was trained in less time. The error rate also decreases continuously throughout the epochs. The final test error rate is around 58%.**

![image.png](attachment:0b4efe8a-5e61-46b2-a63e-c7af6490c064.png)

## hparams - transformers

**What are transformers? Why use them?**
**Transformers are another family of models well suited to sequential data. Unlike RNNs and LSTMs, however, they do not process the input step by step: they process the entire sequence at once, which makes them good at retaining long-term context. Since audio data is a sequence, transformers conventionally perform very well on it. I tried various hyperparameters but was not able to bring out the true potential of transformers; I believe I am missing something in the code. The following is my attempt at using transformers for keyword spotting.**

In [None]:
%%file hparams_encodec.yaml

# ################################
# Model: Classification with EnCodec tokens + Transformer encoder
# (header adapted from the original xvector recipe)
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/encodec_transformer_3_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

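# Hub ID of the pretrained model; you can change it to benchmark different models.
# NOTE: the remark below was written for wav2vec2 and may not apply to the
# EnCodec model configured here:
# Important: we use wav2vec2 base and not the one fine-tuned on the ASR task.
# This allows you to get a ~4% improvement.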
sslmodel_hub: facebook/encodec_24khz
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 20
batch_size: 32
lr: 0.0001 # 0.0001
lr_ssl: 0.00001 # 0.00001
linear_dim: 1200 #3750
num_layers: 1
nhead: 2
d_ffn: 256

sample_rate: 24000
shuffle: True

#freeze all ssl
freeze: True # False

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.encodec.Encodec
    source: !ref <sslmodel_hub>
    bandwidth:  12.0
    sample_rate: !ref <sample_rate>
    flat_embeddings: False
    freeze: !ref <freeze>
    renorm_embeddings: True
    save_path: !ref <sslmodel_folder>

encoder: !new:speechbrain.lobes.models.transformer.Transformer.TransformerEncoder
         num_layers: !ref <num_layers>
         input_shape: (32, 75, 16)
         nhead: !ref <nhead>
         d_ffn: !ref <d_ffn>
        #  ffn_type: 'regularFFN'
        #  ffn_cnn_kernel_size_list: [3, 3]
         d_model: 16
         activation: !name:torch.nn.modules.activation.ReLU
        #  kdim: None
        #  vdim: None
         dropout: 0.4
         normalize_before: True


linear: !new:speechbrain.nnet.linear.Linear
        input_size: !ref <linear_dim>
        n_neurons: !ref <out_n_neurons>

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    encoder: !ref <encoder>
    linear: !ref <linear>
    log_softmax: !ref <log_softmax>

# Every trainable module must be listed here: the training script builds the
# model optimizer from `model.parameters()` and the checkpointer saves `model`,
# so a module missing from this list is neither updated nor checkpointed.
# `encoder` (the Transformer) was previously missing, leaving only the final
# linear layer to be trained.
# NOTE: checkpoints saved with the old single-module list will not match this one.
model: !new:torch.nn.ModuleList
    - [!ref <encoder>, !ref <linear>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Overwriting hparams_encodec.yaml


### train

In [None]:
%%file train_encodec.py

#!/usr/bin/python3
"""Recipe for training a classifier using the
Google Speech Commands v0.02 Dataset.

To run this recipe, use the following command:
> python train.py {hyperparameter_file}

Using your own hyperparameter file or one of the following:
    hyperparams/xvect.yaml (xvector system)

Author
    * Mirco Ravanelli 2020
    * Hwidong Na 2020
    * Nauman Dawalatabad 2020
    * Sarthak Yadav 2022
    Script adapted by David Raby-Pepin 2021
"""
import os
import sys
import torch
import torchaudio
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml

import speechbrain.nnet.CNN
from speechbrain.utils.distributed import run_on_main


class SpeakerBrain(sb.core.Brain):
    """Brain for GSC classification with EnCodec tokens and a Transformer."""

    def compute_forward(self, batch, stage):
        """Turn a batch of waveforms into log-softmax outputs over commands.

        Pipeline: EnCodec tokenization, cast to float, normalization,
        Transformer encoder, flattening, linear classifier. Waveform
        augmentation is currently disabled in this recipe, so ``stage`` is
        not used here.

        Returns
        -------
        tuple
            ``(outputs, lens)`` where ``lens`` are the lengths that come with
            ``batch.sig``.
        """
        batch = batch.to(self.device)
        wavs, lens = batch.sig

        # The second value returned by ``encode`` is not needed.
        tokens, _ = self.modules.ssl_model.encode(wavs, lens)
        feats = self.hparams.mean_var_norm(tokens.float(), lens)

        # The encoder returns a pair; only its first element is kept.
        encoded, _ = self.modules.encoder(feats)
        # Collapse every axis after the batch axis before the classifier.
        flat = encoded.reshape(encoded.shape[0], -1)
        logits = self.modules.linear(flat)

        return self.hparams.log_softmax(logits), lens

    def compute_objectives(self, predictions, batch, stage):
        """Compute the loss, using the encoded command id as the target.

        Outside of the training stage the predictions are also appended to
        the error-rate metric.
        """
        log_probs, _ = predictions
        utt_ids = batch.id
        targets, _ = batch.command_encoded
        targets = targets.squeeze(1)

        loss = self.hparams.compute_cost(log_probs, targets)

        # Some schedulers expose a per-batch hook; call it when it exists.
        scheduler = self.hparams.lr_annealing
        if hasattr(scheduler, "on_batch_end"):
            scheduler.on_batch_end(self.optimizer)

        if stage != sb.Stage.TRAIN:
            self.error_metrics.append(utt_ids, log_probs, targets)

        return loss

    def on_stage_start(self, stage, epoch=None):
        """Create a fresh error-rate tracker for non-training stages."""
        if stage != sb.Stage.TRAIN:
            self.error_metrics = self.hparams.error_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """Gather the stage statistics, then anneal, log and checkpoint."""
        stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            self.train_stats = stats
        else:
            stats["ErrorRate"] = self.error_metrics.summarize("average")

        if stage == sb.Stage.VALID:
            error_rate = stats["ErrorRate"]

            # Both schedulers are driven by the validation error rate.
            old_lr, new_lr = self.hparams.lr_annealing(error_rate)
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)

            old_lr_ssl, new_lr_ssl = self.hparams.lr_annealing_ssl(error_rate)
            sb.nnet.schedulers.update_learning_rate(
                self.ssl_optimizer, new_lr_ssl
            )

            self.hparams.train_logger.log_stats(
                stats_meta={
                    "epoch": epoch,
                    "lr": old_lr,
                    "ssl_lr": old_lr_ssl,
                },
                train_stats=self.train_stats,
                valid_stats=stats,
            )
            # Checkpoints are ranked by ErrorRate (passed as a min key).
            self.checkpointer.save_and_keep_only(
                meta={"ErrorRate": error_rate},
                min_keys=["ErrorRate"],
            )

        # Test statistics are written through the same train logger.
        if stage == sb.Stage.TEST:
            self.hparams.train_logger.log_stats(
                {"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stats,
            )

    def init_optimizers(self):
        """Create the SSL-model optimizer and the model optimizer.

        The model optimizer only receives the parameters reachable from
        ``hparams.model``. Both optimizers are registered with the
        checkpointer when one is available.
        """
        ssl_params = self.modules.ssl_model.parameters()
        self.ssl_optimizer = self.hparams.ssl_opt_class(ssl_params)

        model_params = self.hparams.model.parameters()
        self.optimizer = self.hparams.opt_class(model_params)

        if self.checkpointer is not None:
            self.checkpointer.add_recoverable("ssl_opt", self.ssl_optimizer)
            self.checkpointer.add_recoverable("optimizer", self.optimizer)

        self.optimizers_dict = {
            "model_optimizer": self.optimizer,
            "ssl_optimizer": self.ssl_optimizer,
        }


def dataio_prep(hparams):
    """Build the train/valid/test datasets and attach their pipelines.

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. The keys read here are ``data_folder``,
        ``train_annotation``, ``valid_annotation``, ``test_annotation`` and
        ``save_folder``.

    Returns
    -------
    tuple
        ``(train_data, valid_data, test_data, label_encoder)``.
    """
    data_folder = hparams["data_folder"]
    from_csv = sb.dataio.dataset.DynamicItemDataset.from_csv

    # 1. One dataset per split, each built from its CSV annotation file.
    train_data, valid_data, test_data = [
        from_csv(
            csv_path=hparams[split + "_annotation"],
            replacements={"data_root": data_folder},
        )
        for split in ("train", "valid", "test")
    ]
    datasets = [train_data, valid_data, test_data]
    label_encoder = sb.dataio.encoder.CategoricalEncoder()

    # 2. Audio pipeline: load the [start, stop) segment and resample it.
    @sb.utils.data_pipeline.takes("wav", "start", "stop", "duration")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav, start, stop, duration):
        # `duration` is received from the CSV but not used here.
        first_frame = int(start)
        n_frames = int(stop) - first_frame
        waveform, native_sr = torchaudio.load(
            wav, num_frames=n_frames, frame_offset=first_frame
        )
        resampler = torchaudio.transforms.Resample(native_sr, 24000)
        waveform = resampler(waveform)
        # Swap the two axes returned by torchaudio.load, then drop axis 1
        # when it has size one.
        return waveform.transpose(0, 1).squeeze(1)

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Label pipeline: expose the raw command and its encoded form.
    @sb.utils.data_pipeline.takes("command")
    @sb.utils.data_pipeline.provides("command", "command_encoded")
    def label_pipeline(command):
        yield command
        yield label_encoder.encode_sequence_torch([command])

    sb.dataio.dataset.add_dynamic_item(datasets, label_pipeline)

    # 4. Load the label encoder from disk, or create it from the training
    #    set's "command" output if it is not there yet.
    encoder_path = os.path.join(hparams["save_folder"], "label_encoder.txt")
    label_encoder.load_or_create(
        path=encoder_path,
        from_didatasets=[train_data],
        output_key="command",
    )

    # 5. Keys every dataset item will expose to the dataloader.
    output_keys = ["id", "sig", "command_encoded"]
    sb.dataio.dataset.set_output_keys(datasets, output_keys)

    return train_data, valid_data, test_data, label_encoder


if __name__ == "__main__":
    # Turn on the cuDNN auto-tuner.
    torch.backends.cudnn.benchmark = True

    # Command line: hyperparameter file, run options and overrides.
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    # Distributed group initialisation (only relevant for multi-GPU DDP).
    sb.utils.distributed.ddp_init_group(run_opts)

    # Read the YAML hyperparameters, applying the command-line overrides.
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Experiment folder, with a copy of the hyperparameters used.
    sb.core.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # GSC preparation routine (parses the dataset into CSV annotations).
    from prepare_GSC import prepare_GSC

    # The ten core commands form the V2-12 task; the V2-35 task adds the
    # remaining auxiliary words.
    core_commands = [
        "yes", "no", "up", "down", "left",
        "right", "on", "off", "stop", "go",
    ]
    auxiliary_words = [
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
        "bed", "bird", "cat", "dog", "happy",
        "house", "marvin", "sheila", "tree", "wow",
        "backward", "forward", "follow", "learn", "visual",
    ]
    n_commands = hparams["number_of_commands"]
    if n_commands == 12:
        words_wanted = list(core_commands)
    elif n_commands == 35:
        words_wanted = core_commands + auxiliary_words
    else:
        raise ValueError("number_of_commands must be 12 or 35")

    # Arguments for prepare_GSC; note that the CSVs go to `output_folder`.
    prep_kwargs = {
        "data_folder": hparams["data_folder"],
        "save_folder": hparams["output_folder"],
    }
    for key in (
        "validation_percentage",
        "testing_percentage",
        "percentage_unknown",
        "percentage_silence",
    ):
        prep_kwargs[key] = hparams[key]
    prep_kwargs["words_wanted"] = words_wanted
    prep_kwargs["skip_prep"] = hparams["skip_prep"]
    run_on_main(prepare_GSC, kwargs=prep_kwargs)

    # Noise and RIR data preparation callables defined in the YAML file.
    for prep_key in ("prepare_noise_data", "prepare_rir_data"):
        sb.utils.distributed.run_on_main(hparams[prep_key])

    # Datasets plus the label encoder for the command classes.
    train_data, valid_data, test_data, label_encoder = dataio_prep(hparams)

    # Brain object driving training and evaluation.
    speaker_brain = SpeakerBrain(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    # Training: the same dataloader options serve train and validation.
    loader_kwargs = hparams["dataloader_options"]
    speaker_brain.fit(
        speaker_brain.hparams.epoch_counter,
        train_data,
        valid_data,
        train_loader_kwargs=loader_kwargs,
        valid_loader_kwargs=loader_kwargs,
    )

    # Evaluation on the test set, selecting the checkpoint by lowest
    # ErrorRate.
    test_stats = speaker_brain.evaluate(
        test_set=test_data,
        min_key="ErrorRate",
        test_loader_kwargs=loader_kwargs,
    )

Overwriting train_encodec.py


### Run

In [None]:
import warnings
warnings.filterwarnings('ignore')

!rm -rf /content/results/encodec_transformer_v12/1986/

!python train_encodec.py hparams_encodec.yaml --data_folder=/path/to/GSC

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
speechbrain.lobes.models.huggingface_transformers.huggingface - EncodecModel is frozen.
huggingface_Encodec - Encodec is frozen.
  warn(f"Failed to load image Python extension: {e}")
speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: results/encodec_transformer_3_v12/1986
prepare_GSC - Extracting speech_commands_v0.02.tar.gz...
/path/to/GSC/noise/data.zip exists. Skipping download
/path/to/GSC/rir/data.zip exists. Skipping download
speechbrain.dataio.encoder - Load called, but CategoricalEncoder is not empty. Loaded data will overwrite everything. This is normal if there is e.g. an unk label defined at init.
speechbrain.core - Info: ckpt_interval_minutes arg from hparam file is used
speechbrain.core - Gradscaler enabled: False. Using precision: fp32.
speechbrain.core - SpeakerBrain Model Statistics:
* Total Number of Trainable Parameters: 24.1k
* 

#### Visualization

**As mentioned earlier, the model heavily overfits the data. This is most probably because the Transformer model is too large for a small and simple dataset like the Google Speech Commands dataset. The error rate also does not drop much, plateauing at about 86%. The final test error rate is 85%.**

![image.png](attachment:6039770e-2fd9-42ea-a9f0-59955caaf635.png)

## LSTM VS GRU VS XVECTOR VS TRANSFORMER

**The graph below depicts how the loss decreases/increases throughout the epochs. Clearly, LSTM and GRU perform much better than Xvector and Transformer. They all saturate near the 20-epoch mark.**

![image.png](attachment:19b45f77-112f-4b41-981c-d8d8313b0583.png)

**Likewise, the graph below depicts the error rates for all the models. Following the same trend, LSTM leads the race, reaching error rates of 55%.**

![image.png](attachment:aa8c1f5f-7c9b-48be-b507-f4b24211e4fa.png)

## hparams - CRDNN

**I tried to apply CRDNN to the Encodec features, but I could not produce the desired results while training; I am making some elementary errors. Below is my attempt at CRDNN.**

In [None]:
%%file hparams_encodec.yaml

# ################################
# Model: Classification with xvector
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/encodec_la_crdnn_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# URL for the ssl model, you can change to benchmark different models
# Important: we use wav2vec2 base and not the fine-tuned one with ASR task
# This allows you to have ~4% improvement
sslmodel_hub: facebook/encodec_24khz
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 5
batch_size: 32
lr: 0.0001 # 0.0001
lr_ssl: 0.00001 # 0.00001
linear_dim: 1024 #3750
# rnn_class: <class 'speechbrain.nnet.RNN.LiGRU'>

sample_rate: 24000
shuffle: True

#freeze all ssl
freeze: True # False

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
ssl_model: !new:speechbrain.lobes.models.huggingface_transformers.encodec.Encodec
    source: !ref <sslmodel_hub>
    bandwidth:  12.0
    sample_rate: !ref <sample_rate>
    flat_embeddings: False
    freeze: !ref <freeze>
    # renorm_embeddings: True
    save_path: !ref <sslmodel_folder>

encoder: !new:speechbrain.lobes.models.CRDNN.CRDNN
         input_size: 16
         activation: !name:torch.nn.modules.activation.LeakyReLU
         rnn_class: !name:speechbrain.nnet.RNN.LSTM
         rnn_neurons: 128
         rnn_layers: 2
         dropout: 0.5
         rnn_bidirectional: False
         input_shape: (32, 75, 16)
         cnn_blocks: 2
         cnn_channels: [128, 256]
         cnn_kernelsize: (3, 3)
         time_pooling: False
         time_pooling_size: 2
         freq_pooling_size: 2
         inter_layer_pooling_size: [2, 2]
         using_2d_pooling: False
         rnn_re_init: False
         dnn_blocks: 2
         dnn_neurons: 512
         projection_dim: -1
         use_rnnp: False

linear_1: !new:speechbrain.nnet.linear.Linear
        input_size: !ref <linear_dim>
        n_neurons: 256

linear_2: !new:speechbrain.nnet.linear.Linear
        input_size: 256
        n_neurons: !ref <out_n_neurons>

avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: True

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    encoder: !ref <encoder>
    linear_1: !ref <linear_1>
    linear_2: !ref <linear_2>
    log_softmax: !ref <log_softmax>

# Trainable (non-SSL) modules. The training script builds its main optimizer
# from `model.parameters()` and the checkpointer saves `model`, so every
# trainable module used in the forward pass has to be listed here; with only
# `linear_1` listed, `encoder` and `linear_2` were never updated or saved.
# NOTE: checkpoints written with the previous one-module list are not
# compatible; start from a clean output folder.
model: !new:torch.nn.ModuleList
    - [!ref <encoder>, !ref <linear_1>, !ref <linear_2>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Overwriting hparams_encodec.yaml


### train

In [None]:
%%file train_encodec.py

#!/usr/bin/python3
"""Recipe for training a classifier using the
Google Speech Commands v0.02 Dataset.

To run this recipe, use the following command:
> python train.py {hyperparameter_file}

Using your own hyperparameter file or one of the following:
    hyperparams/xvect.yaml (xvector system)

Author
    * Mirco Ravanelli 2020
    * Hwidong Na 2020
    * Nauman Dawalatabad 2020
    * Sarthak Yadav 2022
    Script adapted by David Raby-Pepin 2021
"""
import os
import sys
import torch
import torchaudio
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml

import speechbrain.nnet.CNN
from speechbrain.utils.distributed import run_on_main


class SpeakerBrain(sb.core.Brain):
    """Brain subclass for command classification on Google Speech Commands.

    The forward pass turns waveforms into Encodec tokens, normalizes them,
    runs them through the configured encoder, pools the result and applies
    two linear layers followed by a log-softmax.
    """

    def compute_forward(self, batch, stage):
        """Run the Encodec -> encoder -> pooling -> classifier pipeline.

        Waveform augmentation (``wav_augment``) is currently not applied.

        Returns
        -------
        tuple
            ``(log_probabilities, lens)`` where ``lens`` are the lengths
            that came with ``batch.sig``.
        """
        batch = batch.to(self.device)
        wavs, lens = batch.sig

        # Encodec yields discrete tokens and continuous embeddings; only the
        # tokens (cast to float) are used as features here.
        tokens, _ = self.modules.ssl_model.encode(wavs, lens)
        feats = self.hparams.mean_var_norm(tokens.float(), lens)
        feats = self.modules.encoder(feats)

        # Pool with the configured pooling layer, then flatten everything
        # except the batch axis before the classifier.
        pooled = self.hparams.avg_pool(feats, lens)
        flat = pooled.reshape(pooled.shape[0], -1)

        logits = self.modules.linear_2(self.modules.linear_1(flat))
        return self.hparams.log_softmax(logits), lens

    def compute_objectives(self, predictions, batch, stage):
        """Compute the loss between predictions and encoded commands.

        Outside of training, the predictions are also appended to the
        error-rate metric.
        """
        log_probs, _ = predictions
        utt_ids = batch.id
        targets, _ = batch.command_encoded

        # Drop the singleton axis 1 of the encoded labels.
        targets = targets.squeeze(1)
        loss = self.hparams.compute_cost(log_probs, targets)

        # Schedulers exposing a per-batch hook are stepped here.
        scheduler = self.hparams.lr_annealing
        if hasattr(scheduler, "on_batch_end"):
            scheduler.on_batch_end(self.optimizer)

        if stage != sb.Stage.TRAIN:
            self.error_metrics.append(utt_ids, log_probs, targets)

        return loss

    def on_stage_start(self, stage, epoch=None):
        """Create a fresh error metric for validation and test stages."""
        if stage == sb.Stage.TRAIN:
            return
        self.error_metrics = self.hparams.error_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """Summarize the stage; anneal, log and checkpoint as appropriate.

        TRAIN: the loss is stored for later logging.
        VALID: both learning rates are annealed on the error rate, stats
        are logged and a checkpoint is saved, keeping the lowest ErrorRate.
        TEST: test statistics are logged.
        """
        stage_stats = {"loss": stage_loss}

        if stage == sb.Stage.TRAIN:
            self.train_stats = stage_stats
            return

        error_rate = self.error_metrics.summarize("average")
        stage_stats["ErrorRate"] = error_rate

        if stage == sb.Stage.VALID:
            update_lr = sb.nnet.schedulers.update_learning_rate

            # Classifier optimizer first, then the SSL optimizer.
            old_lr, new_lr = self.hparams.lr_annealing(error_rate)
            update_lr(self.optimizer, new_lr)

            old_lr_ssl, new_lr_ssl = self.hparams.lr_annealing_ssl(error_rate)
            update_lr(self.ssl_optimizer, new_lr_ssl)

            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch, "lr": old_lr, "ssl_lr": old_lr_ssl},
                train_stats=self.train_stats,
                valid_stats=stage_stats,
            )
            self.checkpointer.save_and_keep_only(
                meta={"ErrorRate": error_rate},
                min_keys=["ErrorRate"],
            )
        elif stage == sb.Stage.TEST:
            # Test statistics go through the same train logger.
            self.hparams.train_logger.log_stats(
                {"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=stage_stats,
            )

    def init_optimizers(self):
        """Build the SSL-model optimizer and the classifier optimizer.

        ``ssl_opt_class`` is instantiated over ``modules.ssl_model`` and
        ``opt_class`` over ``hparams.model``. Both are registered with the
        checkpointer (when one is attached) and exposed through
        ``self.optimizers_dict``.
        """
        hp = self.hparams
        self.ssl_optimizer = hp.ssl_opt_class(self.modules.ssl_model.parameters())
        self.optimizer = hp.opt_class(hp.model.parameters())

        ckpt = self.checkpointer
        if ckpt is not None:
            recoverables = (
                ("ssl_opt", self.ssl_optimizer),
                ("optimizer", self.optimizer),
            )
            for name, opt in recoverables:
                ckpt.add_recoverable(name, opt)

        self.optimizers_dict = dict(
            model_optimizer=self.optimizer,
            ssl_optimizer=self.ssl_optimizer,
        )


def dataio_prep(hparams):
    """Create the datasets and their data processing pipelines.

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. The keys read here are ``data_folder``,
        ``train_annotation``, ``valid_annotation``, ``test_annotation`` and
        ``save_folder``.

    Returns
    -------
    tuple
        ``(train_data, valid_data, test_data, label_encoder)``.
    """

    data_folder = hparams["data_folder"]

    # 1. Declarations:
    train_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["train_annotation"],
        replacements={"data_root": data_folder},
    )

    valid_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["valid_annotation"],
        replacements={"data_root": data_folder},
    )

    test_data = sb.dataio.dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["test_annotation"],
        replacements={"data_root": data_folder},
    )

    datasets = [train_data, valid_data, test_data]
    label_encoder = sb.dataio.encoder.CategoricalEncoder()

    # 2. Define audio pipeline:
    @sb.utils.data_pipeline.takes("wav", "start", "stop", "duration")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav, start, stop, duration, target_sr=24000):
        # `duration` is received from the CSV but not used here.
        start = int(start)
        stop = int(stop)
        num_frames = stop - start
        sig, fs = torchaudio.load(
            wav, num_frames=num_frames, frame_offset=start
        )
        # The Encodec model is configured for `target_sr`; feeding audio at
        # its native rate (resampling was previously disabled and
        # `target_sr` ignored) mismatches the codec's expected input.
        if fs != target_sr:
            sig = torchaudio.transforms.Resample(fs, target_sr)(sig)
        sig = sig.transpose(0, 1).squeeze(1)

        return sig

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Define label pipeline:
    @sb.utils.data_pipeline.takes("command")
    @sb.utils.data_pipeline.provides("command", "command_encoded")
    def label_pipeline(command):
        yield command
        command_encoded = label_encoder.encode_sequence_torch([command])
        yield command_encoded

    sb.dataio.dataset.add_dynamic_item(datasets, label_pipeline)

    # 4. Fit encoder:
    # Load the label encoder from disk, or create it from the training set.
    lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt")
    label_encoder.load_or_create(
        path=lab_enc_file,
        from_didatasets=[train_data],
        output_key="command",
    )

    # 5. Set output:
    sb.dataio.dataset.set_output_keys(
        datasets, ["id", "sig", "command_encoded"]
    )

    return train_data, valid_data, test_data, label_encoder


if __name__ == "__main__":
    # Turn on the cuDNN auto-tuner.
    torch.backends.cudnn.benchmark = True

    # Command line: hyperparameter file, run options and overrides.
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    # Distributed group initialisation (only relevant for multi-GPU DDP).
    sb.utils.distributed.ddp_init_group(run_opts)

    # Read the YAML hyperparameters, applying the command-line overrides.
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Experiment folder, with a copy of the hyperparameters used.
    sb.core.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # GSC preparation routine (parses the dataset into CSV annotations).
    from prepare_GSC import prepare_GSC

    # The ten core commands form the V2-12 task; the V2-35 task adds the
    # remaining auxiliary words.
    core_commands = [
        "yes", "no", "up", "down", "left",
        "right", "on", "off", "stop", "go",
    ]
    auxiliary_words = [
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
        "bed", "bird", "cat", "dog", "happy",
        "house", "marvin", "sheila", "tree", "wow",
        "backward", "forward", "follow", "learn", "visual",
    ]
    n_commands = hparams["number_of_commands"]
    if n_commands == 12:
        words_wanted = list(core_commands)
    elif n_commands == 35:
        words_wanted = core_commands + auxiliary_words
    else:
        raise ValueError("number_of_commands must be 12 or 35")

    # Arguments for prepare_GSC; note that the CSVs go to `output_folder`.
    prep_kwargs = {
        "data_folder": hparams["data_folder"],
        "save_folder": hparams["output_folder"],
    }
    for key in (
        "validation_percentage",
        "testing_percentage",
        "percentage_unknown",
        "percentage_silence",
    ):
        prep_kwargs[key] = hparams[key]
    prep_kwargs["words_wanted"] = words_wanted
    prep_kwargs["skip_prep"] = hparams["skip_prep"]
    run_on_main(prepare_GSC, kwargs=prep_kwargs)

    # Noise and RIR data preparation callables defined in the YAML file.
    for prep_key in ("prepare_noise_data", "prepare_rir_data"):
        sb.utils.distributed.run_on_main(hparams[prep_key])

    # Datasets plus the label encoder for the command classes.
    train_data, valid_data, test_data, label_encoder = dataio_prep(hparams)

    # Brain object driving training and evaluation.
    speaker_brain = SpeakerBrain(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    # Training: the same dataloader options serve train and validation.
    loader_kwargs = hparams["dataloader_options"]
    speaker_brain.fit(
        speaker_brain.hparams.epoch_counter,
        train_data,
        valid_data,
        train_loader_kwargs=loader_kwargs,
        valid_loader_kwargs=loader_kwargs,
    )

    # Evaluation on the test set, selecting the checkpoint by lowest
    # ErrorRate.
    test_stats = speaker_brain.evaluate(
        test_set=test_data,
        min_key="ErrorRate",
        test_loader_kwargs=loader_kwargs,
    )

Overwriting train_encodec.py


### Run

In [None]:
# import warnings
# warnings.filterwarnings('ignore')

# !rm -rf /content/results/encodec_crdnn_v12/1986/

!python train_encodec.py hparams_encodec.yaml --data_folder=/path/to/GSC

config.json: 100%|██████████████████████████████| 809/809 [00:00<00:00, 120kB/s]
model.safetensors: 100%|████████████████████| 93.1M/93.1M [00:00<00:00, 342MB/s]
  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
speechbrain.lobes.models.huggingface_transformers.huggingface - EncodecModel is frozen.
huggingface_Encodec - Encodec is frozen.
  warn(f"Failed to load image Python extension: {e}")
speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: results/encodec_la_crdnn_v12/1986
prepare_GSC - Extracting speech_commands_v0.02.tar.gz...
/path/to/GSC/noise/data.zip exists. Skipping download
/path/to/GSC/rir/data.zip exists. Skipping download
speechbrain.dataio.encoder - Load called, but CategoricalEncoder is not empty. Loaded data will overwrite everything. This is normal if there is e.g. an unk label defined at init.
speechbrain.core - Info: ckpt_interval_minutes arg from hparam file is used
speechbrai

# DAC

### Fine-tuning and freezing the supervised feature weights yield the same performance

## hparams - not loading the pre-trained weights

In [None]:
%%file hparams_dac.yaml

# ################################
# Model: Classification with xvector
# Authors: Hwidong Na & Mirco Ravanelli
#          Script adapted by David Raby-Pepin 2021
# ################################

# Basic parameters
seed: 1986
__set_seed: !apply:torch.manual_seed [!ref <seed>]

# Use 12 for V2 12 task and 35 for V2 35 task
number_of_commands: 12
output_folder: !ref results/dac_v<number_of_commands>/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data for augmentation
NOISE_DATASET_URL: https://www.dropbox.com/scl/fi/a09pj97s5ifan81dqhi4n/noises.zip?rlkey=j8b0n9kdjdr32o1f06t0cw5b7&dl=1
RIR_DATASET_URL: https://www.dropbox.com/scl/fi/linhy77c36mu10965a836/RIRs.zip?rlkey=pg9cu8vrpn2u173vhiqyu743u&dl=1

# Data files
data_folder: !PLACEHOLDER  # e.g. /path/to/GSC
data_folder_noise: !ref <data_folder>/noise # The noisy sequences for data augmentation will automatically be downloaded here.
data_folder_rir: !ref <data_folder>/rir # The impulse responses used for data augmentation will automatically be downloaded here.
train_annotation: !ref <output_folder>/train.csv
valid_annotation: !ref <output_folder>/valid.csv
test_annotation: !ref <output_folder>/test.csv
noise_annotation: !ref <save_folder>/noise.csv
rir_annotation: !ref <save_folder>/rir.csv

# URL for the ssl model, you can change to benchmark different models
# Important: we use wav2vec2 base and not the fine-tuned one with ASR task
# This allows you to have ~4% improvement
sslmodel_folder: !ref <save_folder>/ssl_checkpoint

# Percentage of files used for validation and test
validation_percentage: 10
testing_percentage: 10

# Percentage of unknown and silence examples
# (relative to total of known word samples) to include
percentage_unknown: 10 # Set this to 0 for the V2 35 task
percentage_silence: 10 # Set this to 0 for the V2 35 task

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 5
batch_size: 32
lr: 0.0001
lr_ssl: 0.00001
encoder_dim: 31
load_pretrained: False

sample_rate: 16000
shuffle: True

# Number of classes (i.e. different commands)
out_n_neurons: !ref <number_of_commands>  #includes core commands & auxiliary words

num_workers: 2
dataloader_options:
    batch_size: !ref <batch_size>
    shuffle: !ref <shuffle>
    num_workers: !ref <num_workers>

# Functions
# Discrete audio codec (DAC) instantiated as the feature extractor; it is
# exposed to the training script through `modules.ssl_model`.
# NOTE(review): `load_pretrained` is set to False above, so the codec is
# presumably used with freshly initialised weights -- confirm this is intended.
ssl_model: !new:speechbrain.lobes.models.discrete.dac.DAC
    load_pretrained: !ref <load_pretrained>
    # NOTE(review): a plain `None` scalar is loaded by YAML as the string
    # "None", not as a null value (`null` / `~`) -- confirm which one DAC expects.
    sample_rate: None
    model_type: "16KHz"
    quantizer_dropout: False
    # save_path: !ref <sslmodel_folder>

avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
    return_std: False

output_mlp: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <encoder_dim>
    n_neurons: !ref <out_n_neurons>
    bias: False

epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs>


# Download and prepare the dataset of noisy sequences for augmentation
prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <NOISE_DATASET_URL>
    dest_folder: !ref <data_folder_noise>
    ext: wav
    csv_file: !ref <noise_annotation>

# Add noise to input signal
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>


# Download and prepare the dataset of room impulse responses for augmentation
prepare_rir_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL
    URL: !ref <RIR_DATASET_URL>
    dest_folder: !ref <data_folder_rir>
    ext: wav
    csv_file: !ref <rir_annotation>

# Add reverberation to input signal
add_reverb: !new:speechbrain.augment.time_domain.AddReverb
    csv_file: !ref <rir_annotation>
    reverb_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Min frequency band dropout probability
drop_freq_high: 1  # Max frequency band dropout probability
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of frequency bands to drop

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Time drop: randomly drops a number of temporal chunks.
drop_chunk_count_low: 1  # Min number of audio chunks to drop
drop_chunk_count_high: 5  # Max number of audio chunks to drop
drop_chunk_length_low: 1000  # Min length of audio chunks to drop
drop_chunk_length_high: 2000  # Max length of audio chunks to drop

drop_chunk: !new:speechbrain.augment.time_domain.DropChunk
    drop_length_low: !ref <drop_chunk_length_low>
    drop_length_high: !ref <drop_chunk_length_high>
    drop_count_low: !ref <drop_chunk_count_low>
    drop_count_high: !ref <drop_chunk_count_high>

# Augmenter: Combines previously defined augmentations to perform data augmentation
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: True
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 4
    max_augmentations: 4
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <add_reverb>,
        !ref <drop_freq>,
        !ref <drop_chunk>]

mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

modules:
    ssl_model: !ref <ssl_model>
    output_mlp: !ref <output_mlp>

model: !new:torch.nn.ModuleList
    - [!ref <output_mlp>]

# Cost + optimization
compute_cost: !name:speechbrain.nnet.losses.nll_loss
# compute_error: !name:speechbrain.nnet.losses.classification_error

opt_class: !name:torch.optim.Adam
    lr: !ref <lr>
    weight_decay: 0.000002

ssl_opt_class: !name:torch.optim.Adam
    lr: !ref <lr_ssl>

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr>
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0

lr_annealing_ssl: !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: !ref <lr_ssl>
    improvement_threshold: 0.0025
    annealing_factor: 0.9

# Logging + checkpoints
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        ssl_model: !ref <ssl_model>
        lr_annealing: !ref <lr_annealing>
        lr_annealing_ssl: !ref <lr_annealing_ssl>
        counter: !ref <epoch_counter>

Overwriting hparams_dac.yaml


### train

In [None]:
%%file train_dac.py

#!/usr/bin/python3
"""Recipe for training a classifier using the
Google Speech Commands v0.02 Dataset.

To run this recipe, use the following command:
> python train_dac.py {hyperparameter_file}

Using your own hyperparameter file or the following:
    hparams_dac.yaml (DAC discrete-feature system)

Author
    * Mirco Ravanelli 2020
    * Hwidong Na 2020
    * Nauman Dawalatabad 2020
    * Sarthak Yadav 2022
    Script adapted by David Raby-Pepin 2021
"""
import os
import sys
import torch
import torchaudio
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml

import speechbrain.nnet.CNN
from speechbrain.utils.distributed import run_on_main


class SpeakerBrain(sb.core.Brain):
    """Brain subclass that trains a command classifier on GSC audio.

    The forward pass encodes the waveform with ``modules.ssl_model``, pools
    the selected encoder output with ``hparams.avg_pool`` and classifies it
    with ``modules.output_mlp``. Two optimizers are maintained: one for
    ``hparams.model`` and one for ``modules.ssl_model``.
    """

    def compute_forward(self, batch, stage):
        """Run encoder, pooling and the linear classifier on one batch.

        Waveform augmentation (``hparams.wav_augment``) is not applied here.

        Arguments
        ---------
        batch : object
            Batch exposing ``sig`` as a ``(waveforms, lengths)`` pair.
        stage : sb.Stage
            Current stage; not used by this method.

        Returns
        -------
        tuple
            ``(log_probs, lengths)`` where ``log_probs`` is the output of
            ``hparams.log_softmax`` and ``lengths`` comes from ``batch.sig``.
        """
        batch = batch.to(self.device)
        signal, rel_lengths = batch.sig

        # The encoder is fed the waveform with an extra dimension at index 1.
        encoded = self.modules.ssl_model.encode(signal.unsqueeze(1))
        # Only the second element of the encoder output is used, cast to float.
        # (presumably the discrete code indices -- TODO confirm)
        code_values = encoded[1].float()

        # NOTE(review): SpeechBrain documents DAC.encode as returning codes
        # shaped (batch, n_codebooks, time) and StatisticsPooling as reducing
        # dim 1. If so, this averages over codebooks rather than over time --
        # confirm that this is the intended feature.
        pooled = self.hparams.avg_pool(code_values, rel_lengths)
        flattened = pooled.view(pooled.shape[0], -1)

        logits = self.modules.output_mlp(flattened)
        log_probs = self.hparams.log_softmax(logits)
        return log_probs, rel_lengths

    def compute_objectives(self, predictions, batch, stage):
        """Compute the loss against the encoded command labels.

        Arguments
        ---------
        predictions : tuple
            The ``(log_probs, lengths)`` pair returned by ``compute_forward``.
        batch : object
            Batch exposing ``id`` and ``command_encoded``.
        stage : sb.Stage
            Current stage; error metrics are collected outside of TRAIN.

        Returns
        -------
        The value returned by ``hparams.compute_cost``.
        """
        log_probs, _ = predictions
        utterance_ids = batch.id
        targets, _ = batch.command_encoded

        # Labels are not replicated because no waveform augmentation is used.
        targets = targets.squeeze(1)
        loss = self.hparams.compute_cost(log_probs, targets)

        # Schedulers that expose a per-batch hook get notified here.
        scheduler = self.hparams.lr_annealing
        if hasattr(scheduler, "on_batch_end"):
            scheduler.on_batch_end(self.optimizer)

        if stage != sb.Stage.TRAIN:
            self.error_metrics.append(utterance_ids, log_probs, targets)

        return loss

    def on_stage_start(self, stage, epoch=None):
        """Create a fresh error-metric tracker for non-training stages."""
        if stage == sb.Stage.TRAIN:
            return
        self.error_metrics = self.hparams.error_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """Summarise the stage; anneal, log and checkpoint where applicable.

        Arguments
        ---------
        stage : sb.Stage
            Stage that just finished.
        stage_loss : float
            Loss reported for the stage.
        epoch : int
            Epoch number, used only for logging.
        """
        summary = {"loss": stage_loss}

        # Training stats are only remembered; they are logged after validation.
        if stage == sb.Stage.TRAIN:
            self.train_stats = summary
            return

        summary["ErrorRate"] = self.error_metrics.summarize("average")
        error_rate = summary["ErrorRate"]

        if stage == sb.Stage.VALID:
            # Both learning rates are annealed on the validation error rate.
            old_lr, new_lr = self.hparams.lr_annealing(error_rate)
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)

            old_lr_ssl, new_lr_ssl = self.hparams.lr_annealing_ssl(error_rate)
            sb.nnet.schedulers.update_learning_rate(
                self.ssl_optimizer, new_lr_ssl
            )

            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch, "lr": old_lr, "ssl_lr": old_lr_ssl},
                train_stats=self.train_stats,
                valid_stats=summary,
            )
            # Checkpoints are ranked by the lowest ErrorRate.
            self.checkpointer.save_and_keep_only(
                meta={"ErrorRate": error_rate},
                min_keys=["ErrorRate"],
            )
        elif stage == sb.Stage.TEST:
            # Test statistics are also sent to the train logger.
            self.hparams.train_logger.log_stats(
                {"Epoch loaded": self.hparams.epoch_counter.current},
                test_stats=summary,
            )

    def init_optimizers(self):
        """Build the ssl and model optimizers and register them.

        Both optimizers are added to the checkpointer (when one is set) and
        stored in ``self.optimizers_dict``.
        """
        self.ssl_optimizer = self.hparams.ssl_opt_class(
            self.modules.ssl_model.parameters()
        )
        self.optimizer = self.hparams.opt_class(self.hparams.model.parameters())

        if self.checkpointer is not None:
            for ckpt_name, optimizer in (
                ("ssl_opt", self.ssl_optimizer),
                ("optimizer", self.optimizer),
            ):
                self.checkpointer.add_recoverable(ckpt_name, optimizer)

        self.optimizers_dict = {
            "model_optimizer": self.optimizer,
            "ssl_optimizer": self.ssl_optimizer,
        }


def dataio_prep(hparams):
    """Build the train/valid/test datasets and their processing pipelines.

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. The keys read here are ``data_folder``,
        ``train_annotation``, ``valid_annotation``, ``test_annotation`` and
        ``save_folder``.

    Returns
    -------
    tuple
        ``(train_data, valid_data, test_data, label_encoder)``.
    """
    root = hparams["data_folder"]

    # 1. One dataset per split, each read from its own annotation CSV.
    train_data, valid_data, test_data = (
        sb.dataio.dataset.DynamicItemDataset.from_csv(
            csv_path=hparams[annotation_key],
            replacements={"data_root": root},
        )
        for annotation_key in (
            "train_annotation",
            "valid_annotation",
            "test_annotation",
        )
    )
    datasets = [train_data, valid_data, test_data]
    label_encoder = sb.dataio.encoder.CategoricalEncoder()

    # 2. Audio pipeline: load only the [start, stop) frames of each file.
    @sb.utils.data_pipeline.takes("wav", "start", "stop", "duration")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav, start, stop, duration):
        first_frame = int(start)
        last_frame = int(stop)
        waveform, _ = torchaudio.load(
            wav,
            num_frames=last_frame - first_frame,
            frame_offset=first_frame,
        )
        return waveform.transpose(0, 1).squeeze(1)

    sb.dataio.dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Label pipeline: expose the raw command and its encoded form.
    @sb.utils.data_pipeline.takes("command")
    @sb.utils.data_pipeline.provides("command", "command_encoded")
    def label_pipeline(command):
        yield command
        yield label_encoder.encode_sequence_torch([command])

    sb.dataio.dataset.add_dynamic_item(datasets, label_pipeline)

    # 4. Load the label encoder from disk, or create it from the train split.
    encoder_path = os.path.join(hparams["save_folder"], "label_encoder.txt")
    label_encoder.load_or_create(
        path=encoder_path,
        from_didatasets=[train_data],
        output_key="command",
    )

    # 5. Restrict what each dataset item returns.
    sb.dataio.dataset.set_output_keys(
        datasets, ["id", "sig", "command_encoded"]
    )

    return train_data, valid_data, test_data, label_encoder


if __name__ == "__main__":
    # Enable the built-in cudnn auto-tuner.
    torch.backends.cudnn.benchmark = True

    # Parse the command line: hyperparameter file, run options and overrides.
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    # Initialise DDP (only relevant for multi-GPU training).
    sb.utils.distributed.ddp_init_group(run_opts)

    # Load the hyperparameters, applying the command-line overrides.
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    sb.core.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # GSC preparation helper (parses the dataset into CSV annotations).
    from prepare_GSC import prepare_GSC

    # The V2 12 task uses the ten core commands; the V2 35 task adds the
    # remaining words of the dataset.
    core_commands = [
        "yes",
        "no",
        "up",
        "down",
        "left",
        "right",
        "on",
        "off",
        "stop",
        "go",
    ]
    auxiliary_words = [
        "zero",
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "bed",
        "bird",
        "cat",
        "dog",
        "happy",
        "house",
        "marvin",
        "sheila",
        "tree",
        "wow",
        "backward",
        "forward",
        "follow",
        "learn",
        "visual",
    ]
    if hparams["number_of_commands"] == 12:
        words_wanted = core_commands
    elif hparams["number_of_commands"] == 35:
        words_wanted = core_commands + auxiliary_words
    else:
        raise ValueError("number_of_commands must be 12 or 35")

    # Data preparation: these options are forwarded from hparams unchanged.
    prep_kwargs = {
        option: hparams[option]
        for option in (
            "data_folder",
            "validation_percentage",
            "testing_percentage",
            "percentage_unknown",
            "percentage_silence",
            "skip_prep",
        )
    }
    # The CSV annotations are written to the experiment output folder.
    prep_kwargs["save_folder"] = hparams["output_folder"]
    prep_kwargs["words_wanted"] = words_wanted
    run_on_main(prepare_GSC, kwargs=prep_kwargs)

    # The noise and RIR preparation callables come from the hyperparameters.
    run_on_main(hparams["prepare_noise_data"])
    run_on_main(hparams["prepare_rir_data"])

    # Dataset objects and the label encoder.
    train_data, valid_data, test_data, label_encoder = dataio_prep(hparams)

    # Brain initialisation.
    speaker_brain = SpeakerBrain(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    # The same dataloader options are used for training, validation and test.
    loader_options = hparams["dataloader_options"]

    # Training
    speaker_brain.fit(
        speaker_brain.hparams.epoch_counter,
        train_data,
        valid_data,
        train_loader_kwargs=loader_options,
        valid_loader_kwargs=loader_options,
    )

    # Evaluate on the test set, selecting the checkpoint by lowest ErrorRate.
    test_stats = speaker_brain.evaluate(
        test_set=test_data,
        min_key="ErrorRate",
        test_loader_kwargs=loader_options,
    )

Overwriting train_dac.py


### Run

**The other discrete feature extractor I tried was DAC. However, the results were very poor, even after extensive hyperparameter tuning. Below is a glimpse of the results obtained for every combination of encoders and linear layers: the error does not decrease at all, even after multiple epochs. I believe I made a mistake in extracting the right features.**

In [None]:
!rm -rf /content/results/dac_v12/1986/

!python train_dac.py hparams_dac.yaml --data_folder=/path/to/GSC

  if ismodule(module) and hasattr(module, '__file__'):
speechbrain.core - Beginning experiment!
speechbrain.core - Experiment folder: results/dac_v12/1986
numexpr.utils - NumExpr defaulting to 2 threads.
prepare_GSC - Extracting speech_commands_v0.02.tar.gz...
/path/to/GSC/noise/data.zip exists. Skipping download
/path/to/GSC/rir/data.zip exists. Skipping download
speechbrain.dataio.encoder - Load called, but CategoricalEncoder is not empty. Loaded data will overwrite everything. This is normal if there is e.g. an unk label defined at init.
speechbrain.core - Info: ckpt_interval_minutes arg from hparam file is used
speechbrain.core - Gradscaler enabled: False. Using precision: fp32.
speechbrain.core - SpeakerBrain Model Statistics:
* Total Number of Trainable Parameters: 76.7M
* Total Number of Parameters: 76.7M
* Trainable Parameters represent 100.0000% of the total size.
speechbrain.utils.checkpoints - Would load a checkpoint here, but none found yet.
speechbrain.utils.epoch_loop - G

# CONCLUSION

![image.png](attachment:bac796ae-3589-4e5d-8af0-a11724aa346b.png)

**For the discrete features, RNNs are the best-performing models. Mixed models like CRDNN can indeed give better results. Since audio signals are sequential, transformers and RNN-based models are the best bet for now with discrete features. As we can see in the above graph, continuous features still outperform discrete features. The reason is straightforward: continuous features are real-valued vectors that are built around the task itself. Discrete features, on the other hand, target categorical aspects of the input data, such as phonemes or words in speech recognition. They generalize better and are compact in nature. They greatly reduce compute, making them resource-efficient, which can be helpful in many ways. They are paramount to bridging the gap between audio and NLP. The hope is that better discrete feature extractors will emerge in the future, bringing us one step closer to AGI.**