## Installing Dependencies

In [3]:
############################################################################
###################### importing needed packages ###########################
############################################################################

# basics
import os # to handle local files
os.environ["KALDI_ROOT"] = "/home/azureuser"
print(os.environ["KALDI_ROOT"])
import time
import string
import numpy as np
from pathlib import Path
import pandas as pd
import glob # unix style pathname pattern expansion
import json
import requests # to handle apis
from datetime import datetime as dt # timers 
import IPython # interactive components

import matplotlib.pyplot as plt # plots
%matplotlib inline

# sound
#import soundfile as sf
#import librosa # sound utility 
#import librosa.display # plots
from matplotlib.pyplot import specgram # plots

# azure
import azureml.core, os
from azureml.core import Workspace, Datastore, Dataset, Run
from azureml.data.datapath  import DataPath
from azureml.data.data_reference import DataReference
from azureml.exceptions import UserErrorException

# load subscription info from config.json
ws = Workspace.from_config()

# to be able to write on mounted datastore
import fuse

# deep learning
import torch

# audio
import torchaudio
import speechbrain as sb

/home/azureuser


In [4]:
user = os.popen('whoami').read()[:-1]

# Folder that blobfuse mounts the storage container on.
# mount_path must be empty when blobfuse is started.
mount_path   = "/home/azureuser/cloudfiles/code/Users/neelan/asr/pipeline/data" #📌 must change for user 
os.makedirs(mount_path, exist_ok=True)  # also creates missing parent folders
if os.listdir(mount_path) != []:
    print("mount_path not empty")

# make sure user has write access to cache_path - if not, create & chown to your user.
cache_path   = "/home/azureuser/cloudfiles/data/tmp"
os.makedirs(cache_path, exist_ok=True)

# The hint below has to test config_path; testing cache_path (which was created
# just above) meant the message could never be shown.
config_path  = "/home/azureuser/cloudfiles/code/Users/neelan/asr/pipeline/connection.cfg"
if not os.path.exists(config_path):
    print("get connection file from https://github.com/elucidate-ai/asr/blob/main/pipeline/connection.cfg")

mount_path not empty


In [5]:
###########################################################
######## Running BLOBFUSE to mount azure storage ##########
###########################################################

# wildcard version not working for some reason 🤷🏾‍♂️
#!blobfuse $mount_path --tmp_path=$cache_path --config-file=$config_path
#!blobfuse /home/azureuser/cloudfiles/data/mount --tmp-path=/home/azureuser/cloudfiles/data/tmp  --config-file=/home/azureuser/cloudfiles/code/Users/neelan/asr/pipeline/connection.cfg

!blobfuse /home/azureuser/cloudfiles/code/Users/neelan/asr/pipeline/data --tmp-path=/home/azureuser/cloudfiles/data/tmp  --config-file=/home/azureuser/cloudfiles/code/Users/neelan/asr/pipeline/connection.cfg

fuse: mountpoint is not empty
fuse: if you are sure this is safe, use the 'nonempty' mount option


In [6]:



def get_aud(txt_file):
    """
    Map a transcript path back to the path of its audio file.

    Args:
    - txt_file: full path of a transcript .txt file

    Returns:
    - aud_file: txt_file with every 'assembly_ai/' and every '.txt' removed
    """
    # split/join removes each occurrence of the separator
    without_folder = ''.join(txt_file.split('assembly_ai/'))
    aud_file = ''.join(without_folder.split('.txt'))
    return aud_file

def get_text(aud_file, location):
    """
    Build the transcript path that belongs to an audio path.

    Args:
    - aud_file: full path of a .wav file
    - location: text that replaces every occurrence of the global `mount_path`

    Returns:
    - txt_file: aud_file with '.wav' turned into '.wav.txt' and `mount_path`
      swapped for `location`
    """
    with_txt_suffix = aud_file.replace('.wav', '.wav.txt')
    txt_file = with_txt_suffix.replace(mount_path, location)
    return txt_file

def read_txt(file_):
    """
    Helper function to read a .txt file.

    Args:
    - file_: path of the .txt file to read (e.g. a transcript that results
      from assembly_ai)

    Returns:
    - file_contents: the whole content of the file as a single string
    """
    # The context manager closes the handle even if read() raises; the handle
    # was previously left open.
    with open(file_, 'r') as f:
        file_contents = f.read()
    return file_contents

In [17]:
################################################
############### Environment variables ##########
################################################

def _ensure_dir(path):
    """Create `path` (and any missing parent folders) and report what happened."""
    if not os.path.exists(path):
        os.makedirs(path)
        print(f"\nmade {path}")
    else:
        print("\nfound exisiting directory")


def _list_txt(folder):
    """Return the file names (not full paths) of the '*.txt' files directly in `folder`."""
    # glob.glob replaces the undocumented glob.glob1 helper; escape() keeps any
    # glob metacharacters in the folder name literal.
    pattern = os.path.join(glob.escape(folder), "*.txt")
    return [os.path.basename(p) for p in glob.glob(pattern)]


FILES=[]       # full paths of the audio files in the mount
MOUNTS=[]      # the same files, by name only
NAMES=[]       # the same names as str
FILE_PATHS=[]
AUD = '.wav' #audio format used

# Match on the suffix instead of a substring: this still skips the
# '<name>.wav.txt' transcripts and no longer accepts names such as 'x.wav.bak'.
for f in os.listdir(mount_path):
    if not f.endswith(AUD): continue

    FILES.append(os.path.join(mount_path, f))
    MOUNTS.append(f)
    NAMES.append(str(f))


# The assembly api token is read from the environment so that no key is stored
# in the notebook. Keys that were committed here earlier should be revoked.
TOKEN = os.environ.get('ASSEMBLYAI_TOKEN', '')
if not TOKEN:
    print("\nASSEMBLYAI_TOKEN is not set")

CHUNK_SIZE = 32 # chosen because concurrency limit for assembly api processing 32 files
MOUNT_CHUNKS = [MOUNTS[x:x+CHUNK_SIZE] for x in range(0, len(MOUNTS), CHUNK_SIZE)]
FILE_CHUNKS = [FILES[x:x+CHUNK_SIZE] for x in range(0, len(FILES), CHUNK_SIZE)]
NAME_CHUNKS = [NAMES[x:x+CHUNK_SIZE] for x in range(0, len(NAMES), CHUNK_SIZE)]

MOUNT_PATH = mount_path
SAVE_PATH = os.path.join(mount_path, 'assembly_ai')
SAVE_PATH_0 = SAVE_PATH + '_0'

# save in list if no output
NONE = []

# testing write capacity of mount
# (the folder used to be created silently right before this check, so the
# "create" branch could never run)
ASSEMBLY_OUT = SAVE_PATH
_ensure_dir(ASSEMBLY_OUT)


# Checking for exisitng transcriptions
ASSEMBLY_TXT = _list_txt(ASSEMBLY_OUT)
orig_aud = [i.replace('.txt','') for i in ASSEMBLY_TXT]
if ASSEMBLY_TXT != []:
    # count the transcripts themselves, not every entry in the folder
    print(f"\nfound {len(ASSEMBLY_TXT)} exisiting transcripts")

# Disabled experiment, kept as a note: transcripts were split into english /
# blank lists, where a blank transcript ('') was taken to mean an african
# language call (spot checked); some low-english transcripts also exist where
# the bulk of the file is an african language.

# testing write capacity of mount
SB_OUT = os.path.join(mount_path, 'sb', 'pretrained')
_ensure_dir(SB_OUT)  # makedirs also creates the parent 'sb' folder

SB_MODEL = os.path.join(mount_path, 'sb', 'test')
_ensure_dir(SB_MODEL)


# Checking for exisitng transcriptions
SB_TXT = _list_txt(SB_OUT)
if SB_TXT != []:
    print("\nfound exisiting transcripts")


found exisiting directory

found 2142 exisiting transcripts

found exisiting directory

made /home/azureuser/cloudfiles/code/Users/neelan/asr/pipeline/data/sb/test


### Automatic Speech Recognition

We can validate this result by looking at the oracle transcription for this utterance.

In [8]:
#!head ./LibriSpeech/dev-clean-2/1272/135031/1272-135031.trans.txt
read_txt(get_text(FILES[0], SAVE_PATH_0))

'GOOD DAY CAN I PLEASE SPEAK TO HALEYSA NIM TAMBO SPEAKING OKAY MAAM YOURE SPEAKING TO MBDLIN AND COLLECTIONS PLEASE NOTE THAT OUR CALLS ARE RECORDED FOR QUALITY AND TRAINING PURPOSES TO ENSURE AM I SPEAKING TO THE CORRECT PERSON CAN YOU PLEASE CONFIRM YOUR DATE OF BIRTH SOMEONE YES OKAY BEFORE WE TALK ABOUT ANYTHING MAAM I JUST NEED TO FOLLOW UP PROTOCOL YOUR ID NUMBER STARTS WITH 93010 SIX CORRECT YES OKAY AS I CAN SEE ON MY SYSTEM THAT YOU OWNER MOST OF YOUR ARRANGEMENT YOU OWNED YOUR ARRANGEMENT THAT WAS MADE ON THE 1 OCTOBER YOU WERE ABLE TO PAY 100 BUT THE ONE PAID ON THE 1 NOVEMBER IT WAS REJECTED YES SO YOU ARE SAYING THAT YOU PAID ALL THE DEBT THAT YOU OWE BECAUSE THE OUTSTANDING BALANCE ON MY SYSTEM IS 593558 SO I CANNOT MAKE ANY PAYMENT TOWARDS MBT BECAUSE THEY SAVED THE OTHER ACCOUNT BECAUSE THIS ONE IS MR PRICE OKAY LET ME CHECK MY NOTES DID YOU REQUEST FOR THE LETTER MAAM I DID YOU GAVE THE PERSON YOU WERE TALKING TO YOUR EMAIL ADDRESS YES OKAY WHEN WAS THAT WHEN DID YOU 

# Model


[source](https://colab.research.google.com/drive/1LN7R3U3xneDgDRK2gC5MzGkLysCWxuC3?usp=sharing#scrollTo=79ryKiGHinQ3)

# Pretrained Models and Fine-Tuning with 🤗

Training DNN models is often very time-consuming and expensive. 
For this reason, whenever it is possible, using off-the-shelf pretrained models can be convenient in various scenarios. 

In SpeechBrain we provide pre-trained models and we also encourage users to share their own using 🤗[HuggingFace Hub](https://huggingface.co/models)🤗 as we strongly believe that sharing models can help research. 

You can browse our official pre-trained models [here](https://huggingface.co/speechbrain). 

If you have a pre-trained model and want to include it among the official ones, please consider opening a pull request on [GitHub](https://github.com/speechbrain/speechbrain/blob/develop/README.md) with all the details of your model!

We provide a simple and straightforward way to download and instantiate a state-of-the-art pretrained-model and use it either for direct inference or for fine-tuning/knowledge distillation or whatever new fancy technique you can come up with!

With this tutorial, you will learn how to:

1. Use pretrained models to infer on your data.
2. Use pretrained models as a component of a new pipeline (e.g., language models, fine-tuning, speaker embedding extraction, ...).

## Prerequisites
- [SpeechBrain Introduction](https://colab.research.google.com/drive/12bg3aUdr9mTfOGqcB5pSMABoIKPgiwcM?usp=sharing)
- [YAML tutorial](https://colab.research.google.com/drive/1Pg9by4b6-8QD2iC0U7Ic3Vxq4GEwEdDz?usp=sharing)
- [Brain Class tutorial](https://colab.research.google.com/drive/1fdqTk4CTXNcrcSVFvaOKzRfLmj4fJfwa?usp=sharing)
- [DataIOBasics](https://colab.research.google.com/drive/1AiVJZhZKwEI4nFGANKXEe-ffZFfvXKwH)


`asr_model.transcribe_file(FILES[0])` results in:

`OSError: [Errno 95] Operation not supported: '/home/azureuser/cloudfiles/data/mount/0291506962#-10505#SITHATIL#TCRCBD-E45#20220215135504102.wav' -> '0291506962#-10505#SITHATIL#TCRCBD-E45#20220215135504102.wav'` is a symlink related error

This seems to be related to streaming the file from the blob mount --> copy the file to a local path (next to the model) before transcription.

In [18]:
from speechbrain.pretrained import EncoderDecoderASR
asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-crdnn-rnnlm-librispeech", savedir=SB_MODEL)


#asr_model.transcribe_file("./LibriSpeech/dev-clean-2/1272/135031/1272-135031-0003.flac")

# sb asr_model seemingly cannot perform inference directly from mount

def make_local(mount_file, out_path='local.wav'):
    """
    Re-write an audio file from the mount as a local .wav file.

    Args:
    - mount_file: path of the audio file to load
    - out_path: path of the local copy to write (default 'local.wav')

    Returns:
    - None; the audio is written to out_path
    """
    # Imported here because the module-level librosa / soundfile imports are
    # commented out, so this function raised NameError on a fresh kernel.
    import librosa
    import soundfile as sf

    #print(f"loading {mount_file}")
    y, sr = librosa.load(mount_file)  # librosa defaults are used for loading
    print("loaded")
    sf.write(out_path, y, sr)
    print(f"{out_path} written")
    
    
asr_model.mods.keys()

odict_keys(['normalizer', 'encoder', 'decoder', 'lm_model'])

These keys correspond to the `modules` entry specified in the [hyperparameter file](https://huggingface.co/speechbrain/asr-crdnn-rnnlm-librispeech/blob/main/hyperparams.yaml):


```yaml
modules:
    encoder: !ref <encoder>
    decoder: !ref <decoder>
    lm_model: !ref <lm_model>
```

We can also see that the encoder is actually composed of several sub-modules:

```yaml
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalize>
    model: !ref <enc>
```



These are simply accessible as members of the encoder:

In [8]:
asr_model.mods.encoder

LengthsCapableSequential(
  (compute_features): Fbank(
    (compute_STFT): STFT()
    (compute_fbanks): Filterbank()
    (compute_deltas): Deltas()
    (context_window): ContextWindow()
  )
  (normalize): InputNormalization()
  (model): CRDNN(
    (CNN): Sequential(
      (block_0): CNN_Block(
        (conv_1): Conv2d(
          (conv): Conv2d(1, 128, kernel_size=(3, 3), stride=(1, 1))
        )
        (norm_1): LayerNorm(
          (norm): LayerNorm((40, 128), eps=1e-05, elementwise_affine=True)
        )
        (act_1): LeakyReLU(negative_slope=0.01)
        (conv_2): Conv2d(
          (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1))
        )
        (norm_2): LayerNorm(
          (norm): LayerNorm((40, 128), eps=1e-05, elementwise_affine=True)
        )
        (act_2): LeakyReLU(negative_slope=0.01)
        (pooling): Pooling1d(
          (pool_layer): MaxPool2d(kernel_size=(1, 2), stride=(1, 2), padding=(0, 0), dilation=(1, 1), ceil_mode=False)
        )
        (dro

In [9]:
asr_model.mods.encoder.compute_features

Fbank(
  (compute_STFT): STFT()
  (compute_fbanks): Filterbank()
  (compute_deltas): Deltas()
  (context_window): ContextWindow()
)

The training hyperparameters also can be easily accessed: 

In [10]:
dir(asr_model.hparams)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'activation',
 'asr_model',
 'beam_size',
 'blank_index',
 'bos_index',
 'cnn_blocks',
 'cnn_channels',
 'cnn_kernelsize',
 'compute_features',
 'coverage_penalty',
 'ctc_lin',
 'dec',
 'dec_neurons',
 'decoder',
 'dnn_blocks',
 'dnn_neurons',
 'dropout',
 'emb',
 'emb_size',
 'enc',
 'encoder',
 'eos_index',
 'eos_threshold',
 'inter_layer_pooling_size',
 'lm_model',
 'lm_weight',
 'log_softmax',
 'max_attn_shift',
 'max_decode_ratio',
 'min_decode_ratio',
 'modules',
 'n_fft',
 'n_mels',
 'normalizer',
 'output_neurons',
 'pretrainer',
 'rnn_bidirectional',
 'rnn_class',
 'rnn_layers',
 'rnn_neurons',
 'sample_rate',
 'seq_lin',
 'temperature',
 'tem

### Setting up the data pipeline

In [11]:
# Transcripts in SAVE_PATH_0 that are not empty ...
txt_files = [
    txt_path
    for txt_path in glob.glob(SAVE_PATH_0 + "/*.wav.txt", recursive=True)
    if read_txt(txt_path) != ''
]
# ... and, for each of them, the audio path it was produced from.
wav_files = [
    txt_path.replace('/assembly_ai_0','').replace('.txt','')
    for txt_path in txt_files
]

In [12]:

def asr_parse_to_json(txt_files, wav_files, json_path="data.json", csv_path="data.csv"):
    """
    Write the annotation files that are fed into speechbrain.IO.

    One record is produced per audio file that has a non-empty transcript. It
    holds the file path, the transcript and the audio metadata reported by
    torchaudio.info.

    Args:
    - txt_files: transcript paths located under the global SAVE_PATH_0
    - wav_files: audio paths located under the global MOUNT_PATH
    - json_path: where the JSON annotation is written (default "data.json")
    - csv_path: where the CSV copy of the records is written (default "data.csv")

    Returns:
    - None; the records are written to json_path and csv_path
    """

    # utterance id -> transcript. The id is the path with the SAVE_PATH_0
    # prefix, the '.wav' / '.txt' parts and all '/' removed.
    words_dict = {}
    for txtf in txt_files:
        with open(txtf, "r") as f:
            lines = f.readlines()
        utt_id = txtf.replace(SAVE_PATH_0, "").replace(".wav","").replace(".txt", "").replace('/','')

        # The transcripts are plain text without a leading utterance id, so
        # every word is kept (the first word used to be dropped) and the lines
        # of a multi-line file are joined instead of overwriting each other.
        words_dict[utt_id] = " ".join(l.strip() for l in lines if l.strip())

    # we now build JSON examples
    examples = {}
    for file_ in wav_files:

        # define data file features
        id_ = file_.replace(MOUNT_PATH, "").replace(".wav","").replace(".txt", "").replace('/','')
        words_write = words_dict.get(id_)
        # blank training data can produce no supervision; this also skips
        # audio files that have no transcript at all
        if not words_write: continue

        # file name with digits, '/' and '-' removed and '##' collapsed to '#'
        spkID = file_.replace(os.path.dirname(file_),'').replace('.wav','')
        spkID = ''.join([i for i in spkID if not i.isdigit()]).replace('/','').replace('-','').replace('##','#')

        # query the audio metadata once instead of once per field
        info = torchaudio.info(file_)
        # zero-length audio would divide by zero below
        if info.num_frames <= 0 or info.sample_rate <= 0: continue

        word_count = len(words_write.split())
        duration_seconds = info.num_frames / info.sample_rate
        words_per_second = word_count/duration_seconds

        examples[id_] = {"file_path": file_,
                         "words": words_write,
                         "word_count": word_count,
                         "spkID": spkID,
                         "bits_per_sample": info.bits_per_sample,
                         "encoding": info.encoding,
                         "num_channels": info.num_channels,
                         "num_frames": info.num_frames,
                         "sample_rate": info.sample_rate,
                         "duration_seconds": duration_seconds,
                         "words_per_second": words_per_second
                         }

    pd.DataFrame(examples).transpose().to_csv(csv_path)
    with open(json_path, "w") as f:
        json.dump(examples, f, indent=4)

#asr_parse_to_json(txt_files, wav_files)

In [None]:
# os.chdir() needs a path (calling it without one raises TypeError). Use the
# pipeline folder, the same one a later cell switches to before reading data.csv.
os.chdir('/home/azureuser/cloudfiles/code/Users/neelan/asr/pipeline/')

In [14]:
#from parse_data import parse_to_json # parse_data is a local library downloaded before (see Installing Dependencies step) 
#parse_to_json("./LibriSpeech")
#f_ = open ('data_libre.json', "r")
#json.load(f_)

In [15]:
#f = open ('data.json', "r")
#json.load(f)


We instantiate a **DynamicItemDataset** from the JSON annotation

We sort the dataset based on length to speed-up training

[`speechbrain.dataio.dataset.DynamicItemDataset.filtered_sorted(key_min_value={}, key_max_value={}, key_test={}, sort_key=None, reverse=False, select_n=None)`](https://speechbrain.readthedocs.io/en/latest/API/speechbrain.dataio.dataset.html#speechbrain.dataio.dataset.DynamicItemDataset.filtered_sorted)

and add a pipeline for reading audio

and another one to encode the words from annotation.

It is worth noting that we use the Tokenizer object obtained from the pretrained `asr_model` and  that we encode the words with `asr_model.tokenizer.encode_as_ids(words)`. We also reuse `asr_model` `eos_index` and `bos_index` accessed via `asr_model.hparams` to ensure that all these parameters correspond to the ones used at pretraining time! 

In [22]:
os.chdir('/home/azureuser/cloudfiles/code/Users/neelan/asr/pipeline/')

In [23]:
data_info = pd.read_csv('data.csv', header=0).sort_values('duration_seconds', ascending=True)[:100]
data_info[['duration_seconds','word_count', 'words_per_second']].describe(include='all')


Unnamed: 0,duration_seconds,word_count,words_per_second
count,100.0,100.0,100.0
mean,52.1534,56.68,1.099736
std,9.722543,31.494069,0.601464
min,23.3,1.0,0.016989
25%,46.06,36.0,0.70931
50%,53.74,57.0,1.143706
75%,58.86,76.25,1.536412
max,63.98,165.0,2.618026


In [24]:
data_libre_info = pd.read_csv('data_libre.csv', header=0).sort_values('duration_seconds', ascending=False)[:100]
data_libre_info[['duration_seconds','word_count', 'words_per_second']].describe(include='all')

Unnamed: 0,duration_seconds,word_count,words_per_second
count,100.0,100.0,100.0
mean,16.667,44.55,2.677766
std,3.917199,11.721839,0.372265
min,12.715,26.0,1.881225
25%,13.92625,36.75,2.383565
50%,15.5425,42.0,2.716792
75%,17.66,51.0,2.896563
max,31.7,88.0,3.551008


In [24]:
# 3. Define text pipeline:
@sb.utils.data_pipeline.takes("words")
@sb.utils.data_pipeline.provides(
    "words", "tokens_list", "tokens_bos", "tokens_eos", "tokens")
def text_pipeline(words):
    """
    Yield the transcript and its encodings in the order declared by `provides`.

    Yields:
    - words: the transcript, unchanged
    - tokens_list: ids from asr_model.tokenizer.encode_as_ids(words)
    - tokens_bos: LongTensor of [bos_index] + tokens_list
    - tokens_eos: LongTensor of tokens_list + [eos_index]
    - tokens: LongTensor of tokens_list
    """
    yield words

    tokens_list = asr_model.tokenizer.encode_as_ids(words)
    yield tokens_list

    # we use same eos and bos indexes as in pretrained model
    bos = asr_model.hparams.bos_index
    eos = asr_model.hparams.eos_index
    yield torch.LongTensor([bos] + (tokens_list))
    yield torch.LongTensor(tokens_list + [eos])
    yield torch.LongTensor(tokens_list)

# build the dataset object from the JSON annotation
from speechbrain.dataio.dataset import DynamicItemDataset

# other selections that were tried:
#dataset = dataset.filtered_sorted(sort_key="duration_seconds", reverse=False, select_n=100)
#dataset = dataset.filtered_sorted(key_min_value={"word_count": 30}, key_max_value={"word_count": 50})
#dataset = dataset.filtered_sorted(key_min_value={"word_count": 10}, key_max_value={"word_count": 30},select_n=100)

# load the annotation and filter on duration_seconds with bounds 0 and 70
dataset = DynamicItemDataset.from_json("data.json").filtered_sorted(
    key_min_value={"duration_seconds": 0},
    key_max_value={"duration_seconds": 70},
)

# audio is read from "file_path" into "signal"; text_pipeline adds the token fields
dataset.add_dynamic_item(sb.dataio.dataio.read_audio, takes="file_path", provides="signal")
dataset.add_dynamic_item(text_pipeline)
dataset.set_output_keys([
    "id",
    "duration_seconds", "word_count", "words_per_second",
    "signal",
    "words", "tokens_list", "tokens_bos", "tokens_eos", "tokens",
])
len(dataset)


131

dataset_libre = DynamicItemDataset.from_json("data_libre.json")
dataset_libre = dataset_libre.filtered_sorted(sort_key="duration_seconds", reverse=True, select_n=100)
#dataset = dataset.filtered_sorted(key_min_value={"word_count": 10}, key_max_value={"word_count": 50})
#dataset = dataset.filtered_sorted(key_min_value={"word_count": 10}, key_max_value={"word_count": 30},select_n=100)
#dataset = dataset.filtered_sorted(key_min_value={"duration_seconds": 5}, key_max_value={"duration_seconds": 60},select_n=100)



dataset_libre.add_dynamic_item(sb.dataio.dataio.read_audio, takes="file_path", provides="signal")
dataset_libre.add_dynamic_item(text_pipeline)
dataset_libre.set_output_keys(["id","duration_seconds","word_count", "signal", "words", "tokens_list", "tokens_bos", "tokens_eos", "tokens"])
len(dataset_libre)


dataset_libre = DynamicItemDataset.from_json("data_libre.json")
dataset_libre = dataset_libre.filtered_sorted(sort_key="duration_seconds", reverse=True, select_n=100)
dataset_libre.add_dynamic_item(sb.dataio.dataio.read_audio, takes="file_path", provides="signal")
dataset_libre.add_dynamic_item(text_pipeline)
dataset_libre.set_output_keys(["id","duration_seconds", "signal", "words", "tokens_list", "tokens_bos", "tokens_eos", "tokens"])
dataset_libre[0]

We set the dataset object to return the signal tensor as well as the encoded tokens and words. 

### Fine-Tuning the ASR model

First, we define our Brain class that will perform the fine-tuning. Here, we just take an example similar to the Brain class of the original [Seq2Seq LibriSpeech recipe](https://github.com/speechbrain/speechbrain/blob/develop/recipes/LibriSpeech/ASR/seq2seq/train.py).


In [25]:
from speechbrain.lobes.features import Fbank

# Define fine-tuning procedure 
class EncDecFineTune(sb.Brain):
    """
    Brain subclass that fine-tunes the modules of a pretrained seq2seq ASR model.

    The methods read `compute_features`, `normalize`, `enc`, `emb`, `dec` and
    `seq_lin` from `self.modules`, and `log_softmax` and `seq_cost` from
    `self.hparams`. `self.device` is used as configured on the Brain; it is no
    longer overwritten with a hard-coded 'cuda' inside the methods.
    """

    def on_stage_start(self, stage, epoch):
        """Enable gradients for all modules we want to fine-tune when a TRAIN stage starts."""
        if stage != sb.Stage.TRAIN:
            return
        for module in [self.modules.enc, self.modules.emb, self.modules.dec, self.modules.seq_lin]:
            for p in module.parameters():
                p.requires_grad = True

    def compute_forward(self, batch, stage):
        """Forward computations from the waveform batches to the output probabilities.

        Returns:
        - (p_seq, wav_lens): the log-probabilities from the seq2seq output
          layer and the length tensor taken from batch.signal
        """
        batch = batch.to(self.device)
        wavs, wav_lens = batch.signal
        tokens_bos, _ = batch.tokens_bos
        wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)

        # Forward pass
        feats = self.modules.compute_features(wavs)
        feats = self.modules.normalize(feats, wav_lens)
        x = self.modules.enc(feats)

        e_in = self.modules.emb(tokens_bos)  # y_in bos + tokens
        h, _ = self.modules.dec(e_in, x, wav_lens)

        # Output layer for seq2seq log-probabilities
        logits = self.modules.seq_lin(h)
        p_seq = self.hparams.log_softmax(logits)

        return p_seq, wav_lens

    def compute_objectives(self, predictions, batch, stage):
        """Computes the loss (hparams.seq_cost on the eos-terminated targets) given predictions and targets."""
        p_seq, _ = predictions
        tokens_eos, tokens_eos_lens = batch.tokens_eos

        loss = self.hparams.seq_cost(
            p_seq, tokens_eos, tokens_eos_lens)

        return loss

    def fit_batch(self, batch):
        """Train the parameters given a single batch in input.

        Returns:
        - the detached loss of this batch
        """
        predictions = self.compute_forward(batch, sb.Stage.TRAIN)
        loss = self.compute_objectives(predictions, batch, sb.Stage.TRAIN)
        loss.backward()
        # step only when check_gradients accepts the loss; gradients are
        # cleared either way
        if self.check_gradients(loss):
            self.optimizer.step()
        self.optimizer.zero_grad()
        return loss.detach()


Here we define the modules and hyperparameters needed for the Brain class defined before.

We fetch them directly from the pretrained model by accessing its `modules` and `hparams`. These can be found in the `hyperparams.yaml` file in the model [HuggingFace repo](https://huggingface.co/speechbrain/asr-crdnn-rnnlm-librispeech/blob/main/hyperparams.yaml).

In [26]:
torch.cuda.is_available()

True

In [27]:

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# modules are taken from the pretrained asr_model so fine-tuning starts from its weights
modules = {"enc": asr_model.mods.encoder.model, 
           "emb": asr_model.hparams.emb,
           "dec": asr_model.hparams.dec,
           "compute_features": asr_model.mods.encoder.compute_features, # we use the same features 
           "normalize": asr_model.mods.encoder.normalize,
           "seq_lin": asr_model.hparams.seq_lin, 
           
          }

hparams = {"seq_cost": lambda x, y, z: sb.nnet.losses.nll_loss(x, y, z, label_smoothing = 0.1),
            "log_softmax": sb.nnet.activations.Softmax(apply_log=True), "device" : device}

brain = EncDecFineTune(modules, hparams=hparams, opt_class=lambda x: torch.optim.SGD(x, 1e-5))
brain.tokenizer = asr_model.tokenizer


### Checking CUDA setting

In [28]:
brain.device

'cuda'

The pre-trained model can be finally fine-tuned:

In [29]:
asr_model.device

'cuda'

In [30]:
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.device(0))
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))


True
0
<torch.cuda.device object at 0x7fc12d8ef520>
1
Tesla K80


In [31]:
!nvidia-smi

Tue Apr  5 09:06:25 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   45C    P0    70W / 149W |   1554MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

### Training

[`DynamicItemDataset.filtered_sorted(key_min_value={}, key_max_value={}, key_test={}, sort_key=None, reverse=False, select_n=None)`](https://speechbrain.readthedocs.io/en/latest/API/speechbrain.dataio.dataset.html#speechbrain.dataio.dataset.DynamicItemDataset.filtered_sorted)

In [32]:
brain.fit(range(100), train_set=dataset, 
          train_loader_kwargs={"batch_size": 2, "drop_last":True, "shuffle": False})

 57%|█████▋    | 37/65 [07:53<06:15, 13.39s/it, train_loss=5.07]

In [66]:
!nvidia-smi

Tue Apr  5 06:49:34 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00000001:00:00.0 Off |                    0 |
| N/A   51C    P0    71W / 149W |  11306MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [38]:

!sudo kill -9 14419 // sudo kill -9 32291

In [32]:
make_local(FILES[1000])

loaded
local.wav written


In [37]:
FILES[1000].replace('.wav', '.wav.txt').replace('data/', 'data/assembly_ai/')

'/home/azureuser/cloudfiles/code/Users/neelan/asr/pipeline/data/assembly_ai/10101141039929#760#ROXANNES#TCRDBN1-SKRM100#20220215152351881.wav.txt'

In [38]:
read_txt(get_text(FILES[1000], SAVE_PATH_0))

'HELLO HELLO HI CAN I PLEASE SPEAK TO MR T S GABE SPEAKING OKAY HOW ARE YOU SIR FINE AND YOU IM GOOD THANKS YOU SPEAK SPEAKING TO ROXANNE AND IM CALLING YOU REGARDING YOUR TRUEEST ACCOUNT OKAY ALL RIGHT SO IM JUST CALLING TO CONFIRM YOUR PAYMENT ARRANGEMENT FOR THE MONTH OF FEBRUARY OKAY YES MAAM SO JUST BEFORE WE GO ON WITH THE CALL IS YOUR BIRTHDAY 5 JULY 1994 YES MAAM THANK YOU PLEASE NOTE THE CALLS ARE BEING RECORDED SIR SO MR GAIBI CAN YOU TELL ME FOR THIS MONTH WHATS THE DATE YOULL BE MAKING YOUR PAYMENT NEXT 20TH ALL RIGHT AND HOW MUCH WILL THAT BE FOR YOU HAVE THIS BALANCE OF ONLY 267 A MONTH THE WHOLE THING NEXT FRIDAY YES MAAM ALL RIGHT 267 THE WHOLE BALANCE SO ONCE YOU PAY THIS AMOUNT YOUR NAME IS GOING TO BE CLEARED AND YOULL BE ABLE TO START USING YOUR ACCOUNT AGAIN OKAY YEAH SO WHAT WE ARE ABLE TO DO FOR YOU ALSO IF THIS IS POSSIBLE IF YOU CAN PLEASE LEAVE THE FUNDS IN YOUR BANK ACCOUNT WE CAN ALSO ARRANGE THIS PAYMENT TO BE DONE AS A ONCE OFF DEBIT ORDER IF YOU DO A ONE 

In [43]:
FILES[1000]

'/home/azureuser/cloudfiles/code/Users/neelan/asr/pipeline/data/10101141039929#760#ROXANNES#TCRDBN1-SKRM100#20220215152351881.wav'

In [44]:
y, sr = torchaudio.load(FILES[1000])

IPython.display.Audio(y, rate=sr)

In [39]:
asr_model.transcribe_file('local.wav')

"HALLOO AYE I CANNOT BE SPEAK TO THE CITY AS TO AH DAVID ISN'T IT ISN'T IT THIS NEW I'LL KNOW HOW ARE YOU SAID BURGLARS I'M GOOD THANKS ARE YOU SPEAKING CHUCK SAND AND I'M TELLING YOU REGARDING YOUR TRUE AT THE COUNT"

## Pretrainer Class
In SpeechBrain, another way to perform pre-training is to use the Pretrainer class (`speechbrain.utils.parameter_transfer.Pretrainer`). It orchestrates parameter transfer in a more structured way, which can aid in writing easy-to-share recipes (and it is also central to the implementation of the `speechbrain.pretrained` models). To use it, let's first initialize a model:

In [None]:
from speechbrain.lobes.models.ECAPA_TDNN import ECAPA_TDNN 

model = ECAPA_TDNN(input_size= 80,
                   channels= [1024, 1024, 1024, 1024, 3072],
                   kernel_sizes= [5, 3, 3, 3, 1],
                   dilations= [1, 2, 3, 4, 1],
                   attention_channels= 128,
                   lin_neurons = 192)

At this level, the model is initialized with random parameters. However, we can use our pretrainer to replace random parameters with the ones stored in the saved checkpoint:

In [None]:
from speechbrain.utils.parameter_transfer import Pretrainer

# Initialization of the pre-trainer 
pretrain = Pretrainer(loadables={'model': model}, paths={'model': 'speechbrain/spkrec-ecapa-voxceleb/embedding_model.ckpt'})

# We download the pretrained model from HuggingFace in this case
pretrain.collect_files()
pretrain.load_collected(device='cpu')

Now, the model is no longer randomly initialized; it contains the pre-trained parameters of `embedding_model.ckpt`.  The path of the pre-trained model can be a **local path**, a **web url**, or a **huggingface repository**:

In [None]:
# Local Path
pretrain = Pretrainer(collect_in='model_local', loadables={'model': model}, paths={'model': 'model_checkpoints/model.ckpt'})
pretrain.collect_files()
pretrain.load_collected(device='cpu')

# Or web 
pretrain = Pretrainer(collect_in='model_web', loadables={'model': model}, paths={'model': 'https://www.dropbox.com/s/2mdnl784ram5w8o/embedding_model.ckpt?dl=1'})
pretrain.collect_files()
pretrain.load_collected(device='cpu')

As you can see, you can use the variable `collect_in` to set where the pre-trained model is stored.

# Acknowledgements


*   Many thanks to [ziz19](https://github.com/ziz19), who helped improve this tutorial.


# **About SpeechBrain**
- Website: https://speechbrain.github.io/
- Code: https://github.com/speechbrain/speechbrain/
- HuggingFace: https://huggingface.co/speechbrain/


# **Citing SpeechBrain**
Please, cite SpeechBrain if you use it for your research or business.

```bibtex
@misc{speechbrain,
  title={SpeechBrain: A General-Purpose Speech Toolkit},
  author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio},
  year={2021},
  eprint={2106.04624},
  archivePrefix={arXiv},
  primaryClass={eess.AS}
}
```