In [None]:
"""
You can either run this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect
"""
# If you're using Google Colab and not running locally, run this cell.

## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install unidecode
!pip install matplotlib>=3.3.2

## Install NeMo
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

## Grab the config we'll use in this example
!mkdir configs
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/config.yaml

"""
Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!
Alternatively, you can uncomment the exit() below to crash and restart the kernel, in the case
that you want to use the "Run All Cells" (or similar) option.
"""
# exit()

In [1]:
# Root directory for everything this notebook downloads and generates.
# This is where the an4/ directory will be placed.
# Change this if you don't want the data to be extracted in the current directory.
# A later cell converts this value to an absolute path with os.path.abspath.
data_dir = '.'


In [3]:
import glob
import os
import subprocess
import tarfile
import wget
data_dir = os.path.abspath(data_dir)
# wget.download does not create the target directory, so make sure it exists.
os.makedirs(data_dir, exist_ok=True)
an4_tar_path = os.path.join(data_dir, 'an4_sphere.tar.gz')

# Download the dataset. This will take a few moments...
print("******")
if not os.path.exists(an4_tar_path):
    an4_url = 'https://dldata-public.s3.us-east-2.amazonaws.com/an4_sphere.tar.gz'
    an4_path = wget.download(an4_url, data_dir)
    print(f"Dataset downloaded at: {an4_path}")
else:
    print("Tarfile already exists.")
    an4_path = an4_tar_path

if not os.path.exists(os.path.join(data_dir, 'an4')):
    # Untar and convert .sph to .wav (using sox).
    # The context manager closes the archive handle even if extraction fails.
    with tarfile.open(an4_path) as tar:
        tar.extractall(path=data_dir)

    print("Converting .sph to .wav...")
    sph_list = glob.glob(os.path.join(data_dir, 'an4', '**', '*.sph'), recursive=True)
    for sph_path in sph_list:
        wav_path = os.path.splitext(sph_path)[0] + '.wav'
        # Converting to 16kHz wav
        cmd = ["sox", sph_path, "-r", "16000", wav_path]
        result = subprocess.run(cmd)
        # Surface conversion failures here; otherwise the missing .wav only
        # shows up later as an obscure error while building the manifests.
        if result.returncode != 0:
            print(f"WARNING: sox failed on {sph_path} (exit code {result.returncode})")
print("Finished conversion.\n******")

******
Dataset downloaded at: /home/jbalam/nemo/clone3/NeMo/tutorials/asr/an4_sphere.tar.gz
Converting .sph to .wav...
Finished conversion.
******


In [25]:
# --- Building Manifest Files --- #
import json
import librosa

# Function to build a manifest
def build_manifest(transcripts_path, manifest_path, wav_path):
    """Convert an AN4 transcription file into a JSON-lines manifest.

    Every non-empty input line is expected to look like
    ``<s> transcript </s> (fileID)``. For each one, a JSON object with the
    keys ``audio_filepath``, ``duration`` and ``text`` is written on its own
    line of the manifest.

    Args:
        transcripts_path: Path of the transcription file to read.
        manifest_path: Path of the manifest file to write (overwritten).
        wav_path: Directory, relative to the global ``data_dir``, that holds
            one sub-directory of .wav files per speaker.

    Raises:
        ValueError: If a non-empty line has no ``(fileID)`` suffix.
    """
    with open(transcripts_path, 'r') as fin, open(manifest_path, 'w') as fout:
        for line_no, raw_line in enumerate(fin, start=1):
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue

            # Locate the parentheses explicitly instead of assuming the line
            # ends with ")\n"; the last line of a file may lack the newline.
            open_idx = line.rfind('(')
            close_idx = line.rfind(')')
            if open_idx == -1 or close_idx < open_idx:
                raise ValueError(
                    f"{transcripts_path}:{line_no}: expected '<s> transcript </s> (fileID)', "
                    f"got {line!r}")

            transcript = line[:open_idx].lower()
            transcript = transcript.replace('<s>', '').replace('</s>', '')
            transcript = transcript.strip()

            file_id = line[open_idx + 1:close_idx]  # e.g. "cen4-fash-b"
            # The middle field of the file id ("fash") names the speaker directory.
            speaker_dir = file_id[file_id.find('-') + 1:file_id.rfind('-')]
            audio_path = os.path.join(data_dir, wav_path, speaker_dir, file_id + '.wav')

            duration = librosa.core.get_duration(filename=audio_path)

            # Write the metadata to the manifest
            metadata = {
                "audio_filepath": audio_path,
                "duration": duration,
                "text": transcript
            }
            json.dump(metadata, fout)
            fout.write('\n')
                
# Create the train and test manifests, skipping any that are already on disk.
print("******")
train_transcripts = f"{data_dir}/an4/etc/an4_train.transcription"
train_manifest = f"{data_dir}/an4/train_manifest.json"
test_transcripts = f"{data_dir}/an4/etc/an4_test.transcription"
test_manifest = f"{data_dir}/an4/test_manifest.json"

# (transcription file, manifest to write, wav directory, message once built)
manifest_jobs = (
    (train_transcripts, train_manifest, 'an4/wav/an4_clstk', "Training manifest created."),
    (test_transcripts, test_manifest, 'an4/wav/an4test_clstk', "Test manifest created."),
)
for transcripts_file, manifest_file, wav_subdir, done_msg in manifest_jobs:
    if os.path.isfile(manifest_file):
        continue
    build_manifest(transcripts_file, manifest_file, wav_subdir)
    print(done_msg)
print("***Done***")

******
***Done***


In this tutorial we will look at how to finetune a model for situations where we can sample background noises for an application. We will use the background noise samples from "Room Impulse Response and Noise Database" from the openslr database. For each 30 second isotropic noise sample in the dataset we use the first 15 seconds for training and the last 15 seconds for evaluation.

In [5]:
# Download the noise dataset (skipped when the zip is already in data_dir).
# This will take a few moments...
print("******")
if not os.path.exists(data_dir + '/rirs_noises.zip'):
    slr28_url = 'https://www.openslr.org/resources/28/rirs_noises.zip'
    noise_path = wget.download(slr28_url, data_dir)
    # Report where the file was saved (the path wget returns), not the URL it came from.
    print(f"Dataset downloaded at: {noise_path}")
else:
    print("Zip file already exists.")
    noise_path = data_dir + '/rirs_noises.zip'
print("Finished downloading noises.\n******")

******
Dataset downloaded at: https://www.openslr.org/resources/28/rirs_noises.zip
Finished downloading noises.
******


In [6]:
# Extract noise data
from zipfile import ZipFile

# Later cells read from data_dir/RIRS_NOISES; if it is already there, skip
# re-extracting the archive.
noise_root = os.path.join(data_dir, "RIRS_NOISES")
if os.path.isdir(noise_root):
    print(f"{noise_root} already exists, skipping extraction.")
else:
    try:
        with ZipFile(noise_path, "r") as zipObj:
            zipObj.extractall(data_dir)
    except Exception as e:
        # Still best-effort, but report the real cause. The previous handler
        # called `logging`, which this notebook never imports.
        print(f"Not extracting {noise_path}: {e}")


In [None]:
import json
# Directory of the isotropic noise recordings inside the RIRS_NOISES folder under data_dir.
iso_path = os.path.join(data_dir,"RIRS_NOISES/real_rirs_isotropic_noises")
# Index file read further down; the last whitespace-separated field of each
# of its lines is joined onto data_dir to form a wav path.
iso_noise_list = os.path.join(iso_path, "noise_list")
# create manifest from noise files
def process_row(row, offset, duration):
    """Build one noise-manifest entry describing a segment of a wav file.

    Args:
        row: Dict with the keys 'wav_filename' and 'transcript'.
        offset: Start of the segment within the file, in seconds.
        duration: Requested segment length in seconds. It is clipped to the
            audio remaining after ``offset``; ``None`` means "to end of file".

    Returns:
        A dict with 'audio_filepath', 'duration', 'offset' and 'text', or
        ``None`` if the file could not be processed (an error is printed).
    """
    wav_f = None
    try:
        wav_f = row['wav_filename']
        # Pass an argument list (no shell) so paths containing spaces or shell
        # metacharacters reach soxi intact.
        total_duration = float(subprocess.check_output(['soxi', '-D', wav_f]))
        # Honour the requested duration rather than overwriting it with the
        # full file length, so different segments of one file do not overlap.
        remaining = max(total_duration - offset, 0.0)
        if duration is None:
            segment_duration = remaining
        else:
            segment_duration = min(float(duration), remaining)
        return {
            'audio_filepath': wav_f,
            'duration': segment_duration,
            'offset': offset,
            'text': row['transcript'],
        }
    except Exception as e:
        # Best-effort: report the failure with its cause and let the caller decide.
        print(f"Error processing {wav_f} file!!! ({e})")
        return None
    
# Each noise file is split in two: the first NOISE_SPLIT_SEC seconds go to the
# training manifest, the following NOISE_SPLIT_SEC seconds to the test manifest.
NOISE_SPLIT_SEC = 15

train_rows = []
test_rows = []

with open(iso_noise_list, "r") as in_f:
    for line in in_f:
        data = line.split()
        if not data:
            # Blank line: there is no filename field to read.
            continue
        row = {
            'wav_filename': os.path.join(data_dir, data[-1]),
            'transcript': "-",
        }
        train_entry = process_row(row, 0, NOISE_SPLIT_SEC)
        test_entry = process_row(row, NOISE_SPLIT_SEC, NOISE_SPLIT_SEC)
        # process_row returns None on failure; keep those out of the row lists
        # so they are not written to the manifests as "null" lines.
        if train_entry is not None:
            train_rows.append(train_entry)
        if test_entry is not None:
            test_rows.append(test_entry)





In [26]:
def write_manifest(manifest_file, manifest_lines):
    """Write each item of ``manifest_lines`` to ``manifest_file`` as one JSON document per line.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(manifest_file, 'w') as manifest_out:
        manifest_out.writelines(json.dumps(entry) + '\n' for entry in manifest_lines)

# Manifest locations for the two halves of the noise data, both directly under data_dir.
test_noise_manifest, train_noise_manifest = (
    os.path.join(data_dir, manifest_name)
    for manifest_name in ("test_noise.json", "train_noise.json")
)
# Write the test manifest first, then the training one.
for noise_manifest, noise_rows in (
    (test_noise_manifest, test_rows),
    (train_noise_manifest, train_rows),
):
    write_manifest(noise_manifest, noise_rows)

Now let's create an evaluation set using the an4 test set by adding noise at 0 dB using a script in NeMo.

In [None]:
# Add noise to test set
# The command is assembled as a single string (the backslashes continue the
# string literal across lines) and run through IPython's `!` shell escape.
# Inputs: the AN4 test manifest and the test noise manifest; --snrs=0 requests
# a 0 dB signal-to-noise ratio; results go under {data_dir}/noise_data.
# NOTE(review): the ../../scripts path is relative to the working directory, so
# this presumably expects the notebook to run from inside a NeMo checkout - confirm.
# NOTE(review): the paths are interpolated unquoted; a data_dir containing
# spaces would split the arguments.
run = f"python ../../scripts/dataset_processing/add_noise.py \
    --input_manifest={test_manifest} \
    --noise_manifest={test_noise_manifest} \
    --snrs=0 \
    --out_dir={data_dir}/noise_data"
!{run}
    

In [34]:
# Run NeMo's speech_to_text_eval.py example with the pretrained model
# stt_en_conformer_ctc_medium on the noisy test manifest.
# NOTE(review): the manifest path under noise_data/manifests is presumably the
# file written by the add_noise.py step above - confirm the naming.
# NOTE(review): ../../examples is relative to the working directory, so this
# presumably expects the notebook to run from inside a NeMo checkout - confirm.
run=f"python ../../examples/asr/speech_to_text_eval.py \
    pretrained_name=stt_en_conformer_ctc_medium \
    dataset_manifest={data_dir}/noise_data/manifests/test_manifest_test_noise_0db.json"
!{run}

[NeMo W 2022-04-17 23:54:56 optimizers:55] Apex was not found. Using the lamb or fused_adam optimizer will error out.
[NeMo I 2022-04-17 23:54:57 transcribe_speech:100] Hydra config: model_path: null
    pretrained_name: stt_en_conformer_ctc_medium
    audio_dir: null
    dataset_manifest: /home/jbalam/nemo/clone3/NeMo/tutorials/asr/noise_data/manifests/test_manifest_test_noise_0db.json
    output_filename: evaluation_transcripts.json
    batch_size: 32
    num_workers: 0
    cuda: null
    amp: false
    audio_type: wav
    overwrite_transcripts: true
    rnnt_decoding:
      strategy: greedy_batch
      compute_hypothesis_token_set: false
      preserve_alignments: null
      fused_batch_size: -1
      greedy:
        max_symbols_per_step: 10
        preserve_alignments: false
      beam:
        beam_size: 4
        search_type: default
        score_norm: true
        return_best_hypothesis: true
        tsd_max_sym_exp_per_step: 50
        alsd_max_target_len: 1.0
        nsc_max_

To finetune the pretrained model with noise augmentation we need to add an augmentor to the trainer. In this case we will use Noise_perturbation augmentation to add noise to our training data. To achieve this we will need to add the following lines to our trainer:
```yaml

trainer:
    train_ds:
        augmentor:
            noise:
              prob: 0.1
              manifest_path: "/path/to/train_noise.json"
              min_snr_db: 0
              max_snr_db: 30
```
With the above lines, we added a noise augmentor to our training dataset. With a low augmentation probability (0.1) we make sure that the model has good accuracy on clean data while improving accuracy on noisy data. If you expect your test data to be mostly noisy, increasing the probability will tweak the model's performance to be better for noisy data with a degradation in clean speech accuracy. Also, with the above settings we are adding noise at an SNR that is randomly chosen between 0 and 30 dB; these values can be adjusted according to the expected SNR for a given application.