### Imports and stuff

In [None]:
# Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg libsox-fmt-mp3
!pip install text-unidecode
!pip install matplotlib>=3.3.2

## Install NeMo
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]


In [None]:
import os
import glob
import subprocess
import tarfile
import wget
import copy
from omegaconf import OmegaConf, open_dict

In [None]:
data_dir = 'datasets/'

# Create the working directories that do not exist yet.
for required_dir in (data_dir, "scripts"):
  if not os.path.exists(required_dir):
    os.makedirs(required_dir)

In [None]:
import nemo
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.utils import logging, exp_manager

### Dataset stuff

#### Getting the dataset from Hugging Face and converting it to NeMo manifest format

In [None]:
# Log in to the Hugging Face Hub. The dataset conversion commands further down
# are run with use_auth_token=True, so presumably they rely on this login.
from huggingface_hub import login
login()

In [None]:
VERSION = "lemorim/noisy-dataset" # dataset name (passed to the converter as `path=`)
LANGUAGE = "default" # dataset language (passed to the converter as `name=`)
# The conversion commands use `output_dir=datasets/$LANGUAGE`, so the manifests are
# looked up under that same directory instead of `datasets/<VERSION>`.
# NOTE(review): assumes the converter writes `<output_dir>/<split>/...` - confirm
# against the files actually produced by convert_hf_dataset_to_nemo.py.
manifest_dir = os.path.join('datasets', LANGUAGE)

In [None]:
# Download the convert_hf_dataset_to_nemo.py script from the NeMo repository,
# unless a file with that name is already present in the working directory.
if not os.path.exists("convert_hf_dataset_to_nemo.py"):
    !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/speech_recognition/convert_hf_dataset_to_nemo.py

In [None]:
!python convert_hf_dataset_to_nemo.py \
    output_dir=datasets/$LANGUAGE \
    path=$VERSION \
    name=$LANGUAGE \
    split="train" \
    ensure_ascii=False \
    use_auth_token=True

!python convert_hf_dataset_to_nemo.py \
    output_dir=datasets/$LANGUAGE \
    path=$VERSION \
    name=$LANGUAGE \
    split="validation" \
    ensure_ascii=False \
    use_auth_token=True

!python convert_hf_dataset_to_nemo.py \
    output_dir=datasets/$LANGUAGE \
    path=$VERSION \
    name=$LANGUAGE \
    split="test" \
    ensure_ascii=False \
    use_auth_token=True

In [None]:
# Every split's manifest path has the same shape; only the split name varies.
manifest_path_template = "{root}/{split}/{split}_lemorim_noisy-dataset_manifest.json"

train_manifest = manifest_path_template.format(root=manifest_dir, split="train")
dev_manifest = manifest_path_template.format(root=manifest_dir, split="validation")
test_manifest = manifest_path_template.format(root=manifest_dir, split="test")

#### Preparing dataset for training

In [None]:
# Manifest Utils
from tqdm.auto import tqdm
from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
import json


def write_processed_manifest(data, original_path):
    """Write `data` next to `original_path`, adding a `_processed` filename suffix.

    Args:
        data: Manifest entries; handed unchanged to `write_manifest`.
        original_path: Path of the manifest the entries were read from.

    Returns:
        The path that was written, e.g. `dir/train.json` -> `dir/train_processed.json`.
    """
    target_dir, original_name = os.path.split(original_path)
    # Insert the suffix just before the extension. A plain `.replace(".json", ...)`
    # rewrites every ".json" occurrence in the name and, when the name contains no
    # ".json" at all, leaves it untouched - which would overwrite the source manifest.
    stem, extension = os.path.splitext(original_name)
    filepath = os.path.join(target_dir, f"{stem}_processed{extension}")
    write_manifest(filepath, data)
    print(f"Finished writing manifest: {filepath}")
    return filepath

In [None]:
# Read the three manifests in the order train, validation, test.
train_manifest_data, dev_manifest_data, test_manifest_data = (
    read_manifest(manifest_path)
    for manifest_path in (train_manifest, dev_manifest, test_manifest)
)

In [None]:
# Keep only the 'text' field of every entry, one list per split.
train_text, dev_text, test_text = (
    [entry['text'] for entry in split_entries]
    for split_entries in (train_manifest_data, dev_manifest_data, test_manifest_data)
)

#### Removing special characters

In [None]:
# Preprocessing steps
import re
import unicodedata

# Raw string literal: the pattern is made of backslash-escaped characters that are
# not valid Python string escapes, which Python reports as invalid escape sequences
# in a normal string literal. The compiled pattern is unchanged.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\…\{\}\【\】\・\。\『\』\、\ー\〜]'  # remove special character tokens
# kanji_removal_regex = '[' + "".join([f"\{token}" for token in extra_kanji]) + ']'  # remove test set kanji


def remove_special_characters(data):
    """Strip ignored characters from a manifest entry's transcript.

    Every character matched by `chars_to_ignore_regex` is removed from
    `data["text"]`; the result is lower-cased and stripped of surrounding
    whitespace.

    Args:
        data: Manifest entry (dict) with a "text" key. It is modified in place.

    Returns:
        The same `data` dict, with the cleaned "text".
    """
    data["text"] = re.sub(chars_to_ignore_regex, '', data["text"]).lower().strip()
    return data

# Processing pipeline
def apply_preprocessors(manifest, preprocessors):
    """Run each preprocessor, in order, over every entry of `manifest`.

    Entries are replaced inside `manifest` itself, and that same list is
    returned. One progress bar is shown per preprocessor.
    """
    for step in preprocessors:
        progress = tqdm(range(len(manifest)), desc=f"Applying {step.__name__}")
        for position in progress:
            current_entry = manifest[position]
            manifest[position] = step(current_entry)

    print("Finished processing manifest !")
    return manifest

In [None]:
# List of pre-processing functions
# Each function receives one manifest entry and returns the processed entry;
# apply_preprocessors runs them in the order listed here.
PREPROCESSORS = [
    remove_special_characters,
]

In [None]:
# Load manifests
train_data, dev_data, test_data = map(
    read_manifest, (train_manifest, dev_manifest, test_manifest)
)

# Apply preprocessing
# (apply_preprocessors rewrites the list it is given and returns that same list)
train_data_processed, dev_data_processed, test_data_processed = (
    apply_preprocessors(split_entries, PREPROCESSORS)
    for split_entries in (train_data, dev_data, test_data)
)

# Write new manifests
train_manifest_cleaned, dev_manifest_cleaned, test_manifest_cleaned = (
    write_processed_manifest(split_entries, source_manifest)
    for split_entries, source_manifest in (
        (train_data_processed, train_manifest),
        (dev_data_processed, dev_manifest),
        (test_data_processed, test_manifest),
    )
)


### Actually finetuning

In [None]:
# list all asr models available
# nemo_asr.models.ASRModel.list_available_models()

#### Character encoding CTC Model

In [None]:
# Load the pretrained "stt_en_conformer_ctc_large" model by name, with map_location='cpu'.
# NOTE(review): the section heading calls this a character-encoding model - confirm
# that this checkpoint is character-based rather than subword (BPE) based.
char_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_conformer_ctc_large", map_location='cpu')

In [None]:
# First training data configuration, collected in one dict before it is handed
# to the model. It reads the cleaned training manifest only.
initial_train_ds_config = dict(
    manifest_filepath=[train_manifest_cleaned],
    sample_rate=16000,
    batch_size=1,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
    use_start_end_token=False,
    trim_silence=False,
    max_duration=20.0,
    min_duration=0.1,
    is_tarred=False,
    shuffle_n=2048,
    bucketing_strategy="synced_randomized",
    bucketing_batch_size=[34, 30, 26, 22, 18, 16, 12, 8],
)

char_model.setup_training_data(train_data_config=initial_train_ds_config)


#### Setting up data loaders

In [None]:
# Deep copy of the model config, so changes made to `cfg` are not made to char_model.cfg itself.
cfg = copy.deepcopy(char_model.cfg)

In [None]:
# Setup train, validation, test configs

# Characters that occur in the cleaned train + dev transcripts. The set is built
# here because nothing earlier in the notebook defines `train_dev_set`; without it
# a fresh "Restart & Run All" stops at this cell with a NameError.
train_dev_set = {
  char
  for entry in (*train_data_processed, *dev_data_processed)
  for char in entry["text"]
}
# Sorted so the label order is identical on every run (set iteration order is not).
# NOTE(review): presumably these labels have to agree with the model's output
# vocabulary - confirm, and update the model's vocabulary first if they differ.
dataset_labels = sorted(train_dev_set)

with open_dict(cfg):
  # Train dataset  (Concatenate train manifest cleaned and dev manifest cleaned)
  cfg.train_ds.manifest_filepath = f"{train_manifest_cleaned},{dev_manifest_cleaned}"
  cfg.train_ds.labels = list(dataset_labels)
  cfg.train_ds.normalize_transcripts = False
  cfg.train_ds.batch_size = 16
  cfg.train_ds.num_workers = 8
  cfg.train_ds.pin_memory = True
  cfg.train_ds.trim_silence = True

  # Validation dataset  (Use test dataset as validation, since we train using train + dev)
  cfg.validation_ds.manifest_filepath = test_manifest_cleaned
  cfg.validation_ds.labels = list(dataset_labels)
  cfg.validation_ds.normalize_transcripts = False
  cfg.validation_ds.batch_size = 8
  cfg.validation_ds.num_workers = 8
  cfg.validation_ds.pin_memory = True
  cfg.validation_ds.trim_silence = True

In [None]:
# setup data loaders with new configs
# Training data is set up from cfg.train_ds and validation data from cfg.validation_ds.
char_model.setup_training_data(cfg.train_ds)
char_model.setup_multiple_validation_data(cfg.validation_ds)

#### Setting up optimizer and scheduler

In [None]:
# Original optimizer + scheduler
# Shown as YAML, taken from the model's current `optim` config section.
print(OmegaConf.to_yaml(char_model.cfg.optim))

In [None]:
# Override optimizer and scheduler settings directly on the model's config.
optim_cfg = char_model.cfg.optim
with open_dict(optim_cfg):
  # Optimizer settings
  optim_cfg.lr = 0.01
  optim_cfg.betas = [0.95, 0.5]  # from paper
  optim_cfg.weight_decay = 0.001  # Original weight decay

  # Scheduler settings
  sched_cfg = optim_cfg.sched
  sched_cfg.warmup_steps = None  # Remove default number of steps of warmup
  sched_cfg.warmup_ratio = 0.05  # 5 % warmup
  sched_cfg.min_lr = 1e-5

#### Setting up augmentation

In [None]:
# Show the model's `spec_augment` config section as YAML.
print(OmegaConf.to_yaml(char_model.cfg.spec_augment))

In [None]:
# Build an object from the `spec_augment` config and assign it to the model's `spec_augmentation` attribute.
char_model.spec_augmentation = char_model.from_config_dict(char_model.cfg.spec_augment)

#### Setup metrics

In [None]:
# Both metric switches are enabled and copied onto the model's `wer` object.
use_cer = log_prediction = True

wer_metric = char_model.wer
wer_metric.use_cer = use_cer
wer_metric.log_prediction = log_prediction

#### Setup Trainer and experiment manager

In [None]:
import torch
import pytorch_lightning as ptl

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'

EPOCHS = 30  # 100 epochs would provide better results, but would take an hour to train

# Trainer arguments gathered in one place. Checkpointing and logging are switched
# off on the Trainer itself.
trainer_options = dict(
    devices=1,
    accelerator=accelerator,
    max_epochs=EPOCHS,
    accumulate_grad_batches=1,
    enable_checkpointing=False,
    logger=False,
    log_every_n_steps=5,
    check_val_every_n_epoch=10,
)
trainer = ptl.Trainer(**trainer_options)

# Setup model with the trainer
char_model.set_trainer(trainer)

# Finally, update the model's internal config
char_model.cfg = char_model._cfg

In [None]:
# NEMO_EXPM_VERSION is generally used for multi-node multi-gpu training.
# In notebook environments it is unnecessary and can cause logs of multiple
# training runs to overwrite each other, so it is dropped if present.
os.environ.pop('NEMO_EXPM_VERSION', None)

# Checkpoint callback settings: monitor "val_wer" with mode "min".
checkpoint_params = exp_manager.CallbackParams(
    monitor="val_wer",
    mode="min",
    always_save_nemo=True,
    save_best_model=True,
)

# Experiment manager config, converted to an OmegaConf structured config.
config = OmegaConf.structured(
    exp_manager.ExpManagerConfig(
        exp_dir=f'experiments/lang-{LANGUAGE}/',
        name=f"ASR-Char-Model-Language-{LANGUAGE}",
        checkpoint_callback_params=checkpoint_params,
    )
)

logdir = exp_manager.exp_manager(trainer, config)

In [None]:
try:
  from google import colab
  COLAB_ENV = True
except (ImportError, ModuleNotFoundError):
  COLAB_ENV = False

# Load the TensorBoard notebook extension
if COLAB_ENV:
  %load_ext tensorboard
  %tensorboard --logdir /content/experiments/lang-$LANGUAGE/ASR-Char-Model-Language-$LANGUAGE/
else:
  print("To use tensorboard, please use this notebook in a Google Colab environment.")

In [None]:
%%time
# Run training of char_model with the trainer; %%time has to stay the first line of the cell.
trainer.fit(char_model)

### Save final model

In [None]:
# The file name is the last "/"-separated part of VERSION followed by LANGUAGE.
# Single quotes are used inside the braces: reusing the enclosing double quotes
# within an f-string expression is a SyntaxError before Python 3.12.
save_path = f"{VERSION.split('/')[-1]}_{LANGUAGE}.nemo"
char_model.save_to(save_path)
print(f"Model saved at path : {os.getcwd() + os.path.sep + save_path}")