In [None]:
#@title Install Libraries for Quartznet
!pip install git+https://github.com/cehorn/GLRM.git 
!pip3 install nemo-toolkit  # installs NeMo Core
!pip3 install nemo-asr # installs NeMo ASR collection
!pip3 install nemo-nlp # installs NeMo NLP collection
!pip3 install nemo-tts # installs NeMo TTS collection
!pip3 install kaldiio
!pip3 install pillow>=4.3.0
!pip3 install torch
!pip3 install ipython[all]
!pip3 install tqdm
!pip3 install sox
!pip3 install ruamel.yaml
!pip3 install jupyterlab
!pip3 install tqdm
!pip3 install boto3
!pip3 install requests
!pip3 install six
!pip3 install ipdb
!pip3 install h5py
!pip3 install html2text
!pip3 install nltk
!pip3 install progressbar
!pip3 install matplotlib
!pip3 install wget 
!pip3 install tensorboardX
!pip3 install pandas
!pip3 install onnx
!pip3 install wget
!pip3 install num2words
!pip3 install librosa
!pip3 install inflect
!pip3 install kaldi-io
!pip3 install marshmallow
!pip3 install unidecode
!pip3 install sentencepiece
!pip3 install boto3
!pip3 install matplotlib
!pip3 install h5py
!pip3 install youtokentome
!pip3 install pydub
!pip3 install frozendict
!pip3 install pyannote.core
!pip3 install pyannote.metrics
!pip3 install g2p_en


# Importing the model

In [None]:
import nemo.collections.asr as nemo_asr
# Load the pretrained checkpoint registered under the name "stt_de_quartznet15x5";
# `model` is the object every later cell configures, trains and saves.
model = nemo_asr.models.ASRModel.from_pretrained(model_name="stt_de_quartznet15x5")

# To start from a model you trained yourself, use nemo_asr.models.ASRModel.restore_from() instead.

In [None]:
#@title Download playlist from Youtube
!git clone https://gitlab.com/Jaco-Assistant/corcua.git
!pip3 install -e corcua/
from corcua.corcua import downloaders
import os
link = (
            "https://www.youtube.com/watch?v=erDUXM8mCS8&list=UUwRH985XgMYXQ6NxXDo8npw"
        )
path = os.path.join("dataset", "kurzgesagt") #set path for download
downloaders.youtube.Downloader().download_dataset(
            path=path, overwrite=True, args={"link": link, "lang": "de"}
        )



In [None]:
#@title Installing Dependencies for Data processing
!sudo apt-get install libasound2-plugins libasound2-python libsox-fmt-all
!sudo apt-get install -y sox
!sudo apt-get install sox libsox-fmt-mp3
!pip3 install num2words

In [None]:
#@title Defining Processing Functions

import json
import os
from typing import List
import re
import tqdm
import sox
from sox import Transformer
from pathlib import Path
from num2words import num2words

# ==================================================================================================

def process(x,output_wav_path): #transform the audio into mono, wav and 16000Hz
    """Convert one audio file to 16 kHz mono and report the source duration.

    Args:
        x: Path of the input audio file.
        output_wav_path: Path the converted file is written to.

    Returns:
        The value of ``sox.file_info.duration`` for the *input* file ``x``.
    """
    converter = Transformer()
    converter.rate(samplerate=16000)  # resample to 16000 Hz
    converter.channels(n_channels=1)  # downmix to a single channel
    print(x,"\n",output_wav_path)
    converter.build(input_filepath=x, output_filepath=output_wav_path)
    return sox.file_info.duration(x)

def _spell_out_numbers(text):
    """Replace each run of digits in ``text`` with its German spelling ("12" -> "zwölf")."""
    return re.sub(r"\d+", lambda match: num2words(int(match.group()), lang="de"), text)


def _write_manifest(entries, manifest_path):
    """Write ``entries`` to ``manifest_path`` as one JSON object per line."""
    with open(manifest_path, mode="w", encoding="utf-8") as manifest_file:
        for entry in tqdm.tqdm(entries):
            record = {
                "audio_filepath": os.path.abspath(entry["audio_filepath"]),
                "duration": entry["duration"],
                "text": entry["text"],
            }
            manifest_file.write(json.dumps(record, ensure_ascii=False) + "\n")


def load_dataset(args): #loads the downloaded dataset and splits it and save it
    """Convert a downloaded dataset to wav files and write train/dev/test manifests.

    Reads ``<path>/alignment.json`` (a mapping of audio file path -> transcript),
    converts every usable clip with ``process`` into ``<save_dir>/wav``, normalizes
    the transcript, and writes ``train_manifest.json``, ``dev_manifest.json`` and
    ``test_manifest.json`` into ``save_dir`` using a 90 % / ~5 % / 5 % split in
    file order. The keys of ``alignment.json`` are passed to ``process`` as-is.

    A clip is left out when its transcript starts with ``*``, ``(`` or ``[``,
    when conversion fails, when it is shorter than 0.5 s, when its numbers
    cannot be spelled out, or when nothing is left of the transcript after
    normalization.

    Args:
        args: Dict with the keys ``"path"`` (directory containing
            ``alignment.json``) and ``"save_dir"`` (output directory).

    Returns:
        List of the kept entries, each a dict with ``audio_filepath``,
        ``duration`` and ``text``.

    Raises:
        AttributeError: If ``"path"`` or ``"save_dir"`` is missing from ``args``.
    """
    # Validate both keys up front; "save_dir" used to fail later with a KeyError.
    missing = [key for key in ("path", "save_dir") if key not in args]
    if missing:
        raise AttributeError("Some arguments are missing: " + ", ".join(missing))

    print("\nLoading transcripts ...")
    align_path = os.path.join(args["path"], "alignment.json")
    with open(align_path, "r", encoding="utf-8") as file:
        aligns = json.load(file)

    # Keep only digits, ASCII letters, whitespace and German umlauts/eszett.
    chars_to_ignore_regex = r"[^0-9a-zA-Z\söÖüÜäÄß]+"

    # Created once, and also when there are no clips, so the manifests can
    # always be written into save_dir.
    output_path = os.path.join(args["save_dir"], "wav")
    os.makedirs(output_path, exist_ok=True)

    dataset = []
    dropped = 0
    failed = 0
    for file_path, transcript in tqdm.tqdm(aligns.items()):
        # Drop files including signs for notes (checked first so that no
        # audio conversion is spent on clips that are discarded anyway).
        if transcript.startswith(("*", "(", "[")):
            dropped += 1
            continue

        file_name = os.path.splitext(os.path.basename(file_path))[0]
        output_wav_path = os.path.join(output_path, file_name + ".wav")
        try:
            duration = process(file_path, output_wav_path)
        except Exception:
            # Best effort: an unreadable/unconvertible clip is skipped, not fatal.
            failed += 1
            continue

        # Drop files that are too short.
        # NOTE(review): the None guard assumes sox may report no duration - confirm.
        if duration is None or duration < 0.5:
            continue

        # Replace numbers with words using whole numbers (12 -> zwölf); the
        # previous per-character replacement produced "einszwei".
        try:
            text = _spell_out_numbers(transcript)
        except Exception:
            continue

        # Remove special characters, lowercase, and trim before the emptiness
        # check so whitespace-only transcripts are dropped as well.
        text = re.sub(chars_to_ignore_regex, "", text).lower().strip()
        if not text:
            continue

        dataset.append(
            {
                "audio_filepath": output_wav_path,
                "duration": duration,
                "text": text,
            }
        )

    # Split boundaries are absolute indices: the former ``dataset[-test_i:]``
    # returned the WHOLE dataset whenever test_i was 0 (fewer than 20 entries).
    total = len(dataset)
    train_end = int(0.9 * total)
    test_count = int(0.05 * total)
    dev_end = total - test_count

    save_dir = args["save_dir"]
    _write_manifest(dataset[:train_end], os.path.join(save_dir, "train_manifest.json"))
    _write_manifest(dataset[train_end:dev_end], os.path.join(save_dir, "dev_manifest.json"))
    _write_manifest(dataset[dev_end:], os.path.join(save_dir, "test_manifest.json"))

    msg = "Dropped {}/{} files with notes"
    print(msg.format(dropped, len(aligns)))
    if failed:
        print("Skipped {}/{} files that could not be converted".format(failed, len(aligns)))

    return dataset

def read_manifest(path): #for reading the dataset
    """Load a JSON-lines manifest file.

    Args:
        path: Path of the manifest; every line must hold one JSON document.

    Returns:
        List with the decoded object of each line, in file order.
    """
    with open(path, 'r',encoding="utf-8") as manifest_file:
        progress = tqdm.tqdm(manifest_file, desc="Reading manifest data")
        # The newline is removed before decoding each line.
        return [json.loads(raw_line.replace("\n", "")) for raw_line in progress]
from collections import defaultdict

def get_charset(manifest_data): #for getting characters appearing in files
    """Count how often each character occurs in the manifest transcripts.

    Args:
        manifest_data: Iterable of manifest rows, each with a ``'text'`` entry.

    Returns:
        ``defaultdict(int)`` mapping character -> number of occurrences.
    """
    counts = defaultdict(int)
    rows = tqdm.tqdm(manifest_data, desc="Computing character set")
    for entry in rows:
        for symbol in entry['text']:
            counts[symbol] = counts[symbol] + 1
    return counts

In [None]:
# load_dataset takes a dict with two keys:
#   "path"     - directory of the downloaded dataset
#   "save_dir" - directory that receives the processed dataset
ds = load_dataset(
    {
        "path": "dataset/kurzgesagt/",
        "save_dir": "datasets/kurzgesagt",
    }
)

Reading the dataset and checking the characters

In [None]:
# Read the three manifests (train, dev, test - in that order).
train_manifest_data, dev_manifest_data, test_manifest_data = (
    read_manifest(f"datasets/kurzgesagt/{split}_manifest.json")
    for split in ("train", "dev", "test")
)

# Character frequencies per split.
train_charset, dev_charset, test_charset = map(
    get_charset,
    (train_manifest_data, dev_manifest_data, test_manifest_data),
)

# Show the train and dev character sets so they can be compared.
print(train_charset, "\n", dev_charset)

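Set `freeze_encoder` to `True` if the dataset is relatively small (less than about 700 hours); otherwise set it to `False`. Freezing applies to the encoder only, not the decoder; the cell below also keeps the encoder's batch-normalization and squeeze-excite layers trainable.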

In [None]:
import torch
import torch.nn as nn

def enable_bn_se(m):
    """Put ``m`` back into training mode if it is a BatchNorm1d or SqueezeExcite module.

    Intended for ``module.apply(enable_bn_se)``: a module whose type is exactly
    ``nn.BatchNorm1d``, or whose class name contains ``'SqueezeExcite'``, is
    switched to train mode and all of its parameters get ``requires_grad=True``.
    Every other module is left untouched.
    """
    module_type = type(m)
    is_batch_norm = module_type == nn.BatchNorm1d
    is_squeeze_excite = 'SqueezeExcite' in module_type.__name__
    if not (is_batch_norm or is_squeeze_excite):
        return

    m.train()
    for weight in m.parameters():
        weight.requires_grad_(True)
# Toggle: True freezes the encoder, False un-freezes it.
freeze_encoder = True

if not freeze_encoder:
    model.encoder.unfreeze()
    # logging.info("Model encoder has been un-frozen")
else:
    # Freeze the whole encoder, then re-enable its BatchNorm1d / SqueezeExcite modules.
    model.encoder.freeze()
    model.encoder.apply(enable_bn_se)
    # logging.info("Model encoder has been frozen, and batch normalization has been unfrozen")

Settings for the models

In [None]:
import copy
from omegaconf import OmegaConf, open_dict

# Work on a copy of the model config; only the dataset sections are edited here.
cfg = copy.deepcopy(model.cfg)
train_path = "datasets/kurzgesagt/train_manifest.json"
dev_path = "datasets/kurzgesagt/dev_manifest.json"
test_path = "datasets/kurzgesagt/test_manifest.json"

with open_dict(cfg):
    # Train dataset: train and dev manifests concatenated (comma-separated).
    cfg.train_ds.manifest_filepath = ",".join([train_path, dev_path])
    cfg.train_ds.batch_size = 32

    # Validation dataset: the test manifest, since training uses train + dev.
    cfg.validation_ds.manifest_filepath = test_path
    cfg.validation_ds.batch_size = 8

    # Options shared by both datasets. `labels` is deliberately not overridden
    # (to use the dataset's own characters: ds_cfg.labels = list(train_charset)).
    for ds_cfg in (cfg.train_ds, cfg.validation_ds):
        ds_cfg.normalize_transcripts = False
        ds_cfg.num_workers = 8
        ds_cfg.pin_memory = True
        ds_cfg.trim_silence = True

# Hand the edited dataset sections to the model.
model.setup_training_data(cfg.train_ds)
model.setup_multiple_validation_data(cfg.validation_ds)

Settings for the optimizer

In [None]:
# Optimizer and scheduler overrides, written into the model's own config.
optim_cfg = model.cfg.optim
with open_dict(optim_cfg):
    optim_cfg.lr = 0.01
    optim_cfg.betas = [0.95, 0.5]  # from paper
    optim_cfg.weight_decay = 0.001  # Original weight decay

    # Scheduler: warm up by ratio instead of by a fixed number of steps.
    optim_cfg.sched.warmup_steps = None  # Remove default number of steps of warmup
    optim_cfg.sched.warmup_ratio = 0.05  # 5 % warmup
    optim_cfg.sched.min_lr = 1e-5

Settings for the augmentations (increase for small datasets)

In [None]:
 with open_dict(model.cfg.spec_augment):
   model.cfg.spec_augment.freq_masks = 2
   model.cfg.spec_augment.freq_width = 25
   model.cfg.spec_augment.time_masks = 2
   model.cfg.spec_augment.time_width = 0.05

model.spec_augmentation = model.from_config_dict(model.cfg.spec_augment)
model._wer.use_cer = True
model._wer.log_prediction = True

Settings for the trainer

In [None]:
import torch
import pytorch_lightning as ptl

# Use one GPU when CUDA is available, otherwise none.
gpus = 1 if torch.cuda.is_available() else 0

EPOCHS = 150  # No less than 100 epochs

# Checkpointing and logging are switched off on the trainer itself.
trainer = ptl.Trainer(
    gpus=gpus,
    max_epochs=EPOCHS,
    accumulate_grad_batches=1,
    checkpoint_callback=False,
    logger=False,
    log_every_n_steps=5,
    check_val_every_n_epoch=10,  # validate once every 10 epochs
)

# Attach the trainer to the model.
model.set_trainer(trainer)

# Finally, update the model's internal config.
model.cfg = model._cfg

In [None]:
from nemo.utils import exp_manager

# Environment variable generally used for multi-node multi-gpu training.
# In notebook environments, this flag is unnecessary and can cause logs of multiple training runs to overwrite each other.
# NEMO_EXPM_VERSION is generally used for multi-node multi-gpu training.
# In notebook environments it is unnecessary and can cause logs of multiple
# training runs to overwrite each other, so it is removed if present.
os.environ.pop('NEMO_EXPM_VERSION', None)

# Checkpointing: monitor "val_wer" (lower is better) and save .nemo files.
checkpoint_params = exp_manager.CallbackParams(
    monitor="val_wer",
    mode="min",
    always_save_nemo=True,
    save_best_model=True,
)

# Experiment layout: results are written below experiments/ under this name.
config = OmegaConf.structured(
    exp_manager.ExpManagerConfig(
        exp_dir='experiments/',
        name="ASR-Model-Language",
        checkpoint_callback_params=checkpoint_params,
    )
)

logdir = exp_manager.exp_manager(trainer, config)

In [None]:
try:
  from google import colab
  COLAB_ENV = True
except (ImportError, ModuleNotFoundError):
  COLAB_ENV = False

# Load the TensorBoard notebook extension
if COLAB_ENV:
  %load_ext tensorboard
  %tensorboard --logdir /content/experiments/
else:
  print("To use tensorboard, please use this notebook in a Google Colab environment.")

Train

In [None]:
# Start training: runs `model` with the trainer created in the trainer-settings cell.
trainer.fit(model)

In [None]:
# Save the model to a .nemo file in the current working directory.
save_path = "Model-de.nemo"
model.save_to(save_path)
print(f"Model saved at path : {os.getcwd()}{os.path.sep}{save_path}")

# Inference on a *file*

In [None]:
# Convert INPUTFILE.wav to 16-bit PCM (pcm_s16le), mono (-ac 1), 16000 Hz (-ar 16000) and write it
# to OUTFILE.wav; -y overwrites an existing output and -loglevel panic silences ffmpeg's output.
!ffmpeg -loglevel panic -y -i INPUTFILE.wav -acodec pcm_s16le -ac 1 -ar 16000 OUTFILE.wav  # this transform the sound file to mono and 16000Hz 


In [None]:
# Transcribe the converted file; being the last expression, the result is displayed as the cell output.
model.transcribe(paths2audio_files=["OUTFILE.wav"], batch_size=1, logprobs=False)