# NeMo offline ASR
Steps:

* transcribe an audio file (offline ASR) with greedy decoder
* extract timestamps information from the model to split audio into separate words
* use beam search decoder with N-gram language model re-scoring


In [None]:
try:
    from plotly import graph_objects as go
except ModuleNotFoundError:
    !pip install plotly
    from plotly import graph_objects as go

import os
import ipywidgets
from plotly import graph_objects as go
import ctc_decoders
import nemo.collections.asr as nemo_asr
import glob
import torchaudio
import soundfile as sf

import numpy as np
# Import audio processing library
import librosa
# We'll use this to listen to audio
from IPython.display import Audio, display

In [None]:
# Show the pre-trained checkpoints that EncDecCTCModel reports as available.
# NOTE(review): the model below is loaded through EncDecCTCModelBPE, not
# EncDecCTCModel - confirm this listing is the one you want to browse.
nemo_asr.models.EncDecCTCModel.list_available_models()

## Instantiate pre-trained NeMo model
The ``from_pretrained(...)`` API downloads a model from the cloud and initializes it directly.

Alternatively, ``restore_from(...)`` loads a model from disk.

To display the pre-trained models available from the cloud, use the ``list_available_models()`` method.

Let's load the pre-trained Conformer-CTC model ``stt_de_conformer_ctc_large``.

In [None]:
# Download and initialise the pre-trained "stt_de_conformer_ctc_large"
# checkpoint through the EncDecCTCModelBPE class.
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_de_conformer_ctc_large")

## Get test audio clip

Let's pick a test audio file from a local directory and analyze its signal.

In [None]:
# Local directory that is searched for *.wav recordings.
audio_directory = '/data/voice/SZ_parliament/raw/parts'

In [None]:
# Collect every .wav file directly inside audio_directory.
# glob returns matches in arbitrary order, so sort them: later cells pick
# files[0] and should get the same recording on every run.
files = sorted(glob.glob(os.path.join(audio_directory, '*.wav')))
print(files)

In [None]:
# Take the first discovered recording and write a copy of it at target_sr
# to 'test.wav', which is the file the transcription cells read.
if not files:
    raise FileNotFoundError(f'No .wav files found in {audio_directory!r}')
AUDIO_FILENAME = files[0]
target_sr = 16000

# sr=None keeps the file's native sampling rate, so (y, sr) describe the
# original signal; they are reused later for playback and plotting.
y, sr = librosa.load(AUDIO_FILENAME, sr=None)
if sr != target_sr:
    y_resampled = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
else:
    # Already at the target rate: reuse the signal unchanged so that
    # y_resampled is always defined for the write below.
    y_resampled = y
sf.write('test.wav', y_resampled, target_sr)

In [None]:
# transcribe() takes a list of paths; here it holds only the resampled clip.
test_file = ['test.wav']

# One result is returned per input file, so [0] is the text for test.wav.
transcript = asr_model.transcribe(paths2audio_files=test_file)[0]
print(f'Transcript: "{transcript}"')


In [None]:
# display audio player for the signal
# (y and sr are the samples and sampling rate returned by librosa.load,
# i.e. the original recording, not the resampled copy in test.wav)
display(Audio(data=y, rate=sr))

In [None]:
# Waveform view: amplitude against time in seconds, where the time axis is
# the sample index divided by the sampling rate.
fig_signal = go.Figure(
    go.Scatter(
        x=np.arange(len(y)) / sr,
        y=y,
        line=dict(color='green'),
        name='Waveform',
        hovertemplate='Time: %{x:.2f} s<br>Amplitude: %{y:.2f}<br><extra></extra>',
    ),
    layout=dict(
        height=300,
        xaxis=dict(title='Time, s'),
        yaxis=dict(title='Amplitude'),
        title='Audio Signal',
        margin={'l': 0, 'r': 0, 't': 40, 'b': 0, 'pad': 0},
    ),
)
fig_signal.show()

In [None]:
# calculate amplitude spectrum
# The audio was loaded as (y, sr); those are the names used here.
time_stride = 0.01
# number of samples between successive STFT frames (10 ms hop)
hop_length = int(sr * time_stride)
n_fft = 512
# linear scale spectrogram
s = librosa.stft(y=y,
                 n_fft=n_fft,
                 hop_length=hop_length)
# power spectrogram in dB relative to its maximum, limited to a 100 dB range
s_db = librosa.power_to_db(np.abs(s)**2, ref=np.max, top_db=100)

# plot the signal in frequency domain
fig_spectrum = go.Figure(
    go.Heatmap(z=s_db,
               colorscale=[
                   [0, 'rgb(30,62,62)'],
                   [0.5, 'rgb(30,128,128)'],
                   [1, 'rgb(30,255,30)'],
               ],
               colorbar=dict(
                   ticksuffix=' dB'
               ),
               # dx: seconds per frame; dy: kHz per frequency bin
               dx=time_stride, dy=sr/n_fft/1000,
               name='Spectrogram',
               hovertemplate='Time: %{x:.2f} s<br>Frequency: %{y:.2f} kHz<br>Magnitude: %{z:.2f} dB<extra></extra>'),
    layout={
        'height': 300,
        'xaxis': {'title': 'Time, s'},
        'yaxis': {'title': 'Frequency, kHz'},
        'title': 'Spectrogram',
        'margin': dict(l=0, r=0, t=40, b=0, pad=0),
    }
)
fig_spectrum.show()

## Offline inference
If we have an entire audio clip available, then we can do offline inference with a pre-trained model to transcribe it.

The easiest way to do it is to call the ASR model's ``transcribe(...)`` method, which can transcribe multiple files in a batch.

In [None]:
# Take the first discovered recording, write a copy of it at target_sr to
# 'test.wav', and transcribe that copy.
if not files:
    raise FileNotFoundError(f'No .wav files found in {audio_directory!r}')
AUDIO_FILENAME = files[0]
target_sr = 16000

# sr=None keeps the file's native sampling rate, so (y, sr) describe the
# original signal.
y, sr = librosa.load(AUDIO_FILENAME, sr=None)
if sr != target_sr:
    y_resampled = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
else:
    # Already at the target rate: reuse the signal unchanged so that
    # y_resampled is always defined for the write below.
    y_resampled = y
sf.write('test.wav', y_resampled, target_sr)

# transcribe() takes a list of paths and returns one result per file.
test_file = ['test.wav']

transcript = asr_model.transcribe(paths2audio_files=test_file)[0]
print(f'Transcript: "{transcript}"')

## Extract timestamps and split words
``transcribe()`` produces text by applying a CTC greedy decoder to the model's raw per-timestep probability distribution over the characters of its alphabet. We can obtain those raw outputs by passing the ``logprobs=True`` argument.

In [None]:
# softmax implementation in NumPy
def softmax(logits):
    """Return the softmax of ``logits`` taken over the last axis.

    Args:
        logits: array-like of scores; any rank >= 1. For the usual
            (timesteps, classes) matrix each row is normalised on its own.

    Returns:
        NumPy array with the same shape as ``logits`` whose entries along the
        last axis are non-negative and sum to 1.
    """
    logits = np.asarray(logits)
    # Shift by the maximum of each row rather than the global maximum: a row
    # far below the global maximum would otherwise underflow to all zeros
    # and produce 0/0 = NaN.
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    e = np.exp(shifted)
    # keepdims makes the division broadcast for any rank, not only 2-D input
    return e / e.sum(axis=-1, keepdims=True)

# let's do inference once again but without decoder
# (logprobs=True asks transcribe() for the raw per-timestep model outputs
# instead of decoded text; [0] selects the result for the first file)
logits = asr_model.transcribe(test_file, logprobs=True)[0]
# normalise each timestep's scores into a distribution over the labels
probs = softmax(logits)
print(probs)

In [None]:
# Label list for the heatmap rows: the decoder vocabulary followed by a
# trailing 'blank' entry, with the first entry renamed to 'space'.
# NOTE(review): assumes vocabulary index 0 is the space character - confirm
# for the loaded model.
labels = [*asr_model.decoder.vocabulary, 'blank']
labels[0] = 'space'

# Heatmap of per-timestep label probabilities (labels on y, time on x).
# NOTE(review): time_stride is not set in this cell; the only earlier
# assignment is the 0.01 s spectrogram hop - confirm it matches the duration
# of one model output timestep.
fig_probs = go.Figure(
    go.Heatmap(
        z=probs.T,
        colorscale=[
            [0, 'rgb(30,62,62)'],
            [1, 'rgb(30,255,30)'],
        ],
        y=labels,
        dx=time_stride,
        name='Probs',
        hovertemplate='Time: %{x:.2f} s<br>Character: %{y}<br>Probability: %{z:.2f}<extra></extra>',
    ),
    layout=dict(
        height=300,
        xaxis=dict(title='Time, s'),
        yaxis=dict(title='Characters'),
        title='Character Probabilities',
        margin={'l': 0, 'r': 0, 't': 40, 'b': 0, 'pad': 0},
    ),
)
fig_probs.show()

In [None]:
# softmax implementation in NumPy
def softmax(logits):
    """Return the softmax of ``logits`` taken over the last axis.

    Args:
        logits: array-like of scores; any rank >= 1. For the usual
            (timesteps, classes) matrix each row is normalised on its own.

    Returns:
        NumPy array with the same shape as ``logits`` whose entries along the
        last axis are non-negative and sum to 1.
    """
    logits = np.asarray(logits)
    # Shift by the maximum of each row rather than the global maximum: a row
    # far below the global maximum would otherwise underflow to all zeros
    # and produce 0/0 = NaN.
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    e = np.exp(shifted)
    # keepdims makes the division broadcast for any rank, not only 2-D input
    return e / e.sum(axis=-1, keepdims=True)

# let's do inference once again but without decoder
# Only the first file's output is used, so pass just that file instead of
# running the model over every recording in the directory.
logits = asr_model.transcribe(files[:1], logprobs=True)[0]
# normalise each timestep's scores into a distribution over the labels
probs = softmax(logits)

# 20ms is duration of a timestep at output of the model
# NOTE(review): confirm this value for the checkpoint loaded above; it
# scales the time axis of the plot and the word cut positions below.
time_stride = 0.02

# Label list for the heatmap rows: the decoder vocabulary followed by a
# trailing 'blank' entry, with the first entry renamed to 'space'.
# NOTE(review): assumes vocabulary index 0 is the space character - confirm
# for the loaded model.
labels = [*asr_model.decoder.vocabulary, 'blank']
labels[0] = 'space'

# Heatmap of per-timestep label probabilities (labels on y, time on x);
# dx converts the timestep index into seconds.
fig_probs = go.Figure(
    go.Heatmap(
        z=probs.T,
        colorscale=[
            [0, 'rgb(30,62,62)'],
            [1, 'rgb(30,255,30)'],
        ],
        y=labels,
        dx=time_stride,
        name='Probs',
        hovertemplate='Time: %{x:.2f} s<br>Character: %{y}<br>Probability: %{z:.2f}<extra></extra>',
    ),
    layout=dict(
        height=300,
        xaxis=dict(title='Time, s'),
        yaxis=dict(title='Characters'),
        title='Character Probabilities',
        margin={'l': 0, 'r': 0, 't': 40, 'b': 0, 'pad': 0},
    ),
)
fig_probs.show()

It is easy to identify the timesteps that correspond to the space character.

In [None]:
# get timestamps for space symbols
# Result: a list of [first_timestep, last_timestep] pairs, one per run of
# timesteps whose most probable label is the space (blank timesteps inside
# a run do not end it).
spaces = []

# NOTE(review): index 0 is treated as the space character - confirm for the
# loaded model's vocabulary.
space_idx = 0
# 'blank' is appended after the vocabulary when the labels are built, so it
# is the last column of probs (28 for a 28-character alphabet).
blank_idx = probs.shape[1] - 1

state = ''
idx_state = 0

if np.argmax(probs[0]) == space_idx:
    state = 'space'

for idx in range(1, probs.shape[0]):
    current_char_idx = np.argmax(probs[idx])
    # a real (non-space, non-blank) label closes the current space run
    if state == 'space' and current_char_idx != space_idx and current_char_idx != blank_idx:
        spaces.append([idx_state, idx-1])
        state = ''
    # outside a run, a space label opens a new one
    if state == '':
        if current_char_idx == space_idx:
            state = 'space'
            idx_state = idx

# a run still open at the end extends to the last timestep
if state == 'space':
    spaces.append([idx_state, probs.shape[0]-1])

Then we can split the original audio signal into separate words. It is worth mentioning that all timestamps have a delay (or offset) that depends on the model. We need to take it into account for alignment.

In [None]:
# calibration offset for timestamps: 180 ms
offset = -0.18

# split the transcript into words
words = transcript.split()

# cut words
# Each space run marks the end of one word: the midpoint of the run (in
# timesteps) is converted to seconds with time_stride and shifted by the
# calibration offset. The audio was loaded as (y, sr).
pos_prev = 0.0
for word, (first_step, last_step) in zip(words, spaces):
    display(word)
    boundary = offset + (first_step + last_step) / 2 * time_stride
    # The negative offset can push a boundary before the previous cut (or
    # before 0, which would turn into a from-the-end slice); never go back.
    pos_end = max(boundary, pos_prev)
    segment = y[int(pos_prev * sr):int(pos_end * sr)]
    if len(segment):
        display(Audio(segment, rate=sr))
    pos_prev = pos_end

# Whatever follows the last space is the final word. Guard the lookup so an
# empty `spaces` list or a word/space count mismatch does not raise.
if len(words) > len(spaces):
    display(words[len(spaces)])
tail = y[int(pos_prev * sr):]
if len(tail):
    display(Audio(tail, rate=sr))

## Offline inference with beam search decoder and N-gram language model re-scoring

It is possible to use an external [KenLM](https://kheafield.com/code/kenlm/)-based N-gram language model to rescore multiple transcription candidates. 

Let's download and preprocess the LibriSpeech 3-gram language model.

In [None]:
import gzip
import os
import shutil

import wget

# Prepare the n-gram language model in three cached steps; each step is
# skipped when its output file already exists in the working directory.
# NOTE(review): this is the LibriSpeech LM while the checkpoint loaded above
# is named "stt_de_..." - confirm the LM language matches the model.

# Step 1: download the gzipped ARPA file.
lm_gzip_path = '3-gram.pruned.1e-7.arpa.gz'
if not os.path.exists(lm_gzip_path):
    print('Downloading pruned 3-gram model.')
    lm_url = 'http://www.openslr.org/resources/11/3-gram.pruned.1e-7.arpa.gz'
    lm_gzip_path = wget.download(lm_url)
    print('Downloaded the 3-gram language model.')
else:
    print('Pruned .arpa.gz already exists.')

# Step 2: decompress it.
uppercase_lm_path = '3-gram.pruned.1e-7.arpa'
if not os.path.exists(uppercase_lm_path):
    with gzip.open(lm_gzip_path, 'rb') as f_zipped, \
            open(uppercase_lm_path, 'wb') as f_unzipped:
        shutil.copyfileobj(f_zipped, f_unzipped)
    print('Unzipped the 3-gram language model.')
else:
    print('Unzipped .arpa already exists.')

# Step 3: write a lower-cased copy, line by line.
# (presumably so the LM tokens match the case of the model vocabulary - confirm)
lm_path = 'lowercase_3-gram.pruned.1e-7.arpa'
if not os.path.exists(lm_path):
    # explicit encoding so the result does not depend on the platform default
    with open(uppercase_lm_path, 'r', encoding='utf-8') as f_upper, \
            open(lm_path, 'w', encoding='utf-8') as f_lower:
        for line in f_upper:
            f_lower.write(line.lower())
    print('Converted language model file to lowercase.')
else:
    # only report a conversion when one actually happened
    print('Lowercase .arpa already exists.')

Let's instantiate the ``BeamSearchDecoderWithLM`` module.

In [None]:
# Build the beam-search decoder over the model's vocabulary, scored with the
# n-gram language model stored at lm_path.
beam_search_lm = nemo_asr.modules.BeamSearchDecoderWithLM(
    vocab=list(asr_model.decoder.vocabulary),
    beam_width=16,
    alpha=2, beta=1.5,
    lm_path=lm_path,
    # os.cpu_count() returns None when the CPU count cannot be determined,
    # and max(None, 1) raises TypeError; fall back to a single worker.
    num_cpus=os.cpu_count() or 1,
    input_tensor=False)

Now we can check all transcription candidates along with their scores.

In [None]:
# Decode with the LM: probs gets a leading axis of size 1 so that it is
# passed as a batch holding a single utterance.
beam_search_lm.forward(
    log_probs=probs[np.newaxis, ...],
    log_probs_length=None,
)