## Import the modules required to run

In [None]:
import IPython.display as ipd
import librosa
import librosa.display
import soundfile as sf
import matplotlib.pyplot as plt
import numpy as np
import scipy
from openvino.inference_engine import IECore
from os import path

## Settings

In this part, you set up all the variables used later in the notebook.

In [None]:
# Working directories: converted IR files, raw Open Model Zoo downloads, input audio.
model_folder, download_folder, data_folder = "model", "output", "data"

# Model name and precision handed to the Open Model Zoo command-line tools.
model_name, precision = "quartznet-15x5-en", "FP16"
# Extensions of the model files (not referenced by the other visible cells).
model_extensions = "bin", "xml"

## Download models and convert public model

We use `omz_downloader` and `omz_converter`, which are command-line tools from the `openvino-dev` package. `omz_downloader` automatically creates a directory structure and downloads the selected model. This step is skipped if the model is already downloaded. The selected model comes from the public directory, which means it must be converted into Intermediate Representation (IR).

`omz_converter` is needed to convert the pre-trained `PyTorch` model to the ONNX format, which is then converted to the OpenVINO IR format.

If this is your first run, the models will be downloaded and converted here. It might take up to ten minutes.

In [None]:
# Download the model only when its folder is missing from the download
# directory, so re-running the notebook skips this step.
if not path.isdir(f'{download_folder}/public/{model_name}'):
    # Build the full command line first; --num_attempts 3 asks the tool to retry.
    download_command = f"omz_downloader --name {model_name} --output_dir {download_folder} --precision {precision} --num_attempts 3"
    # IPython shell escape: `!` runs the expanded $download_command in a shell.
    ! $download_command


In [None]:
# Convert the downloaded model only when no folder for the requested
# precision exists yet under the model directory.
if not path.isdir(f'{model_folder}/public/{model_name}/{precision}'):
    # Reads from `download_folder` and writes the converted files to `model_folder`.
    convert_command = f"omz_converter --name {model_name} --precisions {precision} --download_dir {download_folder} --output_dir {model_folder}"
    # IPython shell escape: `!` runs the expanded $convert_command in a shell.
    ! $convert_command

## Load audio file

Now that the model files are downloaded and converted, you need to load the audio file.

### Defining constants

The first step is locating the audio file and defining the alphabet used by the model. The model uses the Latin alphabet, beginning with the space symbol, and the CTC blank symbol takes the last index.

In [None]:
# Audio file (looked up inside `data_folder`) that will be transcribed.
audio_file_name = "how_are_you_doing.wav"
# Symbols the model can emit, in output-index order:
# 0 = space, 1-26 = "a"-"z", 27 = apostrophe.
# The CTC blank (index 28) is intentionally NOT part of this string: the
# decoder in this notebook uses len(alphabet) as the blank id, so the last
# character here must be a real symbol, not a blank placeholder.
alphabet = " abcdefghijklmnopqrstuvwxyz'"

### Load audio file

The next step is opening the audio file defined in the previous cell and reading the parameters that let you decide whether the file needs adjustments before it is passed to the preprocessing function.

### All assertions met, what's next?

Now we need to read the audio frames and change the type of the values in the buffer to int16.

In [None]:
# soundfile reports its supported formats as a mapping; iterating it yields
# the format names, which is all that is needed here.
audio_formats = list(sf.available_formats())
print(f"Availble audio formats: {', '.join(audio_formats)}")

In [None]:
# The extension is whatever follows the last dot of the file name; it is
# upper-cased before the membership test against `audio_formats`.
if audio_file_name.rsplit('.', 1)[-1].upper() not in audio_formats:
    raise Exception(f"Invalid file format. Availble formats: {', '.join(audio_formats)}")

In [None]:
# Load the file from the data folder, requesting a 16 kHz sampling rate
# and a single (mono) channel.
audio, sampling_rate = librosa.load(
    path=f'{data_folder}/{audio_file_name}',
    sr=16000,
    mono=True,
)

In [None]:
# Waveform of the loaded signal on its own figure.
# NOTE(review): `waveplot` appears to have been removed in newer librosa
# releases in favour of `waveshow` - confirm against the pinned version.
plt.figure()
librosa.display.waveplot(audio, sr=sampling_rate, max_points=50000.0, x_axis='time', offset=0.0, max_sr=1000);
plt.show()

# Magnitude of the STFT converted to dB, using the maximum as the reference.
specto_audio = librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max)
print(specto_audio.shape)
librosa.display.specshow(specto_audio, sr=sampling_rate);



In [None]:
# Samples whose peak magnitude is at most 1 are treated as normalised floats
# and rescaled to the int16 range before the cast.
# np.max reduces the array natively; the builtin max() would walk every
# sample in the Python interpreter.
if np.max(np.abs(audio)) <= 1:
    audio = audio * (2**15 - 1)
# Cast to 16-bit integers (the fractional part is dropped).
audio = audio.astype(np.int16)

### Here comes magic!

After all those small conversions, we now need to convert our pre-pre-processed audio to a [Mel Spectrum](https://medium.com/analytics-vidhya/understanding-the-mel-spectrogram-fca2afa2ce53). An explanation of why this is done is covered in multiple articles, like [this one](https://towardsdatascience.com/audio-deep-learning-made-simple-part-2-why-mel-spectrograms-perform-better-aad889a93505).

In [None]:
def audio_to_melbasis(audio, sampling_rate):
    """Compute the magnitude STFT of pre-emphasised audio and a mel filter bank.

    Args:
        audio: Array of audio samples, sliced along its first axis
            (the notebook passes a flattened, 1-D array).
        sampling_rate: Sampling rate of ``audio`` in Hz; must be 16000.

    Returns:
        Tuple ``(mel_basis, spec)``: the 64-band mel filter bank built for a
        512-point FFT over 0-8000 Hz, and the magnitude spectrogram of the
        pre-emphasised signal.

    Raises:
        AssertionError: If ``sampling_rate`` is not 16000.
    """
    # Local import: a bare ``import scipy`` does not guarantee that the
    # ``scipy.signal`` subpackage is loaded.
    from scipy.signal.windows import hann

    assert sampling_rate == 16000, "Only 16 KHz audio supported"
    preemph = 0.97
    # Pre-emphasis: y[n] = x[n] - 0.97 * x[n-1]; the first sample is kept as is.
    preemphased = np.concatenate([audio[:1], audio[1:] - preemph * audio[:-1].astype(np.float32)])

    win_length = round(sampling_rate * 0.02)  # 20 ms analysis window
    hop_length = round(sampling_rate * 0.01)  # 10 ms step between frames
    spec = np.abs(
        librosa.stft(
            preemphased,
            n_fft=512,
            hop_length=hop_length,
            win_length=win_length,
            center=True,
            # Explicit window array (scipy's default symmetric Hann), not the name "hann".
            window=hann(win_length),
            pad_mode='reflect',
        )
    )
    # `sr` and `n_fft` are passed by keyword: newer librosa releases make them
    # keyword-only, and older releases accept the keywords as well.
    mel_basis = librosa.filters.mel(
        sr=sampling_rate, n_fft=512, n_mels=64, fmin=0.0, fmax=8000.0, htk=False
    )
    return mel_basis, spec

def melbasis_to_melspectrum(mel_basis, spec, padding=16):
    """Build a normalised log-mel spectrogram with a leading batch axis.

    Args:
        mel_basis: Mel filter bank matrix, multiplied with the power spectrum.
        spec: Magnitude spectrogram; it is squared before the projection.
        padding: The frame axis (axis 1) is zero-padded on the right up to
            the next multiple of this value.

    Returns:
        Array of shape ``(1, n_mels, n_frames_padded)``: every mel band is
        shifted to zero mean and divided by its standard deviation (plus
        1e-5) across frames.
    """
    power_spectrum = np.power(spec, 2)
    # The small offset keeps the logarithm finite for silent bins.
    log_mel = np.log(np.dot(mel_basis, power_spectrum) + 2 ** -24)

    band_mean = log_mel.mean(1)[:, None]
    band_std = log_mel.std(1)[:, None]
    scaled = (log_mel - band_mean) / (band_std + 1e-5)

    leftover = scaled.shape[1] % padding
    if leftover == 0:
        return scaled[None]
    missing_frames = padding - leftover
    return np.pad(scaled, ((0, 0), (0, missing_frames)))[None]

In [None]:
# Flatten the samples to one dimension, then compute the mel filter bank
# and the magnitude spectrogram.
mel_basis, spec = audio_to_melbasis(
    audio=audio.flatten(),
    sampling_rate=sampling_rate,
)

In [None]:
# Show the magnitude spectrogram first (trailing `;` suppresses the cell's
# textual output) ...
librosa.display.specshow(spec, sr=sampling_rate);
plt.show()
# ... then the mel filter bank matrix.
librosa.display.specshow(mel_basis, sr=sampling_rate);

In [None]:
# From here on `audio` no longer holds raw samples: it is rebound to the
# array returned by melbasis_to_melspectrum, which is fed to the network.
audio = melbasis_to_melspectrum(mel_basis=mel_basis, spec=spec)

## Running network

When everything is prepared, you can finally read and load the network. You may choose to run the network on multiple devices. By default, it will load the model on the CPU (you can manually choose CPU, GPU, MYRIAD, etc.), or you can let the engine choose the best available device (AUTO).

To list all available devices that you can use, run the line `print(ie.available_devices)`.

In [None]:
# Inference Engine core object; used below to list devices and to read
# and load the network.
ie = IECore()

In [None]:
# Show the devices the Inference Engine reports as available.
print(ie.available_devices)

In [None]:
# Read the model description (.xml) from the converted-model folder.
net = ie.read_network(
    model=f"{model_folder}/public/{model_name}/{precision}/{model_name}.xml"
)
# Reshape the first network input to the shape of the prepared `audio`
# array; this must happen before the network is loaded onto a device.
net.reshape({next(iter(net.input_info)): audio.shape})
# Load the reshaped network onto the CPU device.
exec_net = ie.load_network(network=net, device_name="CPU")

### Run the inference!

Everything is set up. The only thing remaining is passing the input to the previously loaded network and running inference!

In [None]:
# Name of the first input of the loaded network.
input_layer_ir = next(iter(exec_net.input_info))

# Run inference on the prepared `audio`; only the values of the returned
# mapping are kept (a later cell takes the first of them).
character_probabilities = exec_net.infer({input_layer_ir: audio}).values()

### Read output

After inference, you need to read the output. The default output of `quartznet 15x5` contains per-frame probabilities (after LogSoftmax) for every symbol in the alphabet. The output is named `output`, has shape `1, 64, 29`, and uses the data format `B, N, C`, where:

* B - batch size
* N - number of audio frames
* C - alphabet size, including the CTC blank symbol

You need to convert it into a more human-readable format. To do this, take the symbol with the highest probability for each frame. Once you hold the list of indexes predicted to have the highest probability, [CTC Decoding](https://towardsdatascience.com/beam-search-decoding-in-ctc-trained-neural-networks-5a889a3d85a7) requires you to collapse consecutive repeated symbols and then remove all the blanks.

The last step is getting the symbols for the corresponding indexes from the alphabet.

In [None]:
# Take the first output of the network, drop its unnecessary size-1
# dimensions, then keep, for every row along axis 0, the index of the
# highest-scoring symbol (argmax over axis 1).
character_probabilities = np.squeeze(next(iter(character_probabilities))).argmax(axis=1)

### Implementation of CTC Decoding

To decode the output explained above, we need a [CTC decode](https://towardsdatascience.com/beam-search-decoding-in-ctc-trained-neural-networks-5a889a3d85a7) function.

In [None]:
def ctc_greedy_decode(predictions):
    """Turn per-frame symbol indexes into text using greedy CTC rules.

    A frame contributes a character only when its index differs from the
    previous frame's index and is not the blank id, which is
    ``len(alphabet)``.

    Args:
        predictions: Iterable of integer symbol indexes, one per frame.

    Returns:
        The decoded string, built from the module-level ``alphabet``.
    """
    blank_id = len(alphabet)
    decoded = []
    # Start as if a blank preceded the first frame.
    last_seen = blank_id
    for current in predictions:
        repeated = current == last_seen
        if not repeated and current != blank_id:
            decoded.append(alphabet[current])
        last_seen = current
    return ''.join(decoded)

### Run CTC decoding and print output.

In [None]:
# Decode the per-frame symbol indexes into text and show the result.
transcription = ctc_greedy_decode(character_probabilities)
print(transcription)