## Import the modules required to run

In [None]:
try:
    import librosa
except OSError:
    import sys
    import types
    sys.modules['soundfile'] = types.ModuleType('fake_soundfile')
    import librosa

import numpy as np
import scipy
import wave
from openvino.inference_engine import IECore
from os import path, makedirs, listdir
from shutil import copy

## Settings

In this section, set up all the variables used throughout the rest of the notebook.

In [None]:
# Folder the model files are copied into and the network is later read from
model_folder = "model"
# Working folder passed to omz_downloader / omz_converter as download and output directory
download_folder = "output"
# Folder that contains the input audio file
data_folder = "data"

# Precision passed to omz_downloader / omz_converter; also a path component of the converted model
precision = "FP16"
model_name = "quartznet-15x5-en"
# File extensions that must exist in model_folder for the download/convert step to be skipped
model_extensions = ("bin", "xml")

## Download models and convert public model

We use `omz_downloader` and `omz_converter`, which are command-line tools from the `openvino-dev` package. `omz_downloader` automatically creates a directory structure and downloads the selected model. This step is skipped if the model is already downloaded. The selected model comes from the public directory, which means it must be converted into Intermediate Representation (IR).

`omz_converter` is needed to convert the pre-trained `PyTorch` model to the OpenVINO IR format.

On the first run, the model is downloaded and converted here. This might take up to ten minutes.

In [None]:
makedirs(download_folder, exist_ok=True)

# Check whether every required model file (one per extension) already exists in model_folder
for extension in model_extensions:
    if not path.isfile(f'{model_folder}/{model_name}.{extension}'):
        download_command = f"omz_downloader --name {model_name} --output_dir {download_folder} --precision {precision} --num_attempts 3"
        convert_command = f"omz_converter --name {model_name} --precisions {precision} --download_dir {download_folder} --output_dir {download_folder}"
        # Run the commands: first download the model, then convert it
        ! $download_command
        # Both tools write into download_folder, so files that are not needed stay out of model_folder
        ! $convert_command
        # One missing file is enough to trigger download + conversion; no need to check the rest
        break


## Copy models to model folder

At this point the downloaded and converted model files are kept in `download_folder` (named `output` by default). We need only the `.bin` and `.xml` files from there, which we copy to `model_folder` (named `model` by default).

In [None]:
# Create the destination folder for the model files; a no-op if it already exists
makedirs(model_folder, exist_ok=True)

In [None]:
# Folder where the converter placed the files for the selected precision
ir_folder = f"{download_folder}/public/{model_name}/{precision}"

if path.isdir(ir_folder):
    # Copy every converted file into model_folder
    for file_name in listdir(ir_folder):
        copy(src=f"{ir_folder}/{file_name}", dst=model_folder)
elif not all(path.isfile(f"{model_folder}/{model_name}.{extension}") for extension in model_extensions):
    # Nothing to copy from and the model is not in model_folder either: fail with a clear message
    raise FileNotFoundError(
        f"Converted model not found in '{ir_folder}' and '{model_folder}' does not contain "
        f"{model_name} files; re-run the download and conversion cell."
    )
# Otherwise the download step was skipped because model_folder already holds the model files,
# so a missing download folder is not an error.

## Load audio file

Now that the model files are downloaded and converted, you need to load the audio file.

### Defining constants

The first step is locating the audio file and defining the alphabet used by the model. In this case you will use the Latin alphabet, beginning with the space symbol and ending with an apostrophe.

In [None]:
# Name of the WAV file to transcribe, looked up inside data_folder
audio_file_name = "how_are_you_doing.wav"
# Characters the decoder can emit, indexed by class id; ctc_greedy_decode treats
# the id equal to len(alphabet) as the CTC blank symbol
alphabet = " abcdefghijklmnopqrstuvwxyz'"

### Load audio file

The next step is to open the audio file defined in the previous cell and read its parameters, which let you decide whether the file needs adjustments before it is passed to the preprocessing function.

In [None]:
# Example of usage
wave_read = wave.open(f'{data_folder}/{audio_file_name}')
channel_num, sample_width, sampling_rate, pcm_length, compression_type, _ = wave_read.getparams()

### Assertions about audio file

For this model we can use audio files that meet these requirements:
* 16-bit WAV PCM
* without any compression type (linear PCM WAV)
* single channel (mono WAV PCM)
* 16 KHz audio

In [None]:
# Sample width is reported in bytes, so 2 bytes per sample means 16-bit audio
assert sample_width == 2, "Only 16-bit WAV PCM supported"
# 'NONE' is the compression type reported for uncompressed (linear PCM) files
assert compression_type == 'NONE', "Only linear PCM WAV files supported"
assert channel_num == 1, "Only mono WAV PCM supported"
# audio_to_melspectrum asserts the same sampling rate
assert sampling_rate == 16000, "Only 16 KHz audio supported"

In [None]:
# readframes() counts frames (one sample per channel), so pcm_length frames is the whole file
raw_frames = wave_read.readframes(pcm_length)
# The handle is not needed after the samples are read; close it so the file is not left open
wave_read.close()

# Interpret the raw bytes as 16-bit samples and arrange them as (frames, channels)
audio = np.frombuffer(raw_frames, dtype=np.int16)
audio = audio.reshape((pcm_length, channel_num))

In [None]:
def audio_to_melspectrum(audio, sampling_rate, padding=16):
    """Convert a mono waveform into a normalized log-mel spectrogram.

    Processing steps: pre-emphasis, STFT magnitude (20 ms Hann window, 10 ms hop,
    512-point FFT), projection of the power spectrum onto 64 mel bands covering
    0-8000 Hz, natural logarithm, per-band mean/std normalization, and zero-padding
    of the time axis up to a multiple of ``padding``.

    Args:
        audio: 1-D array of audio samples (the notebook passes flattened int16 PCM).
        sampling_rate: Sampling rate of ``audio`` in Hz; must be 16000.
        padding: The number of time frames in the result is rounded up to a
            multiple of this value.

    Returns:
        Array with a leading axis of size 1 added, i.e. shape
        ``(1, 64, n_frames)`` for 1-D input.

    Raises:
        AssertionError: If ``sampling_rate`` is not 16000.
    """
    assert sampling_rate == 16000, "Only 16 KHz audio supported"
    preemph = 0.97
    # Pre-emphasis filter y[t] = x[t] - 0.97 * x[t-1]; the first sample is kept unchanged
    preemphased = np.concatenate([audio[:1], audio[1:] - preemph * audio[:-1].astype(np.float32)])

    n_fft = 512
    win_length = round(sampling_rate * 0.02)  # 20 ms analysis window
    hop_length = round(sampling_rate * 0.01)  # 10 ms step between frames
    spec = np.abs(librosa.core.spectrum.stft(preemphased, n_fft=n_fft, hop_length=hop_length,
        win_length=win_length, center=True, window=scipy.signal.windows.hann(win_length), pad_mode='reflect'))
    # sr and n_fft are passed by keyword: they are keyword-only in librosa >= 0.10,
    # and keywords are accepted by older releases as well
    mel_basis = librosa.filters.mel(sr=sampling_rate, n_fft=n_fft, n_mels=64, fmin=0.0, fmax=8000.0, htk=False)
    # 2 ** -24 keeps the logarithm finite for silent frames
    log_melspectrum = np.log(np.dot(mel_basis, np.power(spec, 2)) + 2 ** -24)

    # Normalize every mel band over time; 1e-5 guards against division by zero
    normalized = (log_melspectrum - log_melspectrum.mean(1)[:, None]) / (log_melspectrum.std(1)[:, None] + 1e-5)
    remainder = normalized.shape[1] % padding
    if remainder != 0:
        # Zero-pad the end of the time axis up to the next multiple of `padding`
        return np.pad(normalized, ((0, 0), (0, padding - remainder)))[None]
    return normalized[None]

def ctc_greedy_decode(pred, labels=None):
    """Greedy (best-path) CTC decoding of a single sequence of per-frame scores.

    For every time frame the highest-scoring class is taken; consecutive repeats
    are collapsed and the blank class is dropped.

    Args:
        pred: Array of scores whose last axis is the class axis and which has at
            most one other axis of size greater than 1 (the time axis), e.g.
            ``(1, T, C)`` or ``(T, C)``.
        labels: Sequence mapping class id to character. The blank class id is
            ``len(labels)``. Defaults to the notebook-level ``alphabet``.

    Returns:
        The decoded transcription as a string.

    Raises:
        ValueError: If ``pred`` does not describe exactly one sequence.
    """
    if labels is None:
        labels = alphabet

    scores = np.asarray(pred)
    if scores.ndim < 2 or sum(dim != 1 for dim in scores.shape[:-1]) > 1:
        raise ValueError(f"Expected scores for a single sequence, got shape {scores.shape}")
    # Flatten to (time, classes) explicitly; np.squeeze would also drop the time
    # axis when the prediction contains a single frame
    frame_scores = scores.reshape(-1, scores.shape[-1])

    blank_id = len(labels)
    prev_id = blank_id
    transcription = []
    for idx in frame_scores.argmax(axis=1):
        # Emit a character only when it differs from the previous frame and is not blank
        if prev_id != idx != blank_id:
            transcription.append(labels[idx])
        prev_id = idx
    return ''.join(transcription)

In [None]:
# Convert the PCM samples into the spectrogram features used as network input.
# NOTE: `audio` is rebound from samples to the spectrogram here, so re-running this cell
# without re-running the loading cell would feed the spectrogram back into the function.
audio = audio_to_melspectrum(audio.flatten(), sampling_rate)

In [None]:
ie = IECore()

# Read the network description from the .xml file in model_folder
net = ie.read_network(
    model=f"{model_folder}/{model_name}.xml"
)
# Reshape the first network input to the shape of the spectrogram computed above
net.reshape({next(iter(net.input_info)): audio.shape})
# Load the reshaped network onto the CPU device
exec_net = ie.load_network(net, "CPU")

# Name of the first input of the loaded network, used as the key when running inference
input_layer_ir = next(iter(exec_net.input_info))

In [None]:
# Run inference on the spectrogram; the result maps output names to output arrays
inference_outputs = exec_net.infer({input_layer_ir: audio})

# Take the first output array as the per-frame character scores
character_probs = next(iter(inference_outputs.values()))

In [None]:
# Decode the per-frame scores into text with greedy CTC decoding and show the result
transcription = ctc_greedy_decode(character_probs)
print(transcription)