# Transcriber

## Imports

In [None]:
import os
import shlex
import wave
import numpy as np
import random
from deepspeech import Model, printVersions
from timeit import default_timer as timer
import IPython.display as ipd
import subprocess
# shlex.quote is the supported shell-escaping helper on Python 3.3+; the
# pipes fallback only matters on older interpreters (pipes was removed in 3.13).
try:
    from shlex import quote
except ImportError:
    from pipes import quote
from shutil import copyfile

## Constants

In [None]:
models_dir = '/opt/shared/models/'
audio_dir = '/opt/shared_data/cpm_wav_cut/'
lm_path = '/opt/shared/lm.binary'
trie_path = '/opt/shared/trie'

## Loading a Model
Pick the best model from the models directory (taken here to be the first entry in sorted order) and load it with the specified language model (LM) and trie.

In [None]:
# "Best" is taken to be the first entry of the models directory in sorted order.
best_model = sorted(os.listdir(models_dir))[0]
load_start = timer()
ds = Model(os.path.join(models_dir, best_model), aBeamWidth=1024)
# Attach the language model and trie to the decoder.
# NOTE(review): 0.75 / 1.85 are presumably the LM alpha / beta weights —
# confirm against the DeepSpeech version in use.
ds.enableDecoderWithLM(lm_path, trie_path, 0.75, 1.85)
# Elapsed seconds is "now minus start"; the reverse order yields a negative value.
load_end = timer() - load_start
print('Loaded', best_model, 'in', load_end)
# Sample rate reported by the loaded model.
desired_sr = ds.sampleRate()

Check the audio directory

In [None]:
# Collect every entry of the audio directory and preview the first ten names.
audio_files = os.listdir(audio_dir)
print(audio_files[0:10])

## Resampling function
Inspired by DeepSpeech function "convert_samplerate"

In [None]:
def resample(path, samplerate):
    """Convert an audio file to raw 16-bit mono PCM at ``samplerate`` via sox.

    Args:
        path: Path of the audio file handed to sox as its input.
        samplerate: Target sample rate, passed to sox's ``--rate`` option.

    Returns:
        A NumPy ``int16`` array built from the bytes sox writes to stdout.

    Raises:
        subprocess.CalledProcessError: If sox exits with a non-zero status;
            its stderr output is captured on the exception.
        OSError: If the ``sox`` executable cannot be started.
    """
    # Build argv directly instead of formatting a shell string, quoting it and
    # splitting it again: the path reaches sox verbatim whatever characters it
    # contains, and no shell-quoting helper is needed.
    sox_cmd = [
        'sox', path,
        '--type', 'raw',
        '--bits', '16',
        '--channels', '1',
        '--rate', str(samplerate),
        '--encoding', 'signed-integer',
        '--endian', 'little',
        '--compression', '0.0',
        '--no-dither',
        '-',  # write the converted audio to stdout
    ]
    output = subprocess.check_output(sox_cmd, stderr=subprocess.PIPE)
    return np.frombuffer(output, np.int16)

## Inference test
Picks a random audio file and runs inference on it. You can then listen to the original audio for comparison!

In [None]:
def run_inference(model, audio):
    """Transcribe a WAV file with the given model.

    Args:
        model: Object exposing ``sampleRate()`` and ``stt(samples)``.
        audio: Path to a WAV file; it is also handed to ``resample`` when the
            file's frame rate differs from the model's sample rate.

    Returns:
        Whatever ``model.stt`` returns for the ``int16`` sample array.
    """
    # The context manager closes the WAV handle even if reading fails; the
    # original left it open for the lifetime of the process.
    with wave.open(audio, 'rb') as audio_wave:
        desired_samplerate = model.sampleRate()
        needs_resample = audio_wave.getframerate() != desired_samplerate
        if not needs_resample:
            frames = audio_wave.readframes(audio_wave.getnframes())
            samples = np.frombuffer(frames, np.int16)

    if needs_resample:
        # Rate mismatch: let sox re-read the file and convert it.
        samples = resample(audio, desired_samplerate)

    return model.stt(samples)

In [None]:
# Choose one clip at random, print its transcript, then embed an audio player
# so the recording can be compared with the recognised text.
random_audio_file = random.choice(audio_files)
random_audio_full = os.path.join(audio_dir, random_audio_file)

print(f'{random_audio_file}: ')
print(run_inference(ds, random_audio_full))
display(ipd.Audio(random_audio_full))

## Create dataset sample

In [None]:
target_dir = '/opt/speech2text/dataset_sample'
# makedirs(exist_ok=True) also creates missing parent directories and avoids
# the check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(target_dir, exist_ok=True)

In [None]:
# Never request more files than exist: random.sample raises ValueError when
# the sample size exceeds the population.
num_files = min(100, len(audio_files))
random_files_sample = random.sample(audio_files, num_files)

In [None]:
import csv

# Transcribe every sampled file, record it in data.csv and copy the audio
# next to the CSV.
# NOTE(review): wav_filename records the source path (full_f), not the copy
# placed in target_dir — confirm this is what the dataset consumer expects.
# The with-block closes the CSV even if inference or copying raises;
# newline='' leaves line endings to the csv writer.
with open(os.path.join(target_dir, 'data.csv'), 'w', encoding='utf-8', newline='') as data_csv:
    # csv.writer quotes fields containing commas or quotes, which manual
    # string concatenation would silently corrupt.
    writer = csv.writer(data_csv, lineterminator='\n')
    writer.writerow(['wav_filename', 'wav_filesize', 'transcript'])

    for cnt, f in enumerate(random_files_sample, start=1):
        full_f = os.path.join(audio_dir, f)
        full_target = os.path.join(target_dir, f)
        f_size = os.stat(full_f).st_size
        label = run_inference(ds, full_f)
        writer.writerow([full_f, f_size, label])
        copyfile(full_f, full_target)
        # Progress indicator: files processed so far / total sampled.
        print(cnt, '/', len(random_files_sample))