In [1]:
from pydub import AudioSegment
from IPython.display import Audio, display
from ipywebrtc import CameraStream, AudioRecorder
import json
import ipywidgets
from nemo.jasper_infer import initialize_model

# All settings for building the inference modules, gathered in one place.
# NOTE(review): alpha/beta look like language-model decoding weights — confirm
# against nemo.jasper_infer.initialize_model.
INFERENCE_SETTINGS = {
    'model_config': '/code/hebrew-stt/jasper_checkpoints/quartznet_orig/quartznet15x5_custom.yaml',
    'load_dir': '/code/hebrew-stt/jasper_checkpoints/quartznet_multi_gpu_31.84_WER',
    'amp_opt_level': 'O0',
    'lm_path': '/code/corpus/combined.wiki.opensubtitles.bin',
    'alpha': 0.72,
    'beta': 0.8,
    'beam_width': 128,
}

# `modules` is reused by every later call to jasper_infer.infer.
modules = initialize_model(**INFERENCE_SETTINGS)

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



[NeMo I 2020-07-09 19:37:34 features:144] PADDING: 16
[NeMo I 2020-07-09 19:37:34 features:152] STFT using conv
[NeMo I 2020-07-09 19:38:18 jasper_infer:99] Number of parameters in encoder: 18894656
[NeMo I 2020-07-09 19:38:18 jasper_infer:100] Number of parameters in decoder: 25625
[NeMo I 2020-07-09 19:38:18 jasper_infer:101] Total number of parameters in model: 18920281


In [10]:
import nemo.jasper_infer as jasper_infer
import importlib
import traceback
import ipywidgets
# Re-execute the jasper_infer module so that edits made to it on disk are
# picked up without restarting the kernel. Names bound earlier with
# `from nemo.jasper_infer import ...` keep pointing at the pre-reload objects.
importlib.reload(jasper_infer)


def render_result(title, value):
    """Build the right-to-left HTML snippet shown in a result widget.

    Args:
        title: heading for the panel; inserted into the markup verbatim.
        value: text to show under the heading. ``None`` and the empty string
            are both rendered as the ``~~~ EMPTY ~~~`` placeholder.

    Returns:
        An HTML string containing a ``<div style="direction: rtl">`` wrapper
        with the heading and the (escaped) text.
    """
    import html  # local import so the function does not rely on another cell

    # Treat None like "no text" instead of failing on len(None).
    text = '' if value is None else str(value)

    # Escape markup characters so text containing '<', '>' or '&' is shown
    # literally rather than being parsed as HTML by the widget.
    shown = html.escape(text, quote=False) if text else "~~~ EMPTY ~~~"

    return f'''
    <div style="direction: rtl">
        <h3 style="font-weight: bold;">{title}</h3>
        <div style="font-size: 1.4em">{shown}</div>
    </div>'''


def on_change(change):
    error.value = ''
    
    try:
        greedy_result.value = render_result('Greedy decoding', '~~~ LOADING ~~~')
        beam_search_result.value = render_result('Beam search decoding', '~~~ LOADING ~~~')


        recorder.save('/tmp/recording.wav')
        !ffmpeg -hide_banner -loglevel panic -y -i /tmp/recording.wav -ar 16000 /tmp/recording_16k.wav
        !cp "/tmp/recording_16k.wav" "recordings/recording.$(ls recordings | wc -l).$(date +%Y-%m-%d_%H-%M-%S).wav"

        infer()
    except Exception as e:
        tb = traceback.format_exc()
        error.value = f'''<pre>{tb}</pre>'''

        
def infer(_button=None):
    """Transcribe ``/tmp/recording_16k.wav`` and show both decodings.

    Calls ``jasper_infer.infer(modules, ...)`` and writes the ``'greedy'`` and
    ``'beam'`` entries of its result into the corresponding result widgets.
    Any exception is rendered into the ``error`` widget instead of propagating.

    Args:
        _button: ignored. ``Button.on_click`` invokes its callback with the
            clicked button as a positional argument, so the function must
            accept one; the default keeps the plain ``infer()`` call working.
    """
    import html  # local import so the function does not rely on another cell

    # Drop any error left over from a previous attempt so a successful re-run
    # does not keep showing a stale traceback.
    error.value = ''

    try:
        result = jasper_infer.infer(modules, '/tmp/recording_16k.wav')
        greedy_result.value = render_result('Greedy decoding', result['greedy'])
        beam_search_result.value = render_result('Beam search decoding', result['beam'])
    except Exception:
        # Escape the traceback: frames such as "<module>" would otherwise be
        # parsed as HTML tags and disappear from the rendered output.
        error.value = f'''<pre>{html.escape(traceback.format_exc())}</pre>'''
        
    
# --- Output panels: error text first, then the two decoding results. ---
error = ipywidgets.HTML()
greedy_result = ipywidgets.HTML()
beam_search_result = ipywidgets.HTML()
display(error, greedy_result, beam_search_result)

# Start both result panels in their "empty" state.
greedy_result.value = render_result('Greedy decoding', '')
beam_search_result.value = render_result('Beam search decoding', '')

# --- Audio-only capture stream and its recorder. ---
# NOTE(review): the browser MediaTrackConstraints key is spelled `sampleRate`;
# confirm whether `sample_rate` is honoured. on_change resamples to 16 kHz
# with ffmpeg regardless.
camera = CameraStream(
    constraints={
        'audio': {'sample_rate': 16000},
        'video': False,
    },
)
recorder = AudioRecorder(stream=camera)
# Run on_change whenever the recorded audio's `value` changes.
recorder.audio.observe(on_change, names='value')
display(recorder)

# --- Button to re-run inference on the last saved recording. ---
reinfer_button = ipywidgets.Button(description='Redo inference')
reinfer_button.on_click(infer)
display(reinfer_button)

HTML(value='')

HTML(value='')

HTML(value='')

AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': {'sample_rate':…

Button(description='Redo inference', style=ButtonStyle())

In [3]:
# recorder.save('/tmp/recording.wav')
# !ffmpeg -hide_banner -loglevel panic -y -i /tmp/recording.wav -ar 16000 /tmp/recording_16k.wav
# !KALDI_ROOT=/tmp python /code/nemo/nemo/jasper_infer.py \
# --input_wav /tmp/recording_16k.wav \
# --model_config /code/hebrew-stt/jasper_checkpoints/quartznet_orig/quartznet15x5_custom.yaml \
# --load_dir /code/hebrew-stt/jasper_checkpoints/quartznet_multi_gpu_31.84_WER \
# --amp_opt_level O0 \
# --lm_path /code/hebrew-stt/lm/wiki.bin \
# --alpha 0.76 \
# --beta 0.72 \
# ;