In [None]:
# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: MIT

# ASR API tutorial

This tutorial demonstrates how to use the Riva Python API.

## <font color="blue">Server</font>

Before running the client part of Riva, please set up a server. The simplest
way to do this is to follow the
[quick start guide](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/quick-start-guide.html#local-deployment-using-quick-start-scripts).


## <font color="blue">Authentication</font>

Before using Riva services, you need to establish a connection with a server.

In [None]:
import riva.client

# Address of the Riva server the client connects to.
uri = "localhost:50051"  # Default value

# Connection object; it is handed to service constructors later in the notebook.
auth = riva.client.Auth(uri=uri)

## <font color="blue">Setting up service</font>

To instantiate a service, pass a `riva.client.Auth` instance to its constructor.

In [None]:
# ASR service bound to the connection created above; used for both offline and streaming recognition.
asr_service = riva.client.ASRService(auth)

For speech recognition you will need to create a recognition config (an instance of `riva.client.RecognitionConfig`). 
A detailed description of config fields is available in Riva 
[documentation](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/reference/protos/riva_asr.proto.html?highlight=max%20alternatives#riva-proto-riva-asr-proto).
If you intend to use streaming recognition, an offline config has to be wrapped into `riva.client.StreamingRecognitionConfig`.


In [None]:
from copy import deepcopy
# Recognition settings used for offline (whole-file) requests.
offline_config = riva.client.RecognitionConfig(
    encoding=riva.client.AudioEncoding.LINEAR_PCM,
    max_alternatives=1,
    enable_automatic_punctuation=True,
    verbatim_transcripts=False,
)
# The streaming config wraps a deep copy of the offline config, so the two objects
# can be modified independently in the following cells.
streaming_config = riva.client.StreamingRecognitionConfig(config=deepcopy(offline_config), interim_results=True)

You also need to set the frame rate and the number of channels of the audio that is going to be processed. If you'd like to process the file `data/examples/en-US_AntiBERTa_for_word_boosting_testing.wav`, then your code will be

In [None]:
my_wav_file = '../data/examples/en-US_AntiBERTa_for_word_boosting_testing.wav'
# Apply the audio specs of the WAV file to the offline config first, then to the streaming one.
for cfg in (offline_config, streaming_config):
    riva.client.add_audio_file_specs_to_config(cfg, my_wav_file)

If you intend to use word boosting, use the convenience method `riva.client.add_word_boosting_to_config()` to add boosting parameters to a config.

In [None]:
boosted_lm_words = ['AntiBERTa', 'ABlooper']
boosted_lm_score = 20.0
# Register the same boosted words and score on both configs (offline first, then streaming).
for cfg in (offline_config, streaming_config):
    riva.client.add_word_boosting_to_config(cfg, boosted_lm_words, boosted_lm_score)

In [None]:
# Inspect the offline config after the audio specs and word boosting were added.
print(offline_config)

In [None]:
# Inspect the streaming config after the audio specs and word boosting were added.
print(streaming_config)

## <font color="blue">Offline</font>

To run offline speech recognition, read the data from a file and pass it to the service.

In [None]:
# Load the complete content of the WAV file as raw bytes.
with open(my_wav_file, mode='rb') as wav_fh:
    data = wav_fh.read()

In [None]:
# Run offline recognition on the raw file bytes using the offline config.
response = asr_service.offline_recognize(data, offline_config)

In [None]:
# Show the full recognition response.
print(response)

To extract a transcript you may use

In [None]:
# Transcript of the first alternative of the first result.
print(response.results[0].alternatives[0].transcript)

In [None]:
# Confidence reported for the first alternative of the first result.
print(response.results[0].alternatives[0].confidence)

### <font color="green">Asynchronous calls</font>

You can recognize speech asynchronously by setting `future=True` in `ASRService.offline_recognize()`.

In [None]:
from time import time

# Number of recognition requests issued in each of the timing runs (synchronous and asynchronous).
num_repeats = 10

In [None]:
# Issue `num_repeats` blocking requests one after another and time the whole run.
start_time = time()
sync_transcripts = [
    asr_service.offline_recognize(data, offline_config).results[0].alternatives[0].transcript
    for _ in range(num_repeats)
]
print(f"Time spent on synchronous recognition: {time() - start_time:.2f}")

In [None]:
# Submit all requests first (future=True returns a future for each one), then collect the results.
start_time = time()
futures = [
    asr_service.offline_recognize(data, offline_config, future=True)
    for _ in range(num_repeats)
]
async_transcripts = [
    pending.result().results[0].alternatives[0].transcript
    for pending in futures
]
print(f"Time spent on async recognition: {time() - start_time:.2f}")

In [None]:
# Both call styles must produce the same transcripts, in the same order.
assert sync_transcripts == async_transcripts

## <font color="blue">Streaming</font>

To imitate audio streaming use `riva.client.AudioChunkFileIterator`. You can imitate realtime audio by providing a delay callback to the iterator.

In [None]:
wav_parameters = riva.client.get_wav_file_parameters(my_wav_file)
# corresponds to 1 second of audio
chunk_size = wav_parameters['framerate']
# `sleep_audio_length` is used as the delay callback to imitate real-time audio.
with riva.client.AudioChunkFileIterator(
    my_wav_file, chunk_size, delay_callback=riva.client.sleep_audio_length,
) as audio_chunk_iterator:
    # Print the index and the length of every chunk produced by the iterator.
    for i, chunk in enumerate(audio_chunk_iterator):
        print(i, len(chunk))

The audio chunks are then passed to `ASRService.streaming_response_generator()`, which creates a response generator.

In [None]:
# Chunk size is 4800 here; no delay callback is given.
# The iterator is deliberately not wrapped in `with`: the generator built from it is consumed in the following cells.
audio_chunk_iterator = riva.client.AudioChunkFileIterator(my_wav_file, 4800)
response_generator = asr_service.streaming_response_generator(audio_chunk_iterator, streaming_config)

You can find a description of the streaming response (`StreamingRecognizeResponse`) fields in the Riva [documentation](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/reference/protos/riva_asr.proto.html?highlight=max%20alternatives#riva-proto-riva-asr-proto).

In [None]:
# Pull the first response from the generator; the responses that follow stay in the generator.
streaming_response = next(response_generator)

To show streaming results, it is convenient to use the function `riva.client.print_streaming()`.

In [None]:
# Prints the responses still left in the generator (the first one was already taken with `next()`).
# NOTE(review): additional_info='time' presumably adds timing details to the output — confirm in the riva.client docs.
riva.client.print_streaming(response_generator, additional_info='time')

If you set a delay callback in the audio chunk iterator and `show_intermediate=True` in `riva.client.print_streaming()`, then you will be able to watch the transcript forming.

In [None]:
# `sleep_audio_length` is passed as the delay callback, so chunks are delivered with pauses
# that imitate real-time audio.
# The iterator is used as a context manager (like in the chunk-printing example earlier in this
# notebook) so the WAV file is closed even if recognition fails part-way through.
with riva.client.AudioChunkFileIterator(
    my_wav_file, 4800, riva.client.sleep_audio_length,
) as audio_chunk_iterator:
    response_generator = asr_service.streaming_response_generator(audio_chunk_iterator, streaming_config)
    riva.client.print_streaming(response_generator, show_intermediate=True)

It is also possible to print streaming results in several places, e.g. in STDOUT and a file.

In [None]:
import sys

output_file = "my_results.txt"
# The iterator is used as a context manager so the WAV file is closed even if recognition
# or printing raises.
with riva.client.AudioChunkFileIterator(my_wav_file, 4800) as audio_chunk_iterator:
    response_generator = asr_service.streaming_response_generator(audio_chunk_iterator, streaming_config)
    # Results are written to every destination in the list: STDOUT and the file at `output_file`.
    riva.client.print_streaming(
        response_generator, additional_info='confidence', output_file=[sys.stdout, output_file],
    )

Showing file and clean up in bash

In [None]:
# Print the content of the results file (bash).
!cat $output_file

In [None]:
# Delete the results file (bash).
!rm $output_file

Showing file and clean up in cmd.exe

In [None]:
# Print the content of the results file (cmd.exe).
!type $output_file

In [None]:
# Delete the results file (cmd.exe).
!del $output_file

## <font color="blue">Audio input/output</font>

For using audio input and output you need to install PyAudio.

```bash
conda install -c anaconda pyaudio
```

### <font color="green">Playing audio during transcribing</font>

For playing audio simultaneously with transcribing, provide an instance of `riva.client.audio_io.SoundCallBack` as a `delay_callback` to `riva.client.AudioChunkFileIterator`.

In [None]:
import riva.client.audio_io

In [None]:
# Show available output devices (useful for choosing `output_device` in the next cell).
riva.client.audio_io.list_output_devices()

In [None]:
output_device = None  # use default device
wav_parameters = riva.client.get_wav_file_parameters(my_wav_file)
# The playback callback is configured with the sample width, channel count and frame rate of the file.
sound_callback = riva.client.audio_io.SoundCallBack(
    output_device, wav_parameters['sampwidth'], wav_parameters['nchannels'], wav_parameters['framerate'],
)
try:
    # The sound callback serves as the delay callback of the iterator, so audio is played
    # while the chunks are being sent for recognition.
    with riva.client.AudioChunkFileIterator(my_wav_file, 4800, sound_callback) as audio_chunk_iterator:
        response_generator = asr_service.streaming_response_generator(audio_chunk_iterator, streaming_config)
        riva.client.print_streaming(response_generator, show_intermediate=True)
finally:
    # Always release the audio output, even if streaming or printing fails.
    sound_callback.close()

### <font color="green">Streaming from microphone</font>

In [None]:
# Show available input devices (useful for choosing `input_device` in the next cell).
riva.client.audio_io.list_input_devices()

Run the code below and then say something in English.

In [None]:
input_device = None  # default device
# Capture audio from the microphone and print recognition results, including intermediate ones.
# The microphone is opened at the sample rate stored in the streaming config.
# NOTE(review): chunk = sample_rate // 10 is presumably 0.1 s of audio per chunk, assuming
# `chunk` is counted in frames — confirm against the MicrophoneStream docs.
with riva.client.audio_io.MicrophoneStream(
    rate=streaming_config.config.sample_rate_hertz,
    chunk=streaming_config.config.sample_rate_hertz // 10,
    device=input_device,
) as audio_chunk_iterator:
    riva.client.print_streaming(
        responses=asr_service.streaming_response_generator(
            audio_chunks=audio_chunk_iterator,
            streaming_config=streaming_config,
        ),
        show_intermediate=True,
    )