In [None]:
# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: MIT

# TTS API tutorial

This tutorial demonstrates how to use the Riva Python API for speech synthesis (TTS).

## <font color="blue">Server</font>

Before running the client part of Riva, please set up a server. The simplest
way to do this is to follow
[quick start guide](https://docs.nvidia.com/deeplearning/riva/user-guide/docs/quick-start-guide.html#local-deployment-using-quick-start-scripts).


## <font color="blue">Authentication</font>

Before using Riva services, you need to establish a connection with a server.

In [None]:
import riva.client

# Address of the Riva server the client connects to.
uri = "localhost:50051"  # Default value

# Connection handle for the server at `uri`; it is passed to the service constructor below.
auth = riva.client.Auth(uri=uri)

## <font color="blue">Setting up service</font>

To instantiate a service, pass a `riva.client.Auth` instance to its constructor.

In [None]:
# Speech synthesis client built on the `auth` connection; every synthesis call below goes through it.
tts_service = riva.client.SpeechSynthesisService(auth)

## <font color="blue">Offline synthesis</font>

In offline mode, the result is returned in a single response.

In [None]:
# Settings shared by the synthesis requests and by the WAV-writing / playback cells below.
language_code = 'en-US'
sample_rate_hz = 16000  # sent with each synthesis request and reused as the WAV frame rate
nchannels = 1  # channel count written to the WAV header and passed to the playback stream
sampwidth = 2  # sample width in bytes, as taken by wave's setsampwidth()
# Text to synthesize; the adjacent string literals are concatenated into one string.
text = (
    "The United States of America, commonly known as the United States or America, "
    "is a country primarily located in North America. It consists of 50 states, "
    "a federal district, five major unincorporated territories, 326 Indian reservations, "
    "and nine minor outlying islands."
)

In [None]:
# Offline request: the result for the whole text comes back in this single response object.
resp = tts_service.synthesize(text, language_code=language_code, sample_rate_hz=sample_rate_hz)

In [None]:
# Keep references to the two parts of the response that the following cells inspect.
audio = resp.audio  # synthesized audio payload
meta = resp.meta  # metadata; its processed_text and predicted_durations fields are read below

In [None]:
# Size of the returned audio payload (presumably a byte count — confirm the type of `resp.audio`).
print(len(audio))

In [None]:
# Pull out the metadata fields printed in the next two cells.
processed_text = meta.processed_text
predicted_durations = meta.predicted_durations

In [None]:
# Text reported in the response metadata (presumably the input after server-side processing — confirm).
print(processed_text)

In [None]:
# Number of predicted durations, followed by the first entry as a sample.
# NOTE(review): the [0] lookup presumably fails if no durations are returned — confirm that cannot happen.
print(len(predicted_durations))
print(predicted_durations[0])

Now we can write the audio to a file.

In [None]:
import wave

offline_output_file = "my_offline_synthesized_speech.wav"
# Write the audio captured from the offline response. `audio` is used instead of
# `resp.audio` because later cells rebind `resp` to individual streaming chunks;
# re-running this cell after them would otherwise write only a single chunk.
with wave.open(offline_output_file, 'wb') as out_f:
    # The header must describe the same format that was requested from the service.
    out_f.setnchannels(nchannels)
    out_f.setsampwidth(sampwidth)
    out_f.setframerate(sample_rate_hz)
    # writeframes() writes the data and makes sure the frame count in the header is correct.
    out_f.writeframes(audio)

In [None]:
# Import the submodule explicitly instead of relying on `IPython.display` having been
# loaded as a side effect of `import IPython`.
import IPython.display

# Last expression of the cell, so the notebook renders an audio player for the file.
IPython.display.Audio(offline_output_file)

### <font color="green">Asynchronous calls</font>

You can perform speech synthesis asynchronously by setting the parameter `future=True`. In that case, `SpeechSynthesisService.synthesize()` returns a future object. You can get the final response from the future object by calling its `result()` method.

In [None]:
from time import time
# Number of identical synthesis requests issued by each of the two timing cells below.
num_repeats = 10

In [None]:
# Baseline: issue the requests one after another and keep the audio of each response.
start_time = time()
sync_audio = [
    tts_service.synthesize(text, language_code=language_code, sample_rate_hz=sample_rate_hz).audio
    for _ in range(num_repeats)
]
print(f"Synchronous calls time: {time() - start_time:.2f}")

In [None]:
start_time = time()
# Submit every request up front; with future=True each call returns a future object.
futures = [
    tts_service.synthesize(
        text, language_code=language_code, sample_rate_hz=sample_rate_hz, future=True
    )
    for _ in range(num_repeats)
]
# Then collect the audio of each result, in submission order.
async_audio = [pending.result().audio for pending in futures]
print(f"Async calls time: {time() - start_time:.2f}")

## <font color="blue">Streaming synthesis</font>

In streaming mode, the audio is returned in several responses. Each response is returned as soon as its audio chunk is ready.

In [None]:
# Streaming request: returns an iterable of responses, each carrying one chunk of audio.
# NOTE(review): presumably a one-shot stream — re-run this cell before re-running the cell that iterates over it; confirm.
responses = tts_service.synthesize_online(text, language_code=language_code, sample_rate_hz=sample_rate_hz)

In [None]:
# Concatenate the audio of all streamed chunks, in arrival order, into one bytes object.
# A single join avoids re-copying the growing buffer for every chunk, and the generator's
# own loop variable leaves the notebook-level `resp` (the offline response) untouched.
streaming_audio = b''.join(chunk.audio for chunk in responses)

In [None]:
import wave

streaming_output_file = "my_streaming_synthesized_speech.wav"
# Write the concatenated streaming chunks as one WAV file, using the same format
# parameters that were requested from the service.
with wave.open(streaming_output_file, 'wb') as out_f:
    out_f.setnchannels(nchannels)
    out_f.setsampwidth(sampwidth)
    out_f.setframerate(sample_rate_hz)
    # writeframes() writes the data and makes sure the frame count in the header is correct.
    out_f.writeframes(streaming_audio)

In [None]:
# Import the submodule explicitly instead of relying on `IPython.display` having been
# loaded as a side effect of `import IPython`.
import IPython.display

# Last expression of the cell, so the notebook renders an audio player for the file.
IPython.display.Audio(streaming_output_file)

## <font color="blue">Audio output</font>

To use audio input and output, you need to install PyAudio.

```bash
conda install -c anaconda pyaudio
```

### <font color="green">Playing audio during synthesis</font>

To play audio during synthesis, pass audio chunks to `riva.client.audio_io.SoundCallBack` as they arrive.

In [None]:
import riva.client.audio_io
# Show the available output devices, to help choose a value for `output_device` in the
# next cell (which otherwise uses the default device).
riva.client.audio_io.list_output_devices()

In [None]:
output_device = None  # use default device
# Playback object configured with the same channel count, sample width and sample rate
# that are requested from the synthesis service.
sound_stream = riva.client.audio_io.SoundCallBack(
    output_device, nchannels=nchannels, sampwidth=sampwidth, framerate=sample_rate_hz
)
# Hand each audio chunk to the playback object as soon as it arrives.
# Note: this loop rebinds the notebook-level name `resp` to the streamed chunks.
for resp in tts_service.synthesize_online(text, language_code=language_code, sample_rate_hz=sample_rate_hz):
    sound_stream(resp.audio)
# NOTE(review): no cleanup of `sound_stream` is visible in this cell — confirm it is released afterwards.