## **Text2Speech T2S API Tutorial**

This tutorial uses ondewo-t2s-api to:


*   Synthesize a text to audio
*   Synthesize a batch of texts to audios
*   List the possible pipelines that can be used for synthesizing
*   List the possible languages that can be used in the synthesize process
*   List the possible domains
*   Manipulate pipelines (Create, Delete, Update, Get)



Import useful dependencies


In [None]:
import os
import io
import soundfile as sf
import IPython.display as ipd
import grpc
from ondewo_grpc.ondewo.t2s import text_to_speech_pb2, text_to_speech_pb2_grpc
import google.protobuf.empty_pb2 as empty_pb2
from google.protobuf.json_format import ParseDict, MessageToDict, MessageToJson

ModuleNotFoundError: ignored

Make sure you are in the "ondewo-t2s-client-python" folder

In [None]:
# Step up one directory level, then show the resulting working directory.
os.chdir(os.pardir)
os.getcwd()

'/'

Set up the parameters of the grpc server. The example below is for the case when the server is running locally.

In [None]:
# Address of the gRPC server, joined into a "host:port" target string.
GRPC_HOST: str = "localhost"
GRPC_PORT: str = "50555"
CHANNEL: str = ":".join((GRPC_HOST, GRPC_PORT))

# The same size limit is applied to both sent and received gRPC messages.
MAX_MESSAGE_LENGTH: int = 60_000_000
options = [
    (option_name, MAX_MESSAGE_LENGTH)
    for option_name in (
        'grpc.max_send_message_length',
        'grpc.max_receive_message_length',
    )
]

# Open an unencrypted channel and create the Text2Speech client stub on it.
channel = grpc.insecure_channel(CHANNEL, options=options)
stab = text_to_speech_pb2_grpc.Text2SpeechStub(channel=channel)

# List all t2s pipelines present on the server

In [None]:
# Ask the server for its pipelines (the call takes an empty request message)
# and keep only the `pipelines` field of the reply.
list_response = stab.ListT2sPipelines(request=empty_pb2.Empty())
pipelines = list_response.pipelines
pipelines

In [None]:
# Pick one pipeline per language out of the list returned by the server.
# NOTE(review): find_pipeline_for_language is not defined in the cells shown
# here -- confirm it is defined or imported before this cell runs.
english_pipeline, german_pipeline = (
    find_pipeline_for_language(pipelines=pipelines, language=language_code)
    for language_code in ('en', 'de')
)

# Make synthesize request to the server to get audio for given text

### Configuration message: 

*   t2s_pipeline_id: It is required and represents the pipeline id of the model configuration that will be used.
*   length_scale: It is optional and changes the speed or duration of the audio.
*   noise_scale: It is optional and defines the noise in the generated audio.
*   sample_rate: It is optional and defines the sample rate of the generated file.
*   pcm: It is optional and defines the pulse-code modulation of the file (the number of times per second that samples are taken).



In [None]:
# Request configuration: German pipeline, length_scale 1.0, pcm and audio_format set to 0.
config = text_to_speech_pb2.RequestConfig(
    t2s_pipeline_id=german_pipeline.id,
    length_scale=1.0,
    pcm=0,
    audio_format=0,
)
request = text_to_speech_pb2.SynthesizeRequest(
    text='Hallo, wie geht es dir?',
    config=config,
)
response = stab.Synthesize(request=request)

# Report the audio length and generation time, then the request configuration.
length_message = f'Length of the generated audio is {response.audio_length} sec.'
timing_message = f'Generation time is {response.generation_time} sec.'
print(length_message, timing_message)
print(f'The text was synthesized with the following configuration \n{config}')

# Decode the returned bytes from memory; the first element of the result is
# passed to the player as the data and the second as the playback rate.
bio = io.BytesIO(response.audio)
audio = sf.read(bio)
samples, sample_rate = audio
ipd.Audio(samples, rate=sample_rate)

## Adding length scale parameter to make speech faster or slower

In [None]:
# Same request as a plain synthesis, but with length_scale set to 0.5.
config = text_to_speech_pb2.RequestConfig(
    t2s_pipeline_id=english_pipeline.id,
    length_scale=0.5,
)
request = text_to_speech_pb2.SynthesizeRequest(
    text='Hi, how are you?',
    config=config,
)
response = stab.Synthesize(request=request)

# Report the audio length and generation time, then the configuration
# carried by the response.
length_message = f'Length of the generated audio is {response.audio_length} sec.'
timing_message = f'Generation time is {response.generation_time} sec.'
print(length_message, timing_message)
print(f'The text was synthesized with the following configuration \n{response.config}')

# Decode the returned bytes from memory and hand data + rate to the player.
bio = io.BytesIO(response.audio)
audio = sf.read(bio)
samples, sample_rate = audio
ipd.Audio(samples, rate=sample_rate)

## Make a synthesize request to the server to get audio for a batch of given texts with the same configuration

In [None]:
# One configuration for the whole batch of texts.
config = text_to_speech_pb2.RequestConfig(
    t2s_pipeline_id=english_pipeline.id,
    length_scale=1.0,
    pcm=0,
    audio_format=0,
)
request = text_to_speech_pb2.BatchSynthesizeRequest(
    text=['Hello', 'How are you?', 'See you later'],
    config=[config],
)
batch_response = stab.BatchSynthesize(request=request)

# Walk over the individual responses exactly once. The previous
# `while hasattr(...)` wrapper only terminated because `batch_response` was
# rebound inside the loop body, so an empty `response` list looped forever.
for response in batch_response.response:
    print(f'Length of the generated audio is {response.audio_length} sec.', f'Generation time is {response.generation_time} sec.')
    print(f'The text was synthesized with the following configuration \n{response.config}')
    bio = io.BytesIO(response.audio)
    audio = sf.read(bio)
    # A bare expression inside a loop is not rendered by the notebook, so the
    # audio player has to be displayed explicitly.
    ipd.display(ipd.Audio(audio[0], rate=audio[1]))

## Make a synthesize request to the server to get audio for a batch of given texts with different configurations

In [None]:
# Three configurations that differ in pipeline, pcm and audio_format.
config_1 = text_to_speech_pb2.RequestConfig(
    t2s_pipeline_id=english_pipeline.id,
    length_scale=1.0,
    pcm=0,
    audio_format=0,
)
config_2 = text_to_speech_pb2.RequestConfig(
    t2s_pipeline_id=english_pipeline.id,
    length_scale=1.0,
    pcm=0,
    audio_format=1,
)
config_3 = text_to_speech_pb2.RequestConfig(
    t2s_pipeline_id=german_pipeline.id,
    length_scale=1.0,
    pcm=1,
    audio_format=0,
)

# The request carries three texts and three configurations, in matching order.
batch_texts = ['Hello', 'How are you?', 'Hallo, wie geht es dir?']
batch_configs = [config_1, config_2, config_3]
request = text_to_speech_pb2.BatchSynthesizeRequest(text=batch_texts, config=batch_configs)
batch_response = stab.BatchSynthesize(request=request)

In [None]:
# Walk over the individual responses exactly once. The previous
# `while hasattr(...)` wrapper only terminated because `batch_response` was
# rebound inside the loop body, so an empty `response` list looped forever.
for response in batch_response.response:
    print(f'Length of the generated audio is {response.audio_length} sec.', f'Generation time is {response.generation_time} sec.')
    print(f'The text was synthesized with the following configuration \n{response.config}')
    bio = io.BytesIO(response.audio)
    audio = sf.read(bio)
    # A bare expression inside a loop is not rendered by the notebook, so the
    # audio player has to be displayed explicitly.
    ipd.display(ipd.Audio(audio[0], rate=audio[1]))

## Get pipeline you want to update

In [None]:
# Build the id message for the English pipeline and fetch its configuration.
request = text_to_speech_pb2.T2sPipelineId()
request.id = english_pipeline.id
pipeline_config = stab.GetT2sPipeline(request=request)

In [None]:
# Display the current value of pipeline_config as the cell output.
pipeline_config

## Change a parameter in the pipeline config, for example the default length scale

# Reach the glow_tts section of the pipeline config and set its length scale to 2.
glow_tts_config = pipeline_config.inference.composite_inference.text2mel.glow_tts
glow_tts_config.length_scale = 2

## Update pipeline

In [None]:
# Send pipeline_config to the server through the UpdateT2sPipeline call;
# the call's return value is shown as the cell output.
stab.UpdateT2sPipeline(request=pipeline_config)

Check whether the generated audio changes according to the updated config

In [None]:
# Only the pipeline id is given here; no length_scale is set on the request.
# NOTE(review): presumably the pipeline's own default then applies -- confirm.
config = text_to_speech_pb2.RequestConfig(
    t2s_pipeline_id=english_pipeline.id,
)
request = text_to_speech_pb2.SynthesizeRequest(
    text='Hi, how are you?',
    config=config,
)
response = stab.Synthesize(request=request)

# Report the audio length and generation time, then the request configuration.
length_message = f'Length of the generated audio is {response.audio_length} sec.'
timing_message = f'Generation time is {response.generation_time} sec.'
print(length_message, timing_message)
print(f'The text was synthesized with the following configuration \n{config}')

# Decode the returned bytes from memory and hand data + rate to the player.
bio = io.BytesIO(response.audio)
audio = sf.read(bio)
samples, sample_rate = audio
ipd.Audio(samples, rate=sample_rate)

## Change parameter back to previous (length_scale = 1.0)

In [None]:
# Fetch the English pipeline's configuration again.
request = text_to_speech_pb2.T2sPipelineId(
    id=english_pipeline.id,
)
pipeline_config = stab.GetT2sPipeline(request=request)

# Set the glow_tts length scale to 1.0 and send the config to the server.
glow_tts_config = pipeline_config.inference.composite_inference.text2mel.glow_tts
glow_tts_config.length_scale = 1.0
stab.UpdateT2sPipeline(request=pipeline_config)