### Run the following cell only once after kernel started

In [21]:
import os
os.chdir('../')

### Make sure you are in "ondewo-t2s-client-python" folder

In [22]:
os.getcwd()

'/home/fcavallin/ondewo'

In [23]:
import io
import soundfile as sf
import IPython.display as ipd
import grpc
from ondewo_grpc.ondewo.t2s import text_to_speech_pb2, text_to_speech_pb2_grpc
import google.protobuf.empty_pb2 as empty_pb2
from google.protobuf.json_format import ParseDict, MessageToDict, MessageToJson

### Set up the parameters of the grpc server. The example below is for the case when server is running locally

In [24]:
MAX_MESSAGE_LENGTH: int = 60000000
GRPC_HOST: str = "localhost"
GRPC_PORT: str = "50555"
CHANNEL: str = f"{GRPC_HOST}:{GRPC_PORT}"

options = [
    ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
    ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
]


channel = grpc.insecure_channel(CHANNEL, options=options)
stab = text_to_speech_pb2_grpc.Text2SpeechStub(channel=channel)

### List all t2s pipelines present on the server

In [26]:
pipelines = stab.ListT2sPipelines(request=empty_pb2.Empty()).pipelines

### Select pipelines for specific language language

In [27]:
def find_pipeline_for_language(pipelines, language):
    for pipeline in pipelines:
        if pipeline.description.language == language:
            return pipeline

In [28]:
english_pipeline = find_pipeline_for_language(pipelines=pipelines, language='en')
german_pipeline = find_pipeline_for_language(pipelines=pipelines, language='de')

In [29]:
pipelines

[id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
description {
  language: "en"
  speaker_sex: "female"
  pipeline_owner: "ondewo"
  comments: "trained on public domain dataset Lj_speech"
  speaker_name: "Linda"
  domain: "general"
}
active: true
inference {
  type: "composite"
  composite_inference {
    text2mel {
      type: "glow_tts"
      glow_tts {
        batch_size: 5
        length_scale: 1.0
        noise_scale: 0.6669999957084656
        path: "models/glow-tts/en/pretrained.pth"
        param_config_path: "models/glow-tts/en/config.json"
      }
      glow_tts_triton {
        batch_size: 8
        length_scale: 1.0
        noise_scale: 0.6669999957084656
        max_text_length: 100
        param_config_path: "models/glow-tts/en/config.json"
        triton_url: "localhost:50511"
        triton_model_name: "glow_tts"
      }
    }
    mel2audio {
      type: "hifi_gan"
      mb_melgan_triton {
        config_path: "models/mb_melgan/en/config.yml"
        stats_

### Make synthesize request to the server to get audio for given text

Configuration message:
    - t2s_pipeline_id: Its required and represents the pipeline id of the model configuration that will be used.
    - length_scale: Its optional and it changes the speed or duration of an audio.
    - noise_scale: Its optional and defines the noise in the generated audio.
    - sample_rate: Its optional and defines the sample rate of the generated file.
    - pcm: Its optional and defines the pulse-code modulation of the file (The number of times per second that samples are taken).

In [30]:
config = text_to_speech_pb2.RequestConfig(t2s_pipeline_id=german_pipeline.id, length_scale = 1.0, pcm=0, audio_format= 0)
request = text_to_speech_pb2.SynthesizeRequest(text='Hallo, wie geht es dir?', config=config)
response = stab.Synthesize(request=request)

print(f'Length of the generated audio is {response.audio_length} sec.', f'Generation time is {response.generation_time} sec.')
print(f'The text was synthesized with the following configuration \n{config}')

bio = io.BytesIO(response.audio)

audio = sf.read(bio, )

ipd.Audio(audio[0], rate=audio[1])

Length of the generated audio is 2.240725517272949 sec. Generation time is 1.2438806295394897 sec.
The text was synthesized with the following configuration 
t2s_pipeline_id: "glow_tts&hifi_gan-e976dd6c-2f41-484b-aec2-3e6868d37290"
length_scale: 1.0
pcm: PCM_16
audio_format: wav



### Adding length scale parameter to make speech faster or slower

In [10]:
config = text_to_speech_pb2.RequestConfig(t2s_pipeline_id=english_pipeline.id, length_scale=0.5)
request = text_to_speech_pb2.SynthesizeRequest(text='Hi, how are you?', config = config)
response = stab.Synthesize(request=request)

print(f'Length of the generated audio is {response.audio_length} sec.', f'Generation time is {response.generation_time} sec.')
print(f'The text was synthesized with the following configuration \n{response.config}')

bio = io.BytesIO(response.audio)

audio = sf.read(bio)

ipd.Audio(audio[0], rate=audio[1])

Length of the generated audio is 0.5921088457107544 sec. Generation time is 0.357820600271225 sec.
The text was synthesized with the following configuration 
t2s_pipeline_id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
length_scale: 0.5
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_16
audio_format: wav



### Make synthesize request to the server to get audios for a Batch of given texts with same configuration

In [32]:
config = text_to_speech_pb2.RequestConfig(t2s_pipeline_id=english_pipeline.id, length_scale = 1.0, pcm=0, audio_format= 0)
request = text_to_speech_pb2.BatchSynthesizeRequest(text=['Hello','How are you?','See you later'], config=[config])
batch_response = stab.BatchSynthesize(request=request)



In [33]:
while( hasattr(batch_response, "response") ):
  for response in batch_response.response:
    print(f'Length of the generated audio is {response.audio_length} sec.', f'Generation time is {response.generation_time} sec.')
    print(f'The text was synthesized with the following configuration \n{response.config}')
    bio = io.BytesIO(response.audio)
    audio = sf.read(bio, )
    ipd.Audio(audio[0], rate=audio[1])
    batch_response = response

Length of the generated audio is 0.5456689596176147 sec. Generation time is 0.414970725774765 sec.
The text was synthesized with the following configuration 
t2s_pipeline_id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
length_scale: 1.0
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_16
audio_format: wav

Length of the generated audio is 0.7314285635948181 sec. Generation time is 0.7611852884292603 sec.
The text was synthesized with the following configuration 
t2s_pipeline_id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
length_scale: 1.0
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_16
audio_format: wav

Length of the generated audio is 0.9868480563163757 sec. Generation time is 0.4928921163082123 sec.
The text was synthesized with the following configuration 
t2s_pipeline_id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
length_scale: 1.0
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_16
audio_format: wav



### Make synthesize request to the server to get audios for a Batch of given texts with different configurations

In [34]:
config_1 = text_to_speech_pb2.RequestConfig(t2s_pipeline_id=english_pipeline.id, length_scale = 1.0, pcm=0, audio_format= 0)
config_2 = text_to_speech_pb2.RequestConfig(t2s_pipeline_id=english_pipeline.id, length_scale = 1.0, pcm=0, audio_format= 1)
config_3 = text_to_speech_pb2.RequestConfig(t2s_pipeline_id=german_pipeline.id, length_scale = 1.0, pcm=1, audio_format= 0)
request = text_to_speech_pb2.BatchSynthesizeRequest(text=['Hello','How are you?','Hallo, wie geht es dir?'], config=[config_1,config_2,config_3])
batch_response = stab.BatchSynthesize(request=request)

In [35]:
while( hasattr(batch_response, "response") ):
  for response in batch_response.response:
    print(f'Length of the generated audio is {response.audio_length} sec.', f'Generation time is {response.generation_time} sec.')
    print(f'The text was synthesized with the following configuration \n{response.config}')
    bio = io.BytesIO(response.audio)
    audio = sf.read(bio, )
    ipd.Audio(audio[0], rate=audio[1])
    batch_response = response

Length of the generated audio is 0.5456689596176147 sec. Generation time is 0.9829276204109192 sec.
The text was synthesized with the following configuration 
t2s_pipeline_id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
length_scale: 1.0
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_16
audio_format: wav

Length of the generated audio is 0.7314285635948181 sec. Generation time is 0.5627701878547668 sec.
The text was synthesized with the following configuration 
t2s_pipeline_id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
length_scale: 1.0
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_16
audio_format: flac

Length of the generated audio is 2.240725517272949 sec. Generation time is 1.5596107244491577 sec.
The text was synthesized with the following configuration 
t2s_pipeline_id: "glow_tts&hifi_gan-e976dd6c-2f41-484b-aec2-3e6868d37290"
length_scale: 1.0
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_24
audio_format: wav



### Get pipeline you want to update

In [36]:
request = text_to_speech_pb2.T2sPipelineId(id=english_pipeline.id)
pipeline_config = stab.GetT2sPipeline(request=request)

In [37]:
pipeline_config

id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
description {
  language: "en"
  speaker_sex: "female"
  pipeline_owner: "ondewo"
  comments: "trained on public domain dataset Lj_speech"
  speaker_name: "Linda"
  domain: "general"
}
active: true
inference {
  type: "composite"
  composite_inference {
    text2mel {
      type: "glow_tts"
      glow_tts {
        batch_size: 5
        length_scale: 1.0
        noise_scale: 0.6669999957084656
        path: "models/glow-tts/en/pretrained.pth"
        param_config_path: "models/glow-tts/en/config.json"
      }
      glow_tts_triton {
        batch_size: 8
        length_scale: 1.0
        noise_scale: 0.6669999957084656
        max_text_length: 100
        param_config_path: "models/glow-tts/en/config.json"
        triton_url: "localhost:50511"
        triton_model_name: "glow_tts"
      }
    }
    mel2audio {
      type: "hifi_gan"
      mb_melgan_triton {
        config_path: "models/mb_melgan/en/config.yml"
        stats_p

### Change parameter in the pipeline config. For example default length scale

In [17]:
pipeline_config.inference.composite_inference.text2mel.glow_tts.length_scale = 2

### Update pipeline

In [18]:
stab.UpdateT2sPipeline(request=pipeline_config)



### See if generated audio change according to updated config

In [19]:
config = text_to_speech_pb2.RequestConfig(t2s_pipeline_id= english_pipeline.id)
request = text_to_speech_pb2.SynthesizeRequest(text='Hi, how are you?', config = config)
response = stab.Synthesize(request=request)

print(f'Length of the generated audio is {response.audio_length} sec.', f'Generation time is {response.generation_time} sec.')
print(f'The text was synthesized with the following configuration \n{config}')


bio = io.BytesIO(response.audio)

audio = sf.read(bio)

ipd.Audio(audio[0], rate=audio[1])

Length of the generated audio is 1.9620862007141113 sec. Generation time is 0.9231884479522705 sec.
The text was synthesized with the following configuration 
t2s_pipeline_id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"



### Change parameter back to previous (length_scale = 1.0)

In [20]:
request = text_to_speech_pb2.T2sPipelineId(id=english_pipeline.id)
pipeline_config = stab.GetT2sPipeline(request=request)
pipeline_config.inference.composite_inference.text2mel.glow_tts.length_scale = 1.0
stab.UpdateT2sPipeline(request=pipeline_config)

