### Run the following cell only once after kernel started

In [1]:
import os
os.chdir('../')

### Make sure you are in "ondewo-t2s-client-python" folder

In [2]:
os.getcwd()

'/home/fcavallin/ondewo'

In [3]:
import io
import soundfile as sf
import IPython.display as ipd
import grpc
from ondewo_grpc.ondewo.t2s import text_to_speech_pb2, text_to_speech_pb2_grpc
import google.protobuf.empty_pb2 as empty_pb2
from google.protobuf.json_format import ParseDict, MessageToDict, MessageToJson

### Set up the parameters of the grpc server. The example below is for the case when server is running locally

In [4]:
MAX_MESSAGE_LENGTH: int = 60000000
GRPC_HOST: str = "localhost"
GRPC_PORT: str = "50555"
CHANNEL: str = f"{GRPC_HOST}:{GRPC_PORT}"

options = [
    ('grpc.max_send_message_length', MAX_MESSAGE_LENGTH),
    ('grpc.max_receive_message_length', MAX_MESSAGE_LENGTH),
]


channel = grpc.insecure_channel(CHANNEL, options=options)
stab = text_to_speech_pb2_grpc.Text2SpeechStub(channel=channel)

### List all t2s pipelines present on the server

In [5]:
pipelines = stab.ListT2sPipelines(request=empty_pb2.Empty()).pipelines

### Select pipelines for specific language language

In [6]:
def find_pipeline_for_language(pipelines, language):
    for pipeline in pipelines:
        if pipeline.description.language == language:
            return pipeline

In [7]:
english_pipeline = find_pipeline_for_language(pipelines=pipelines, language='en')
german_pipeline = find_pipeline_for_language(pipelines=pipelines, language='de')

In [8]:
pipelines

[id: "glow_tts&hifi_gan-e976dd6c-2f41-484b-aec2-3e6868d37290"
description {
  language: "de"
  speaker_sex: "female"
  pipeline_owner: "ondewo"
  comments: "trained on public domain dataset"
  speaker_name: "Kerstin"
  domain: "general"
}
active: true
inference {
  type: "composite"
  composite_inference {
    text2mel {
      type: "glow_tts"
      glow_tts {
        batch_size: 5
        length_scale: 1.0
        noise_scale: 0.6669999957084656
        path: "models/glow-tts/de/kerstin_blank.pth"
        param_config_path: "models/glow-tts/de/config_blank.json"
      }
      glow_tts_triton {
        batch_size: 8
        length_scale: 1.0
        noise_scale: 0.6669999957084656
        max_text_length: 100
        param_config_path: "models/glow-tts/de/config_blank.json"
        triton_url: "localhost:50511"
        triton_model_name: "glow_tts"
      }
    }
    mel2audio {
      type: "hifi_gan"
      mb_melgan_triton {
        config_path: "models/mb_melgan/en/config.yml"
       

### Make synthesize request to the server to get audio for given text

Configuration message:
    - t2s_pipeline_id: Its required and represents the pipeline id of the model configuration that will be used.
    - length_scale: Its optional and it changes the speed or duration of an audio.
    - noise_scale: Its optional and defines the noise in the generated audio.
    - sample_rate: Its optional and defines the sample rate of the generated file.
    - pcm: Its optional and defines the pulse-code modulation of the file (The number of times per second that samples are taken).

In [9]:
def print_single_info(single_response):
        print(f"Info:")
        print(f"audio_uuid: {single_response.audio_uuid}")
        print(f"generation_time: {single_response.generation_time}")
        print(f"audio_length: {single_response.audio_length}")
        print(f"text: {single_response.text}")
        print(f"config: {single_response.config}")
        bio = io.BytesIO(single_response.audio)
        audio = sf.read(bio, )
        ipd.Audio(audio[0], rate=audio[1])

def print_batch_info(response):
    for idx, single_response in enumerate(response.batch_response):
        print(f"AUDIO {idx}")
        print_single_info(single_response)

In [10]:
config = text_to_speech_pb2.RequestConfig(t2s_pipeline_id=german_pipeline.id, length_scale = 1.0, pcm=2, audio_format= 3)
request = text_to_speech_pb2.SynthesizeRequest(text='Hallo, wie geht es dir?', config=config)
response = stab.Synthesize(request=request)

print_single_info(response)

Info:
audio_uuid: 89f7fde9-27fa-46a7-9f00-39a2bfe19782
generation_time: 1.0377438068389893
audio_length: 2.240725517272949
text: Hallo, wie geht es dir?
config: t2s_pipeline_id: "glow_tts&hifi_gan-e976dd6c-2f41-484b-aec2-3e6868d37290"
length_scale: 1.0
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_32
audio_format: mp3



### Adding length scale parameter to make speech faster or slower

In [11]:
config = text_to_speech_pb2.RequestConfig(t2s_pipeline_id=english_pipeline.id, length_scale=0.5)
request = text_to_speech_pb2.SynthesizeRequest(text='Hi, how are you?', config = config)
response = stab.Synthesize(request=request)

print_single_info(response)

Info:
audio_uuid: 6877c4da-f44b-4d08-8691-0aa0aa0cd99f
generation_time: 0.31995001435279846
audio_length: 0.5921088457107544
text: Hi, how are you?
config: t2s_pipeline_id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
length_scale: 0.5
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_16
audio_format: wav



### Make synthesize request to the server to get audios for a Batch of given texts with same configuration

In [12]:
config_1 = text_to_speech_pb2.RequestConfig(t2s_pipeline_id=english_pipeline.id, length_scale = 1.0, pcm=0, audio_format= 0)

request_1 = text_to_speech_pb2.SynthesizeRequest(text='Hello', config=config_1)
request_2 = text_to_speech_pb2.SynthesizeRequest(text='How are you?', config=config_1)

request = text_to_speech_pb2.BatchSynthesizeRequest(batch_request = [request_1, request_2])

response = stab.BatchSynthesize(request = request)

print_batch_info(response)

AUDIO 0
Info:
audio_uuid: 8ee544dd-6e8d-4995-92de-3c2f0ad68b55
generation_time: 0.3141922056674957
audio_length: 0.5456689596176147
text: Hello
config: t2s_pipeline_id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
length_scale: 1.0
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_16
audio_format: wav

AUDIO 1
Info:
audio_uuid: 393851c7-bf2f-4313-8dc7-35be4d12290b
generation_time: 0.3378311097621918
audio_length: 0.7314285635948181
text: How are you?
config: t2s_pipeline_id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
length_scale: 1.0
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_16
audio_format: wav



### Make synthesize request to the server to get audios for a Batch of given texts with different configurations

In [13]:
config_1 = text_to_speech_pb2.RequestConfig(t2s_pipeline_id=english_pipeline.id, length_scale = 1.0, pcm=0, audio_format= 0)
config_2 = text_to_speech_pb2.RequestConfig(t2s_pipeline_id=english_pipeline.id, length_scale = 0.5, pcm=0, audio_format= 1)
config_3 = text_to_speech_pb2.RequestConfig(t2s_pipeline_id=german_pipeline.id, length_scale = 1.0, pcm=1, audio_format= 0)

request_1 = text_to_speech_pb2.SynthesizeRequest(text='Hello', config=config_1)
request_2 = text_to_speech_pb2.SynthesizeRequest(text='How are you?', config=config_2)
request_3 = text_to_speech_pb2.SynthesizeRequest(text='Hallo, wie geht es dir?', config=config_3)

request = text_to_speech_pb2.BatchSynthesizeRequest(batch_request = [request_1, request_2, request_3])

response = stab.BatchSynthesize(request = request)

In [14]:
print_batch_info(response)
           


AUDIO 0
Info:
audio_uuid: 5bb457c3-6289-4975-93f7-82875053452f
generation_time: 0.272760272026062
audio_length: 0.5456689596176147
text: Hello
config: t2s_pipeline_id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
length_scale: 1.0
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_16
audio_format: wav

AUDIO 1
Info:
audio_uuid: caaf533c-7e9f-459e-820b-80b97c70dbb9
generation_time: 0.21691066026687622
audio_length: 0.40634921193122864
text: How are you?
config: t2s_pipeline_id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
length_scale: 0.5
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_16
audio_format: flac

AUDIO 2
Info:
audio_uuid: d856b027-0230-42df-8d8a-2663b3f00213
generation_time: 1.044538974761963
audio_length: 2.240725517272949
text: Hallo, wie geht es dir?
config: t2s_pipeline_id: "glow_tts&hifi_gan-e976dd6c-2f41-484b-aec2-3e6868d37290"
length_scale: 1.0
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_24
audio_format: wa

### Get pipeline you want to update

In [15]:
request = text_to_speech_pb2.T2sPipelineId(id=english_pipeline.id)
pipeline_config = stab.GetT2sPipeline(request=request)

In [16]:
pipeline_config

id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
description {
  language: "en"
  speaker_sex: "female"
  pipeline_owner: "ondewo"
  comments: "trained on public domain dataset Lj_speech"
  speaker_name: "Linda"
  domain: "general"
}
active: true
inference {
  type: "composite"
  composite_inference {
    text2mel {
      type: "glow_tts"
      glow_tts {
        batch_size: 5
        length_scale: 1.0
        noise_scale: 0.6669999957084656
        path: "models/glow-tts/en/pretrained.pth"
        param_config_path: "models/glow-tts/en/config.json"
      }
      glow_tts_triton {
        batch_size: 8
        length_scale: 1.0
        noise_scale: 0.6669999957084656
        max_text_length: 100
        param_config_path: "models/glow-tts/en/config.json"
        triton_url: "localhost:50511"
        triton_model_name: "glow_tts"
      }
    }
    mel2audio {
      type: "hifi_gan"
      mb_melgan_triton {
        config_path: "models/mb_melgan/en/config.yml"
        stats_p

### Change parameter in the pipeline config. For example default length scale

In [17]:
pipeline_config.inference.composite_inference.text2mel.glow_tts.length_scale = 2

### Update pipeline

In [18]:
stab.UpdateT2sPipeline(request=pipeline_config)



### See if generated audio change according to updated config

In [19]:
config = text_to_speech_pb2.RequestConfig(t2s_pipeline_id= english_pipeline.id)
request = text_to_speech_pb2.SynthesizeRequest(text='Hi, how are you?', config = config)
response = stab.Synthesize(request=request)

print_single_info(response)

Info:
audio_uuid: 6018907a-6c32-44a9-a475-0a08568c755f
generation_time: 0.8458008170127869
audio_length: 1.9620862007141113
text: Hi, how are you?
config: t2s_pipeline_id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
length_scale: 2.0
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_16
audio_format: wav



### Change parameter back to previous (length_scale = 1.0)

In [20]:
request = text_to_speech_pb2.T2sPipelineId(id=english_pipeline.id)
pipeline_config = stab.GetT2sPipeline(request=request)
pipeline_config.inference.composite_inference.text2mel.glow_tts.length_scale = 1.0
stab.UpdateT2sPipeline(request=pipeline_config)

print_single_info(response)

Info:
audio_uuid: 6018907a-6c32-44a9-a475-0a08568c755f
generation_time: 0.8458008170127869
audio_length: 1.9620862007141113
text: Hi, how are you?
config: t2s_pipeline_id: "glow_tts&hifi_gan-0118e07a-8447-4c87-9f6b-4802a7dd7a07"
length_scale: 2.0
noise_scale: 0.6669999957084656
sample_rate: 22050
pcm: PCM_16
audio_format: wav

