### Run the following cell only once after the kernel has started

In [1]:
import os

# Remember the directory the kernel started in the first time this cell runs,
# so that running the cell again does not move one more level up each time.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

# Always land in the parent of the start directory, however often the cell is run.
os.chdir(os.path.join(NOTEBOOK_START_DIR, '../'))

### Make sure you are in the "ondewo-t2s-client-python" folder

In [2]:
# Show the current working directory to confirm the chdir above took effect.
os.getcwd()

'/home/fcavallin/ondewo/ondewo-t2s'

In [3]:
import io
import soundfile as sf
import IPython.display as ipd
import grpc
from ondewo_grpc.ondewo.t2s import text_to_speech_pb2, text_to_speech_pb2_grpc
import google.protobuf.empty_pb2 as empty_pb2
from google.protobuf.json_format import ParseDict, MessageToDict, MessageToJson

### Set up the parameters of the gRPC server. The example below assumes the server is running locally

In [4]:
# Connection settings for the text-to-speech gRPC server.
MAX_MESSAGE_LENGTH: int = 60_000_000
GRPC_HOST: str = "localhost"
GRPC_PORT: str = "50555"
CHANNEL: str = f"{GRPC_HOST}:{GRPC_PORT}"

# The same size limit is applied to outgoing and incoming messages.
options = [
    (option_name, MAX_MESSAGE_LENGTH)
    for option_name in (
        'grpc.max_send_message_length',
        'grpc.max_receive_message_length',
    )
]

# Plain-text (insecure) channel to the server address built above.
channel = grpc.insecure_channel(target=CHANNEL, options=options)

# Client stub through which all later requests in this notebook are sent.
stab = text_to_speech_pb2_grpc.Text2SpeechStub(channel=channel)


### List all t2s pipelines present on the server

In [5]:
# Ask the server for its pipelines and keep only the `pipelines` field of the reply.
list_pipelines_response = stab.ListT2sPipelines(request=empty_pb2.Empty())
pipelines = list_pipelines_response.pipelines

### Select the pipeline for a specific language

In [6]:
def find_pipeline_for_language(pipelines, language):
    """Return the first pipeline whose description language equals `language`.

    Args:
        pipelines: iterable of objects exposing `description.language`.
        language: language code to look for (compared with `==`).

    Returns:
        The first matching pipeline, or None when no pipeline matches.
    """
    matches = (
        candidate
        for candidate in pipelines
        if candidate.description.language == language
    )
    return next(matches, None)

In [7]:
# Pick one pipeline per language; each name is None if the server has no match.
english_pipeline, german_pipeline = (
    find_pipeline_for_language(pipelines=pipelines, language=language_code)
    for language_code in ('en', 'de')
)

In [8]:
# Display every pipeline returned by the server.
pipelines

[id: "glow_tts&hifi_gan-e976dd6c-2f41-484b-aec2-3e6868d37290"
description {
  language: "de"
  speaker_sex: "female"
  pipeline_owner: "ondewo"
  comments: "trained on public domain dataset"
  speaker_name: "Kerstin"
  domain: "general"
}
active: true
inference {
  type: "composite"
  composite_inference {
    text2mel {
      type: "glow_tts"
      glow_tts {
        batch_size: 5
        length_scale: 1.0
        noise_scale: 0.6669999957084656
        path: "models/glow-tts/de/kerstin_blank.pth"
        param_config_path: "models/glow-tts/de/config_blank.json"
      }
      glow_tts_triton {
        batch_size: 8
        length_scale: 1.0
        noise_scale: 0.6669999957084656
        max_text_length: 100
        param_config_path: "models/glow-tts/de/config_blank.json"
        triton_url: "localhost:50511"
        triton_model_name: "glow_tts"
      }
    }
    mel2audio {
      type: "hifi_gan"
      mb_melgan_triton {
        config_path: "models/mb_melgan/en/config.yml"
       

### Make a synthesize request to the server to get audio for the given text

In [9]:
# Request options: German pipeline, length scale 1.0, WAV audio format.
config = text_to_speech_pb2.RequestConfig(
    t2s_pipeline_id=german_pipeline.id,
    length_scale=1.0,
    pcm=0,
    audio_format=text_to_speech_pb2.AudioFormat.wav,
)
request = text_to_speech_pb2.SynthesizeRequest(text='magic_word', config=config)

# Send the request; the reply is inspected in the following cell.
response = stab.Synthesize(request=request)

In [10]:
# Display the raw reply of the Synthesize call made above.
response

audio_uuid: "3b27e0ad-44ba-48ad-94fc-a108c0b1aab3"
audio: "RIFF$\026\001\000WAVEfmt \020\000\000\000\001\000\001\000\"V\000\000D\254\000\000\002\000\020\000data\000\026\001\000\377\377\007\000\000\000\000\000\010\000\004\000\013\000\005\000\013\000\013\000\t\000\t\000\022\000\014\000\t\000\006\000\010\000\010\000\006\000\004\000\t\000\006\000\004\000\005\000\007\000\007\000\010\000\005\000\007\000\n\000\004\000\004\000\r\000\005\000\006\000\003\000\004\000\t\000\006\000\005\000\010\000\003\000\n\000\004\000\006\000\004\000\001\000\006\000\t\000\007\000\007\000\n\000\r\000\007\000\010\000\t\000\006\000\010\000\010\000\013\000\017\000\014\000\014\000\004\000\t\000\005\000\006\000\013\000\t\000\t\000\n\000\013\000\007\000\005\000\t\000\004\000\007\000\t\000\007\000\005\000\n\000\016\000\t\000\014\000\014\000\016\000\022\000\016\000\022\000\027\000\025\000\020\000\020\000\025\000\026\000\023\000\022\000\025\000\031\000\021\000\026\000\032\000\024\000\r\000\023\000\021\000\026\000\024\000\0

In [11]:
# Synthesize a short German utterance with every request option spelled out.
# NOTE(review): `pcm=0` and `audio_format=0` are raw numbers here — presumably
# the same format as `AudioFormat.wav` used in the earlier cell; confirm.
config = text_to_speech_pb2.RequestConfig(
    t2s_pipeline_id=german_pipeline.id,
    length_scale=1.0,
    pcm=0,
    audio_format=0,
)
request = text_to_speech_pb2.SynthesizeRequest(text='magic_word', config=config)
response = stab.Synthesize(request=request)

# Report the clip length and the generation time returned in the reply.
print(
    f'Length of the generated audio is {response.audio_length} sec.',
    f'Generation time is {response.generation_time} sec.',
)

# Read the returned audio from an in-memory buffer; the result is indexed
# below as [0] for the samples and [1] for the sample rate.
bio = io.BytesIO(response.audio)
audio = sf.read(bio)

# Last expression of the cell: renders an inline audio player.
ipd.Audio(data=audio[0], rate=audio[1])

Length of the generated audio is 1.6137868165969849 sec. Generation time is 0.7160027623176575 sec.


In [12]:
# Synthesize a longer German passage with length scale 1.0.
config = text_to_speech_pb2.RequestConfig(
    t2s_pipeline_id=german_pipeline.id,
    length_scale=1.0,
)
request = text_to_speech_pb2.SynthesizeRequest(
    text='Masada ist eine archäologische Stätte in Israel. Auf einem Gipfelplateau am Rand der Judäischen Wüste',
    config=config,
)
response = stab.Synthesize(request=request)

# Report the clip length and the generation time returned in the reply.
print(
    f'Length of the generated audio is {response.audio_length} sec.',
    f'Generation time is {response.generation_time} sec.',
)

# Read the returned audio from an in-memory buffer; the result is indexed
# below as [0] for the samples and [1] for the sample rate.
bio = io.BytesIO(response.audio)
audio = sf.read(bio)

# Last expression of the cell: renders an inline audio player.
ipd.Audio(data=audio[0], rate=audio[1])

Length of the generated audio is 9.073559761047363 sec. Generation time is 2.945343494415283 sec.


### Adding length scale parameter to make speech faster or slower

In [13]:
# Synthesize an English sentence; only the pipeline id is set on the request config.
config = text_to_speech_pb2.RequestConfig(
    t2s_pipeline_id=english_pipeline.id,
)
request = text_to_speech_pb2.SynthesizeRequest(
    text='Hi, how are you?',
    config=config,
)
response = stab.Synthesize(request=request)

# Report the clip length and the generation time returned in the reply.
print(
    f'Length of the generated audio is {response.audio_length} sec.',
    f'Generation time is {response.generation_time} sec.',
)

# Read the returned audio from an in-memory buffer; the result is indexed
# below as [0] for the samples and [1] for the sample rate.
bio = io.BytesIO(response.audio)
audio = sf.read(bio)

# Last expression of the cell: renders an inline audio player.
ipd.Audio(data=audio[0], rate=audio[1])

Length of the generated audio is 1.05650794506073 sec. Generation time is 0.5147445797920227 sec.


### Get the pipeline you want to update

In [14]:
# Build a request that identifies the English pipeline by its id.
# The next cell builds the same request again before sending it.
request = text_to_speech_pb2.T2sPipelineId(id=english_pipeline.id)

In [15]:
# Identify the English pipeline by id and fetch its configuration from the server.
request = text_to_speech_pb2.T2sPipelineId(id=english_pipeline.id)
pipeline_config = stab.GetT2sPipeline(request=request)

### Change a parameter in the pipeline config, for example the default length scale

In [16]:
# Edit the fetched config in place: set the glow_tts length scale to 2.
# Nothing is sent to the server by this assignment.
pipeline_config.inference.composite_inference.text2mel.glow_tts.length_scale = 2

In [17]:
# Display the locally modified pipeline configuration.
pipeline_config

id: "glow_tts&hifi_gan-e976dd6c-2f41-484b-aec2-3e6868d37280"
description {
  language: "en"
  speaker_sex: "female"
  pipeline_owner: "ondewo"
  comments: "trained on public domain dataset Lj_speech"
  speaker_name: "Linda"
  domain: "general"
}
active: true
inference {
  type: "composite"
  composite_inference {
    text2mel {
      type: "glow_tts"
      glow_tts {
        batch_size: 5
        length_scale: 2.0
        noise_scale: 0.6669999957084656
        path: "models/glow-tts/en/pretrained.pth"
        param_config_path: "models/glow-tts/en/config.json"
      }
      glow_tts_triton {
        batch_size: 8
        length_scale: 1.0
        noise_scale: 0.6669999957084656
        max_text_length: 100
        param_config_path: "models/glow-tts/en/config.json"
        triton_url: "localhost:50511"
        triton_model_name: "glow_tts"
      }
    }
    mel2audio {
      type: "hifi_gan"
      mb_melgan_triton {
        config_path: "models/mb_melgan/en/config.yml"
        stats_p

### Update pipeline

In [18]:
# List pipelines again, this time with an explicit ListT2sPipelinesRequest message.
# NOTE(review): `speaker_sexes=0` is passed as written — confirm the expected
# type and meaning of this field against the proto definition.
r = text_to_speech_pb2.ListT2sPipelinesRequest(speaker_sexes=0)
l = stab.ListT2sPipelines(request=r)
# print() shows the text form of the whole reply message.
print(l)

pipelines {
  id: "glow_tts&hifi_gan-e976dd6c-2f41-484b-aec2-3e6868d37290"
  description {
    language: "de"
    speaker_sex: "female"
    pipeline_owner: "ondewo"
    comments: "trained on public domain dataset"
    speaker_name: "Kerstin"
    domain: "general"
  }
  active: true
  inference {
    type: "composite"
    composite_inference {
      text2mel {
        type: "glow_tts"
        glow_tts {
          batch_size: 5
          length_scale: 1.0
          noise_scale: 0.6669999957084656
          path: "models/glow-tts/de/kerstin_blank.pth"
          param_config_path: "models/glow-tts/de/config_blank.json"
        }
        glow_tts_triton {
          batch_size: 8
          length_scale: 1.0
          noise_scale: 0.6669999957084656
          max_text_length: 100
          param_config_path: "models/glow-tts/de/config_blank.json"
          triton_url: "localhost:50511"
          triton_model_name: "glow_tts"
        }
      }
      mel2audio {
        type: "hifi_gan"
       

### Check whether the generated audio changes according to the updated config

In [19]:
# Send the locally modified pipeline configuration to the server.
stab.UpdateT2sPipeline(request=pipeline_config)



In [20]:
# Synthesize the English sentence again after the pipeline update above.
# The next cell repeats this request and also plays the result.
config = text_to_speech_pb2.RequestConfig(t2s_pipeline_id=english_pipeline.id)
request = text_to_speech_pb2.SynthesizeRequest(text='Hi, how are you?', config=config)
response = stab.Synthesize(request=request)

In [21]:
# Same English request as before the pipeline update, for comparison.
config = text_to_speech_pb2.RequestConfig(
    t2s_pipeline_id=english_pipeline.id,
)
request = text_to_speech_pb2.SynthesizeRequest(
    text='Hi, how are you?',
    config=config,
)
response = stab.Synthesize(request=request)

# Report the clip length and the generation time returned in the reply.
print(
    f'Length of the generated audio is {response.audio_length} sec.',
    f'Generation time is {response.generation_time} sec.',
)

# Read the returned audio from an in-memory buffer; the result is indexed
# below as [0] for the samples and [1] for the sample rate.
bio = io.BytesIO(response.audio)
audio = sf.read(bio)

# Last expression of the cell: renders an inline audio player.
ipd.Audio(data=audio[0], rate=audio[1])

Length of the generated audio is 1.9620862007141113 sec. Generation time is 0.8316806554794312 sec.


### Change the parameter back to its previous value (length_scale = 1.0)

In [22]:
# Fetch the English pipeline's configuration again from the server.
request = text_to_speech_pb2.T2sPipelineId(id=english_pipeline.id)
pipeline_config = stab.GetT2sPipeline(request=request)
# Put the glow_tts length scale back to 1.0 and send the config to the server.
pipeline_config.inference.composite_inference.text2mel.glow_tts.length_scale = 1.0
stab.UpdateT2sPipeline(request=pipeline_config)



In [23]:
# Scratch list whose string form is hashed in the next cell.
l = ['f', 'k']
# NOTE(review): UUID is imported here but not used in the visible cells.
from uuid import UUID

In [24]:
# Non-negative integer derived from the hash of the list's string representation.
# NOTE: str hashes are salted per interpreter process unless PYTHONHASHSEED is
# fixed, so this value is not stable across kernel restarts.
abs(hash(str(l)))

443647944621644862

In [25]:
import torch


In [26]:
# Show the version of torch imported in this kernel.
torch.__version__


'1.7.0'

In [27]:
# Request synthesis of several texts in a single BatchSynthesize call.
# NOTE(review): two texts are passed but only one config — confirm how the
# server pairs entries of `text` with entries of `config`.
config = text_to_speech_pb2.RequestConfig(t2s_pipeline_id=english_pipeline.id)
request = text_to_speech_pb2.BatchSynthesizeRequest(text=['Hola','Chau'], config=[config])
response = stab.BatchSynthesize(request=request)

In [28]:
# Display the raw reply of the BatchSynthesize call made above.
response

response {
  audio_uuid: "107ec1ab-c995-43c3-9c27-3674e62fed0e"
  audio: "RIFF$N\000\000WAVEfmt \020\000\000\000\001\000\001\000\"V\000\000D\254\000\000\002\000\020\000data\000N\000\000\353\377P\000\025\000\356\377\026\000\376\377 \000\345\377\376\377\374\377\004\000\377\377&\000\t\000\342\377\267\377\320\377\336\377\361\377\020\000B\000.\000\035\000\343\377\305\377\026\000\037\000\023\000(\000 \000\364\377\317\377\317\377\240\377\360\3771\000(\000\n\000\321\377$\000]\000\206\000&\000v\377\267\377\327\377\032\000F\000\341\377\313\377\364\377M\000M\0003\000\037\000\347\377\377\377\025\000\340\377\234\377\253\377x\377\242\377\017\000;\000C\000\036\000\002\000J\000\254\000U\000\305\377\305\377\306\377\267\377\265\377\200\377\251\377P\000\256\000\361\000\261\000\"\000\356\377\266\377\244\377\203\377]\377\201\377\304\377\007\000I\000\247\000\271\000\234\000=\000\366\377\212\377u\377\260\377\303\377\370\377\347\377\256\377\353\377L\000s\000\253\000\207\000\024\000\325\377\240\377`\377c\377\2