In [1]:
import os

In [2]:
# Change the working directory to its parent so later relative paths and imports
# resolve from there.
# NOTE(review): the path is relative, so re-running this cell moves up one more
# level each time — run it exactly once per kernel session.
os.chdir('../')

In [3]:
# Show the current working directory to confirm where relative paths will resolve.
os.getcwd()

'/home/aryskin/ondewo/ondewo-t2s'

In [4]:
import io

In [5]:
import grpc
from ondewo_grpc.ondewo.t2s import text_to_speech_pb2, text_to_speech_pb2_grpc
import google.protobuf.empty_pb2 as empty_pb2
from google.protobuf.json_format import ParseDict, MessageToDict, MessageToJson

In [6]:
import soundfile as sf

In [7]:
import IPython.display as ipd

In [8]:
# --- Connection settings for the text-to-speech gRPC server ---
CAI_GRPC_HOST: str = "localhost"
CAI_GRPC_PORT: str = "50002"
CHANNEL: str = f"{CAI_GRPC_HOST}:{CAI_GRPC_PORT}"

# Size limit applied to both outgoing and incoming gRPC messages
# (raised so that synthesized audio fits into a single response).
MAX_MESSAGE_LENGTH: int = 60_000_000

# The same limit is used for the send and the receive direction.
options = [
    (option_name, MAX_MESSAGE_LENGTH)
    for option_name in (
        'grpc.max_send_message_length',
        'grpc.max_receive_message_length',
    )
]

# Unencrypted channel to the server, plus the client stub used by every
# request cell below (note: the variable is spelled `stab`, not `stub`).
channel = grpc.insecure_channel(CHANNEL, options=options)
stab = text_to_speech_pb2_grpc.Text2SpeechStub(channel)

In [9]:
# Ask the server for every registered pipeline. The rest of the notebook works
# with the first two, so fail with a clear message if fewer are available
# instead of an opaque tuple-unpacking error, and tolerate servers that expose
# more than two pipelines.
pipelines = list(stab.ListT2sPipelines(request=empty_pb2.Empty()).pipelines)
if len(pipelines) < 2:
    raise ValueError(
        f"Expected at least 2 T2S pipelines from the server, got {len(pipelines)}."
    )
# NOTE(review): later cells send German text to pipeline1 and English text to
# pipeline2 — confirm the server returns the pipelines in that order.
pipeline1, pipeline2 = pipelines[:2]

In [10]:
# Display the full configuration message of the first pipeline.
pipeline1

id: "glow_tts&hifi_gan-e976dd6c-2f41-484b-aec2-3e6868d37280"
description {
  language: "de"
  speaker_sex: "male"
  pipeline_owner: "ondewo"
  comments: "trained on public domain dataset"
}
active: true
inference {
  type: "composite"
  composite_inference {
    text2mel {
      type: "glow_tts"
      glow_tts {
        batch_size: 5
        use_gpu: true
        length_scale: 0.5
        noise_scale: 0.6669999957084656
        path: "models/glow-tts/de/thorsten_blank.pth"
        param_config_path: "models/glow-tts/de/config_blank.json"
      }
      glow_tts_triton {
        batch_size: 8
        length_scale: 1.0
        noise_scale: 0.6669999957084656
        max_text_length: 100
        param_config_path: "models/glow-tts/de/config.json"
        triton_url: "localhost:8001"
        triton_model_name: "glow_tts"
      }
    }
    mel2audio {
      type: "hifi_gan"
      waveglow_triton {
        param_config_path: "models/waveglow/waveglow.yaml"
        sigma: 0.6000000238418579
  

In [13]:
# Synthesize a sentence containing years and a date with the first pipeline.
request = text_to_speech_pb2.SynthesizeRequest(
    text='Hallo, wie geht es dir Herr 1980 2020 3. April?',
    t2s_pipeline_id=pipeline1.id,
    length_scale=1.0,
    pcm=0,
)
response = stab.Synthesize(request=request)

# Wrap the returned audio bytes in an in-memory file so soundfile can decode them.
bio = io.BytesIO(response.audio)

# `audio` is the pair returned by sf.read; index 0 is passed to the player as
# the samples and index 1 as the sample rate.
audio = sf.read(bio)

# Last expression of the cell: renders an inline audio player.
ipd.Audio(audio[0], rate=audio[1])

In [16]:
# Synthesize an English sentence mixing digits and spelled-out numbers with the
# second pipeline (no length_scale override is sent this time).
request = text_to_speech_pb2.SynthesizeRequest(
    text='Hi, how are you Mr. 29 fortyfive,  seventy eight?',
    t2s_pipeline_id=pipeline2.id,
    pcm=0,
)
response = stab.Synthesize(request=request)

# Print the response metadata next to the text that was sent.
print(
    response.audio_format,
    response.audio_length,
    response.generation_time,
    response.text,
)

# Decode the returned audio bytes from an in-memory file.
bio = io.BytesIO(response.audio)
audio = sf.read(bio)

# Last expression of the cell: renders an inline audio player.
ipd.Audio(audio[0], rate=audio[1])

0 4.446621417999268 0.2350650280714035 Hi, how are you Mr. 29 fortyfive,  seventy eight?


In [13]:
# Fetch the stored configuration of the first pipeline by its id.
# Note: this rebinds `request` and `response`, which the synthesis cells also use,
# so `response` no longer holds audio after this cell runs.
request = text_to_speech_pb2.T2sPipelineId(id=pipeline1.id)
response = stab.GetT2sPipeline(request=request)

In [14]:
# Convert the pipeline configuration message into a plain dict, keeping the
# original proto field names and including fields that hold default values.
# NOTE(review): newer protobuf releases renamed `including_default_value_fields`
# (to `always_print_fields_with_no_presence`) — confirm against the installed version.
config1 = MessageToDict(response, including_default_value_fields=True,preserving_proto_field_name=True)

In [15]:
# Same conversion as a JSON string instead of a dict.
# NOTE(review): `config2` is not used by any other cell visible here; the
# `including_default_value_fields` keyword was renamed in newer protobuf
# releases — confirm against the installed version.
config2 = MessageToJson(response, including_default_value_fields=True,preserving_proto_field_name=True)

In [16]:
# Override the glow_tts length_scale in the dict form of the configuration.
# This mutates `config1` in place; the server is not touched until the update call.
config1['inference']['composite_inference']['text2mel']['glow_tts']['length_scale'] = 0.5

In [17]:
# Rebuild a Text2SpeechConfig message from the edited dict so it can be sent back.
conf = ParseDict(config1,text_to_speech_pb2.Text2SpeechConfig())

In [18]:
# Send the edited configuration to the server.
# NOTE(review): presumably the pipeline is matched by the `id` field carried
# inside `conf` — verify against the service definition.
stab.UpdateT2sPipeline(request=conf)



In [19]:
# Inspect the edited configuration dict.
config1

{'id': 'glow_tts&hifi_gan-e976dd6c-2f41-484b-aec2-3e6868d37280',
 'description': 'German, male, thorsten',
 'active': True,
 'inference': {'type': 'composite',
  'composite_inference': {'text2mel': {'type': 'glow_tts',
    'glow_tts': {'batch_size': '5',
     'use_gpu': True,
     'length_scale': 0.5,
     'noise_scale': 0.667,
     'path': 'models/glow-tts/de/thorsten_blank.pth',
     'param_config_path': 'models/glow-tts/de/config_blank.json',
     'cleaners': []},
    'glow_tts_triton': {'batch_size': '8',
     'length_scale': 1.0,
     'noise_scale': 0.667,
     'max_text_length': '100',
     'param_config_path': 'models/glow-tts/de/config.json',
     'triton_url': 'localhost:8001',
     'triton_model_name': 'glow_tts',
     'cleaners': []}},
   'mel2audio': {'type': 'hifi_gan',
    'waveglow_triton': {'param_config_path': 'models/waveglow/waveglow.yaml',
     'sigma': 0.6,
     'max_spect_size': '1000',
     'triton_model_name': 'waveglow',
     'triton_url': 'localhost:8001'},
  

In [19]:
# Check the Python type of batch_size on the message fetched from the server
# (the dict form shows it as a string, e.g. '5').
type(response.inference.composite_inference.text2mel.glow_tts.batch_size)

int

In [20]:
# Same check on the message rebuilt from the dict: confirms that ParseDict
# turned the string-valued batch_size back into an int.
type(conf.inference.composite_inference.text2mel.glow_tts.batch_size)

int

In [20]:
# Scratch check: str(None) yields the literal string 'None', not an empty string.
str(None)

'None'

In [25]:
# Inspect the decoded audio tuple (samples array, sample rate).
# NOTE(review): `audio` holds whatever the most recently executed synthesis cell
# produced — which one depends on execution order.
audio

(array([ 0.00372314,  0.00427246,  0.00387573, ..., -0.00134277,
        -0.00161743, -0.0012207 ]),
 22050)

In [43]:
# Save the most recently decoded audio as a 16-bit PCM CAF file.
out = 'tmp/audio_file.caf'
# The path is relative to the current working directory; create the target
# folder first, because sf.write does not create missing directories.
os.makedirs(os.path.dirname(out), exist_ok=True)
# audio[0] holds the samples and audio[1] the sample rate.
sf.write(out, audio[0], samplerate=audio[1], format="caf", subtype='PCM_16')

In [44]:
import time

In [58]:
# Record the start of a manual timing measurement.
start_time = time.perf_counter()

In [59]:
# Record the end of the manual timing measurement.
# NOTE(review): whatever was being timed is not part of the cells visible here.
end_time = time.perf_counter()

In [60]:
# Elapsed time in seconds between the two perf_counter() readings.
end_time - start_time

4.380929998000283