In [1]:
import requests

# Ask the service to answer with JSON.
headers = {
    'accept': 'application/json',
}

# Query-string parameters sent with the request.
# NOTE(review): "file_response": "false" presumably selects a JSON body over a
# file download -- confirm against the server's API.
params = {
    "text": "Text to Speech with voice cloning, only use text normalization natively provided by the model",
    "file_response": "false",
    "response_format": "mp3"
}

# Open the reference clip in a context manager so the file handle is closed as
# soon as the upload has finished instead of leaking for the kernel's lifetime.
with open('../stress-test/seedtts_ref_en_1.wav', 'rb') as reference_audio:
    files = {
        # NOTE(review): the clip on disk is a .wav but it is labelled
        # audio.mp3 / audio/mpeg here; kept unchanged because the server's
        # handling of these labels is not visible -- confirm.
        'audio_input': ('audio.mp3', reference_audio, 'audio/mpeg'),
    }
    response = requests.post('http://localhost:7088/tts', headers=headers, files=files, params=params)

# Fail here, with the HTTP status in the message, rather than further down
# when the body is parsed and indexed.
response.raise_for_status()

In [2]:
import base64
import io
import soundfile as sf

In [3]:
# Parse the JSON body of the TTS response, then display its top-level keys
# together with the 'stats' entry.
r = response.json()
r.keys(), r['stats']

(dict_keys(['audio', 'stats']),
 {'total_length': 9.685333333333332, 'seconds_per_second': 2.798833315684127})

In [4]:
# The 'audio' field is base64 text: decode it to bytes, wrap the bytes in an
# in-memory file object, and let soundfile read samples + sample rate from it.
encoded_audio = r['audio']
audio_binary = base64.b64decode(encoded_audio)
buffer = io.BytesIO(audio_binary)
audio_array, sample_rate = sf.read(buffer)

# Show (duration in seconds, sample rate).
duration_seconds = len(audio_array) / sample_rate
duration_seconds, sample_rate

(9.685333333333332, 24000)

In [5]:
import IPython.display as ipd

# Render an inline audio player for the decoded samples.
ipd.Audio(data=audio_array, rate=sample_rate)

In [6]:
import aiohttp

async def process(file_path, text, url="http://localhost:7088/tts", response_format="mp3"):
    """POST one reference clip plus text to the TTS endpoint and return the JSON reply.

    Args:
        file_path: Path of the reference audio file to upload as ``audio_input``.
        text: Text sent in the ``text`` query parameter.
        url: Endpoint to POST to. Defaults to the local service this notebook
            targets, so existing two-argument calls behave as before.
        response_format: Value of the ``response_format`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        Otherwise the status and response text are printed and ``None`` is
        returned, so callers must check for ``None`` before indexing.

    Raises:
        OSError: If ``file_path`` cannot be opened.
    """
    headers = {
        "accept": "application/json"
    }

    params = {
        "text": text,
        "file_response": "false",
        "response_format": response_format
    }

    # A fresh session per call keeps each invocation self-contained.
    async with aiohttp.ClientSession() as session:
        # The file must stay open until the request has been sent, so the
        # POST happens inside this ``with`` block.
        with open(file_path, "rb") as file:
            form_data = aiohttp.FormData()
            # NOTE(review): the upload is always labelled audio.mp3 / audio/mpeg
            # regardless of the real file type; kept unchanged because the
            # server's handling of these labels is not visible -- confirm.
            form_data.add_field(
                "audio_input",
                file,
                filename="audio.mp3",
                content_type="audio/mpeg"
            )

            async with session.post(url, headers=headers, data=form_data, params=params) as response:
                if response.status == 200:
                    return await response.json()

                # Best-effort reporting: surface the failure but let sibling
                # requests running under asyncio.gather continue.
                print(f"Failed with status: {response.status}")
                print("Response:", await response.text())
                return None

In [7]:
from glob import glob
import asyncio

# Both requests reuse the same reference clip; only the text differs.
files = ['../stress-test/seedtts_ref_en_3.wav'] * 2

texts = [
    "Text to Speech with voice cloning, only use text normalization natively provided by the model.",
    "Your safety and the pack's reputation are at stake. Your bravery is admirable, but sometimes bravery is knowing when to retreat.",
]

# Pair each reference clip with its text: one (file, text) tuple per request.
combined = [(path, sentence) for path, sentence in zip(files, texts)]
combined

[('../stress-test/seedtts_ref_en_3.wav',
  'Text to Speech with voice cloning, only use text normalization natively provided by the model.'),
 ('../stress-test/seedtts_ref_en_3.wav',
  "Your safety and the pack's reputation are at stake. Your bravery is admirable, but sometimes bravery is knowing when to retreat.")]

In [8]:
futures = [process(*c) for c in combined]
results = await asyncio.gather(*futures)

In [9]:
# 'stats' entry of the JSON returned for the first (file, text) pair.
results[0]['stats']

{'total_length': 6.346666666666667, 'seconds_per_second': 0.20082433034979516}

In [10]:
# 'stats' entry of the JSON returned for the second (file, text) pair.
results[1]['stats']

{'total_length': 8.629333333333333, 'seconds_per_second': 0.272747353639242}

In [11]:
# Decode the first concurrent result: base64 text -> bytes -> in-memory file
# -> samples and sample rate via soundfile.
encoded_audio = results[0]['audio']
audio_binary = base64.b64decode(encoded_audio)
buffer = io.BytesIO(audio_binary)
audio_array, sample_rate = sf.read(buffer)

# Print duration (seconds) and sample rate, then render an inline player.
duration_seconds = len(audio_array) / sample_rate
print(duration_seconds, sample_rate)
ipd.Audio(data=audio_array, rate=sample_rate)

6.346666666666667 24000


In [12]:
# Decode the second concurrent result: base64 text -> bytes -> in-memory file
# -> samples and sample rate via soundfile.
encoded_audio = results[1]['audio']
audio_binary = base64.b64decode(encoded_audio)
buffer = io.BytesIO(audio_binary)
audio_array, sample_rate = sf.read(buffer)

# Print duration (seconds) and sample rate, then render an inline player.
duration_seconds = len(audio_array) / sample_rate
print(duration_seconds, sample_rate)
ipd.Audio(data=audio_array, rate=sample_rate)

8.629333333333333 24000
