# Install libraries (run only ONE of the two cells below: GPU or no GPU)

In [None]:
# IF GPU
!pip install piper-tts
!pip install onnxruntime-gpu

In [None]:
# If no GPU
!pip install piper-tts

# Create output folder

In [None]:
!mkdir piper_output

# Import libraries

In [None]:
import time
import wave
from pathlib import Path
from typing import Any, Dict
import numpy as np
import torch

from piper import PiperVoice
from piper.download import ensure_voice_exists, find_voice, get_voices

In [None]:
# Voices to benchmark. Each voice name maps to its own (initially empty) list,
# which the benchmark cell fills with one synthesis duration per run.
model_list = {
    name: []
    for name in (
        "en_GB-alan-low",
        "en_GB-alan-medium",
        "en_GB-northern_english_male-medium",
        "en_US-lessac-high",
    )
}

# Sentence that every voice is asked to synthesize.
text = "This is sample text that will be convert to speech."

In [None]:
# Benchmark: for every voice in `model_list`, make sure its model files are
# available (downloading them when missing), synthesize `text` into a WAV file
# and append the elapsed seconds to that voice's list in `model_list`.

# --- Settings (loop-invariant, so they are set once) -------------------------
n_runs = 1  # number of timed synthesis runs per voice

download_dir = 'drive/MyDrive/piper_models/'  # target directory for downloads
data_dir = ['drive/MyDrive/piper_models/']    # directories searched for voices

# Synthesis options, forwarded as-is to `voice.synthesize`.
speaker = None
length_scale = None
noise_scale = None
noise_w = None
sentence_silence = 0.0

cuda = torch.cuda.is_available()  # request CUDA only when torch can see a GPU
update_voices = False
debug = False  # not read anywhere in this cell

synthesize_args = {
    "speaker_id": speaker,
    "length_scale": length_scale,
    "noise_scale": noise_scale,
    "noise_w": noise_w,
    "sentence_silence": sentence_silence,
}

# Validate the input once, before any model is loaded, instead of per iteration.
text = text.strip()
if not text:
    raise ValueError("No text provided")

# Guarantee the output folder exists so wave.open() cannot fail on a fresh run.
Path("piper_output").mkdir(parents=True, exist_ok=True)

device_tag = "gpu" if cuda else "cpu"  # file-name prefix separating GPU/CPU runs

# --- Benchmark ---------------------------------------------------------------
for run_idx in range(n_runs):  # named `run_idx` so the builtin `iter` stays usable
    for model_name in model_list.keys():
        model = f'{download_dir}/{model_name}.onnx'
        config = f'{download_dir}/{model_name}.onnx.json'
        output_file_name = f"piper_output/{device_tag}_{model_name}.onnx.wav"

        model_path = Path(model)

        if not model_path.exists():
            # Model not on disk yet: load the voice catalogue and download it.
            voices_info = get_voices(download_dir, update_voices=update_voices)

            # Also register every alias so old voice names resolve to the
            # same catalogue entry.
            aliases_info: Dict[str, Any] = {}
            for voice_info in voices_info.values():
                for voice_alias in voice_info.get("aliases", []):
                    aliases_info[voice_alias] = {"_is_alias": True, **voice_info}

            voices_info.update(aliases_info)

            # The download helpers are given the voice *name*: the catalogue
            # is keyed by name, and passing the full .onnx path (as this cell
            # used to) can never match an entry.
            ensure_voice_exists(model_name, data_dir, download_dir, voices_info)
            model, config = find_voice(model_name, data_dir)

        voice = PiperVoice.load(model, config_path=config, use_cuda=cuda)

        # perf_counter() is the clock meant for measuring short intervals.
        # The timed region covers opening the WAV file, synthesis and closing.
        start_time = time.perf_counter()
        with wave.open(output_file_name, "wb") as wav_file:
            voice.synthesize(text, wav_file, **synthesize_args)
        model_list[model_name].append(time.perf_counter() - start_time)


In [None]:
# Report, per voice, the mean of the recorded synthesis durations (in seconds,
# rounded to 4 decimals).
for voice_name, durations in model_list.items():
    mean_seconds = np.round(np.mean(durations), 4)
    print(voice_name, ":", mean_seconds)