In [None]:
"""
| Text-to-speech Models          | Voice Styles     |      Size | Note |
| ------------------------------ | ---------------- | --------- | ---- |
| kakao-enterprise/vits-ljs      | Female Voice     |    145 MB | English only, LJ_Speech dataset |

# TTS Datasets
- LJSpeech (13,100 English audio clips, 24 hours) https://huggingface.co/datasets/lj_speech
- MLS (Multilingual LibriSpeech, 8 languages, from LibriVox) https://huggingface.co/datasets/facebook/multilingual_librispeech
- VCTK (Voice Cloning Toolkit, 44 hours, 110 speakers) - https://huggingface.co/datasets/vctk
- Libri-TTS/LibriTTS-R (English only 585 hrs by Google) - https://huggingface.co/datasets/cdminix/libritts-r-aligned

available tasks in pipe():
['audio-classification', 'automatic-speech-recognition', 'conversational',
'depth-estimation', 'document-question-answering', 'feature-extraction', 'fill-mask', 'image-classification',
'image-segmentation', 'image-to-text', 'mask-generation', 'ner', 'object-detection', 'question-answering',
'sentiment-analysis', 'summarization', 'table-question-answering', 'text-classification', 'text-generation',
'text2text-generation', 'token-classification', 'translation', 'video-classification',
'visual-question-answering', 'vqa', 'zero-shot-audio-classification', 'zero-shot-classification',
'zero-shot-image-classification', 'zero-shot-object-detection', 'translation_XX_to_YY']

"""
try:
    import torch, transformers, datasets
except:
    %pip install transformers datasets torch

In [16]:
"""

Does not work in Mac as of April 2024, transformers does not implement VitsModel or text-to-speech task.
"""
import torch
from transformers import pipeline
from transformers import VitsModel, AutoTokenizer

text = """
Today is a new day, full of new possibilities.
Believe in yourself and your abilities, and remember that every accomplishment
starts with a single step. Embrace challenges as opportunities for growth,
and don't be afraid to try something new. You are capable of achieving greatness,
and every day is a fresh chance to make it happen. So rise up,
face the day with a positive attitude, and make today amazing!
"""


class MAI_TextToSpeech_Simple:
    model_id = "kakao-enterprise/vits-ljs"  # 24 hrs, 1 speaker, 145 MB

    # model_id = "kakao-enterprise/vits-vctk"  # 44 hrs, 109 speakers, 160 MB
    def __init__(self, model_id=None):
        if model_id:
            self.model_id = model_id
        self.use_pipe = True

    def tts_pipe(self, text):
        """Use simpler pipe()"""
        pipe = pipeline("text-to-speech")
        output = pipe(text)  # keys: ['audio'][0], ['sampling_rate']
        return output["audio"][0], output["sampling_rate"]

    def tts_vits(self, text):
        """Use VITS - Variational Inference with adversarial learning for end-to-end Text-to-Speech"""
        model = VitsModel.from_pretrained(self.model_id)
        print(model.config)
        tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            output = model(**inputs)  # {'waveform', 'spectrogram'}

        return output.waveform, model.config.sampling_rate

    def generate_audio(self, text):
        if self.use_pipe:
            return self.tts_pipe(text)
        else:
            return self.tts_vits(text)


# Usage
tts = MAI_TextToSpeech_Simple()
waveform, sampling_rate = tts.generate_audio(text)

from IPython.display import Audio as IPythonAudio

IPythonAudio(waveform, rate=sampling_rate)

KeyError: "Unknown task text-to-speech, available tasks are ['audio-classification', 'automatic-speech-recognition', 'conversational', 'depth-estimation', 'document-question-answering', 'feature-extraction', 'fill-mask', 'image-classification', 'image-segmentation', 'image-to-text', 'mask-generation', 'ner', 'object-detection', 'question-answering', 'sentiment-analysis', 'summarization', 'table-question-answering', 'text-classification', 'text-generation', 'text2text-generation', 'token-classification', 'translation', 'video-classification', 'visual-question-answering', 'vqa', 'zero-shot-audio-classification', 'zero-shot-classification', 'zero-shot-image-classification', 'zero-shot-object-detection', 'translation_XX_to_YY']"

In [10]:
print(output)

tensor([[-0.0005, -0.0009, -0.0011,  ..., -0.0011, -0.0011, -0.0009]])


In [18]:
# Need some work.

import torch
import torchaudio

tts = gTTS(text, lang='en')
fp = tts.stream()
for s in fp:
    print(s)


b'\xff\xf3D\xc4\x00\x11\x90\x01\xe4\x01C\x00\x01\x0f\x0f\x0f\x1f\xff\xff\x98\x00\x00\x00\x00\x18xxxx\x00\x00\x00\x00\x18xy\xf3\xbb\xff\xf3?\xe3\x87\x80\x00\x00\x00\x01\x87\x87\x87\x87\x80\x00\x00\x00+\x1d\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xe2?\x00\x000\xf0\xf0\xf0\xf0\x00\x00\x00\x04a\xe3\xd5\xff1\x00d\x00\x01\xd9\x03\x82{\xd5,\xff\xf3D\xc4\x0c\x15\x1ar<\x01\x9b8\x00^\xf0\xf3\xb3\xff\x10~1\xb1\xc0\xba\x10 -\x7f\x12\xc9\x9e\xc5\x8fA\x9f\xe41Ab\xcc\xdf\xb7\xd1I\x93\x95\xfe\xd9\xfc\xa9\xc8\xd1\xff\xff\xa3\x1e~\xaeXx\x81\x88\\\xef\xff\xccm\x18\xfb\xddT\x85\xcec\x8fB\x7f\xfe\x7f\x81%\xc1\xa0\x89\xa2\x8aw\x0e\xccI\xbeh\x89bZ{\xff\xf3D\xc4\n\x14rf\xbc\x01\x89\x10\x00[)\x10\x8c4:\xbd\xf3\x862/\x9d\xf9\x0cT\x07\xeb\xb1\xcf0\xa4\xdd\x1f\xa19\x18\xe1\xca\xa8s\xb2\x91\xba\x1f\xe4i\x0c1a\x07\x12,\xd1\xea5\xcb\xff\xe434\xf65\x19\xc2\n8X\x08 \xf7\x7fZ\x0b\xa4\xa0!l\xb0\xd0)4*[\x8b\xab\n\xe0\xe8?\xdd\xeb/\xd9\xff\xf3D\xc4\x0b\x14\xba\x16\xa0\x01\xcf(\x007j\xefP\xfe\xbd~>r\xa8Diz\xb5\xba*bj\x1e\t