In [None]:
# Dependency probe: import every third-party package this notebook relies on
# so that a missing one is reported here instead of halfway through a later cell.
try:
    import transformers, datasets, accelerate, soundfile, speechbrain
except ImportError as exc:
    # Catch only import failures. A bare ``except:`` would also swallow
    # KeyboardInterrupt and unrelated errors raised while importing.
    # Nothing is installed automatically (the %pip line below is commented
    # out), so report what is missing rather than claiming an install is running.
    print(f"Missing module {exc.name!r}. Uncomment the %pip line below "
          "(or install the packages manually) and re-run this cell.")
#    %pip install -q transformers datasets accelerate soundfile speechbrain

"""
SpeechT5 is conditioned on speaker embeddings, so one model can produce several different voices.

| Text-to-speech Models          | Voice Styles     |      Size | Note |
| ------------------------------ | ---------------- | --------- | ---- |
| microsoft/speecht5_tts         | Female Voice     |    585 MB | English only, LJ_Speech dataset |

# TTS Datasets
- LJSpeech (13,100 English audio clips, 24 hours) https://huggingface.co/datasets/lj_speech
- MLS (Multilingual LibriSpeech, 8 languages, from LibriVox) https://huggingface.co/datasets/facebook/multilingual_librispeech
- VCTK (Voice Cloning Toolkit, 44 hours, 110 speakers) - https://huggingface.co/datasets/vctk
- Libri-TTS/LibriTTS-R (English only 585 hrs by Google) - https://huggingface.co/datasets/cdminix/libritts-r-aligned

Reference:
- SpeechT5 - https://huggingface.co/blog/speecht5
- Fine-tuning SpeechT5 for multilingual TTS - https://colab.research.google.com/drive/1i7I5pzBcU3WDFarDnzweIj4-sVVoIUFJ
"""


In [1]:
# Sample passage fed to the synthesizer. The value is one long line wrapped
# in a leading and a trailing newline; adjacent literals are concatenated by
# the parser, so no line-continuation backslashes are needed.
text = (
    "\n"
    "Today is a new day, full of new possibilities. "
    "Believe in yourself and your abilities, and remember that every accomplishment "
    "starts with a single step. Embrace challenges as opportunities for growth, "
    "and don't be afraid to try something new. You are capable of achieving greatness, "
    "and every day is a fresh chance to make it happen. So rise up, "
    "face the day with a positive attitude, and make today amazing!"
    "\n"
)

import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
from transformers import SpeechT5HifiGan
from datasets import load_dataset
class TTS_t5():
    """Text-to-speech wrapper around the SpeechT5 checkpoint (~585 MB).

    The processor and the acoustic model are loaded in ``__init__``. The
    HiFi-GAN vocoder and the speaker x-vector dataset are loaded the first
    time they are needed and then cached on the instance, so repeated
    ``tts`` calls do not reload them.

    The sample rate used by SpeechT5 is always 16 kHz.
    """

    model_id = "microsoft/speecht5_tts"
    vocoder_id = "microsoft/speecht5_hifigan"
    embeddings_dataset_id = "Matthijs/cmu-arctic-xvectors"
    sampling_rate = 16000

    # Lazily filled caches. They are class-level defaults so that the
    # attributes exist even on an instance whose ``__init__`` was bypassed.
    _vocoder = None
    _embeddings_dataset = None

    def __init__(self):
        self.processor = SpeechT5Processor.from_pretrained(self.model_id)
        self.model = SpeechT5ForTextToSpeech.from_pretrained(self.model_id)

    def _get_vocoder(self):
        """Return the spectrogram-to-waveform vocoder, loading it on first use."""
        if self._vocoder is None:
            self._vocoder = SpeechT5HifiGan.from_pretrained(self.vocoder_id)
        return self._vocoder

    def _get_speaker_embeddings(self, speaker_id):
        """Return the x-vector stored at row ``speaker_id`` with a leading batch axis."""
        if self._embeddings_dataset is None:
            # NOTE: in the run recorded in this notebook this call raised
            # FileNotFoundError. Passing ``speaker_embeddings`` to ``tts``
            # avoids the dataset download entirely.
            self._embeddings_dataset = load_dataset(
                self.embeddings_dataset_id, split="validation"
            )
        xvector = self._embeddings_dataset[speaker_id]["xvector"]
        return torch.tensor(xvector).unsqueeze(0)

    def tts(self, text, speaker_id=7315, speaker_embeddings=None):
        """Synthesize ``text`` and return ``(speech, sampling_rate)``.

        Args:
            text: The text to speak.
            speaker_id: Row index into the x-vector dataset that selects the
                voice. Used only when ``speaker_embeddings`` is None.
                NOTE(review): earlier comments in this file listed rows
                7306-7315 as female and 7316-7325 as male voices; that
                mapping is unverified -- confirm against the dataset.
            speaker_embeddings: Optional ready-made speaker embedding (a
                tensor or a sequence of floats). A 1-D value is given a
                leading batch axis. When provided, the x-vector dataset is
                not loaded and ``speaker_id`` is ignored.

        Returns:
            A tuple ``(speech, 16000)`` where ``speech`` is the vocoder
            output for the generated spectrogram.
        """
        inputs = self.processor(text=text, return_tensors="pt")

        if speaker_embeddings is None:
            speaker_embeddings = self._get_speaker_embeddings(speaker_id)
        else:
            speaker_embeddings = torch.as_tensor(speaker_embeddings, dtype=torch.float32)
            if speaker_embeddings.dim() == 1:
                speaker_embeddings = speaker_embeddings.unsqueeze(0)

        vocoder = self._get_vocoder()
        # Inference only: no autograd graph is needed for either stage.
        with torch.no_grad():
            spectrogram = self.model.generate_speech(inputs["input_ids"], speaker_embeddings)
            speech = vocoder(spectrogram)

        # One-step alternative:
        # speech = self.model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
        return speech, self.sampling_rate

    def match_voice(self):
        """Fetch one example from the LibriSpeech demo set.

        NOTE(review): this method looks unfinished. The original selected
        the example and discarded it; it is now returned so the lookup is
        usable. How it should feed voice matching is still to be decided.
        """
        dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
        dataset = dataset.sort("id")
        example = dataset[40]
        return example

from IPython.display import Audio as IPythonAudio

# Build the synthesizer once; constructing it loads the SpeechT5 processor
# and model.
model_tts_t5 = TTS_t5()

# tts() hands back the waveform together with its sample rate.
waveform, sampling_rate = model_tts_t5.tts(text=text)

# Last expression of the cell, so the notebook renders an inline audio player.
IPythonAudio(data=waveform, rate=sampling_rate)

preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

FileNotFoundError: Unable to find 'hf://datasets/Matthijs/cmu-arctic-xvectors@36e87b347a6a70f0420445b02ec40c55556f9ed7/default/validation/0000.parquet' with any supported extension ['.csv', '.tsv', '.json', '.jsonl', '.parquet', '.geoparquet', '.gpq', '.arrow', '.txt', '.tar', '.blp', '.bmp', '.dib', '.bufr', '.cur', '.pcx', '.dcx', '.dds', '.ps', '.eps', '.fit', '.fits', '.fli', '.flc', '.ftc', '.ftu', '.gbr', '.gif', '.grib', '.h5', '.hdf', '.png', '.apng', '.jp2', '.j2k', '.jpc', '.jpf', '.jpx', '.j2c', '.icns', '.ico', '.im', '.iim', '.tif', '.tiff', '.jfif', '.jpe', '.jpg', '.jpeg', '.mpg', '.mpeg', '.msp', '.pcd', '.pxr', '.pbm', '.pgm', '.ppm', '.pnm', '.psd', '.bw', '.rgb', '.rgba', '.sgi', '.ras', '.tga', '.icb', '.vda', '.vst', '.webp', '.wmf', '.emf', '.xbm', '.xpm', '.BLP', '.BMP', '.DIB', '.BUFR', '.CUR', '.PCX', '.DCX', '.DDS', '.PS', '.EPS', '.FIT', '.FITS', '.FLI', '.FLC', '.FTC', '.FTU', '.GBR', '.GIF', '.GRIB', '.H5', '.HDF', '.PNG', '.APNG', '.JP2', '.J2K', '.JPC', '.JPF', '.JPX', '.J2C', '.ICNS', '.ICO', '.IM', '.IIM', '.TIF', '.TIFF', '.JFIF', '.JPE', '.JPG', '.JPEG', '.MPG', '.MPEG', '.MSP', '.PCD', '.PXR', '.PBM', '.PGM', '.PPM', '.PNM', '.PSD', '.BW', '.RGB', '.RGBA', '.SGI', '.RAS', '.TGA', '.ICB', '.VDA', '.VST', '.WEBP', '.WMF', '.EMF', '.XBM', '.XPM', '.aiff', '.au', '.avr', '.caf', '.flac', '.htk', '.svx', '.mat4', '.mat5', '.mpc2k', '.ogg', '.paf', '.pvf', '.raw', '.rf64', '.sd2', '.sds', '.ircam', '.voc', '.w64', '.wav', '.nist', '.wavex', '.wve', '.xi', '.mp3', '.opus', '.AIFF', '.AU', '.AVR', '.CAF', '.FLAC', '.HTK', '.SVX', '.MAT4', '.MAT5', '.MPC2K', '.OGG', '.PAF', '.PVF', '.RAW', '.RF64', '.SD2', '.SDS', '.IRCAM', '.VOC', '.W64', '.WAV', '.NIST', '.WAVEX', '.WVE', '.XI', '.MP3', '.OPUS', '.zip']

In [None]:
from speecht5.utils import get_voices

ModuleNotFoundError: No module named 'speecht5'