## Load Dataset

In [1]:
# load Trump dataset
from datasets import load_dataset
trump_dataset = load_dataset("tuenguyen/trump-speech-dataset-tts")

In [2]:
# listen to sample
from IPython.display import Audio
trump_sample = trump_dataset["train"][0]
audio_array = trump_sample["path"]["array"]
transcript = trump_sample["transcript"]
sr = trump_sample["path"]["sampling_rate"]
audio_data = Audio(audio_array, rate=sr)

In [3]:
# write the audio data to a wav file
with open("trump_sample.wav", "wb") as opt_file:
    opt_file.write(audio_data.data)

In [4]:
sample_text = '''
Marijuana legalization presents an opportunity to promote public health, reduce crime, and generate tax revenue. 
Evidence suggests that regulation can ensure safer consumption, reduce the burden on the criminal justice system, and direct law enforcement resources toward more serious crimes. 
Additionally, the economic benefits of legal cannabis industries, including job creation and tax revenue, have been demonstrated in states where it is already legal. 
By focusing on regulation over prohibition, we can ensure a more effective and balanced approach to marijuana use and its societal impact.
'''

## Tortoise

In [1]:
import torch
from TTS.api import TTS

# get device
device = "cuda" if torch.cuda.is_available() else "cpu"
#initialize TTS
tortoise = TTS("tts_models/en/multi-dataset/tortoise-v2").to(device)

GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [24]:
voice_samples = [torch.tensor(audio_array)]
print(voice_samples)

[tensor([-0.0449, -0.0230,  0.1235,  ...,  0.2088,  0.2020,  0.1975],
       dtype=torch.float64)]


In [25]:
tortoise.tts_to_file(
    sample_text,
    file_path="trump_output_tortoise.wav",
    voice_samples=voice_samples
)

TypeError: TTS.tts.models.tortoise.Tortoise.inference_with_config() got multiple values for keyword argument 'voice_samples'

## Bark

In [11]:
import torch
from TTS.api import TTS

# get device
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/bark").to(device)

  WeightNorm.apply(module, name, dim)


In [13]:
tts.tts_to_file(
    sample_text,
    speaker_wav=["trump_sample.wav"],
    file_path="trump_output_bark.wav"
)

  7%|▋         | 7/100 [00:02<00:25,  3.68it/s]

KeyboardInterrupt: 

## XTTS

In [6]:
from TTS.api import TTS

device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

In [7]:
tts.tts_to_file(
    sample_text, 
    speaker_wav=["trump_sample.wav"],
    language="en",
    file_path="trump_output_xtts.wav"
)

'trump_output_xtts.wav'

## VITS 

In [8]:
from TTS.api import TTS
device = "cuda" if torch.cuda.is_available() else "cpu"
vits = TTS("tts_models/en/ljspeech/vits").to(device)

In [10]:
vits.tts_to_file(
    sample_text,
    speaker_wav=["trump_sample.wav"],
    file_path="trump_output_vits.wav"
)

'trump_output_vits.wav'

## Vevo

In [5]:
import os
from huggingface_hub import snapshot_download

from models.vc.vevo.vevo_utils import *

def vevo_tts(
    src_text,
    ref_wav_path,
    timbre_ref_wav_path=None,
    output_path=None,
    ref_text=None,
    src_language="en",
    ref_language="en"
):
    if timbre_ref_wav_path is None:
        timbre_ref_wav_path = ref_wav_path
    
    gen_audio = inference_pipeline.inference_ar_and_fm(
        src_wav_path=None,
        src_text=src_text,
        style_ref_wav_path=ref_wav_path,
        timbre_ref_wav_path=timbre_ref_wav_path,
        style_ref_wav_text=ref_text,
        src_text_language=src_language,
        style_ref_wav_text_language=ref_language,
    )
    
    assert output_path is not None
    save_audio(gen_audio, output_path=output_path)

ModuleNotFoundError: No module named 'models'

In [6]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
## content style tokenizer
local_dir = snapshot_download(
    repo_id="amphion/Vevo",
    repo_type="model",
    cache_dir="./ckpts/Vevo",
    allow_patterns=["tokenizer/vq8192/*"]
)

content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
## autoregressive transformer
local_dir = snapshot_download(
    repo_id="amphion/Vevo",
    repo_type="model",
    cache_dir="./ckpts/Vevo",
    allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
)

ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
## flow matching transformer
local_dir = snapshot_download(
    repo_id="amphion/Vevo",
    repo_type="model",
    cache_dir="./ckpts/Vevo",
    allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
)

fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
## vocoder
local_dir = snapshot_download(
    repo_id="amphion/Vevo",
    repo_type="model",
    cache_dir="./ckpts/Vevo",
    allow_patterns=["acoustic_modeling/Vocoder/*"],
)

vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
inference_pipeline = VevoInferencePipeline(
    content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
    ar_cfg_path=ar_cfg_path,
    ar_ckpt_path=ar_ckpt_path,
    fmt_cfg_path=fmt_cfg_path,
    fmt_ckpt_path=fmt_ckpt_path,
    vocoder_cfg_path=vocoder_cfg_path,
    vocoder_ckpt_path=vocoder_ckpt_path,
    device=device,
)

#Params of AR model: 743.30 M
#Params of Flow Matching model: 337.69 M
#Params of Vocoder model: 255.04 M


  WeightNorm.apply(module, name, dim)


#Params of Content-Style Tokenizer: 44.29 M


In [15]:
### Zero-shot TTS (sample style and timbre reference)
vevo_tts(
    sample_text,
    "trump_sample.wav",
    output_path="trump_output_vevo.wav",
    ref_text=transcript,
    src_language="en",
    ref_language="en"
)

ImportError: cannot import name 'setLangfilters' from 'LangSegment.LangSegment' (/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/LangSegment/LangSegment.py)