## Load Dataset

In [3]:
# load Trump dataset
from datasets import load_dataset
trump_dataset = load_dataset("tuenguyen/trump-speech-dataset-tts")

In [4]:
# listen to sample
from IPython.display import Audio
trump_sample = trump_dataset["train"][0]
audio_array = trump_sample["path"]["array"]
sr = trump_sample["path"]["sampling_rate"]
audio_data = Audio(audio_array, rate=sr)

In [5]:
# write the audio data to a wav file
with open("trump_sample.wav", "wb") as opt_file:
    opt_file.write(audio_data.data)

In [6]:
sample_text = '''
Marijuana legalization presents an opportunity to promote public health, reduce crime, and generate tax revenue. 
Evidence suggests that regulation can ensure safer consumption, reduce the burden on the criminal justice system, and direct law enforcement resources toward more serious crimes. 
Additionally, the economic benefits of legal cannabis industries, including job creation and tax revenue, have been demonstrated in states where it is already legal. 
By focusing on regulation over prohibition, we can ensure a more effective and balanced approach to marijuana use and its societal impact.
'''

## Tortoise

In [1]:
import torch
from TTS.api import TTS

# get device
device = "cuda" if torch.cuda.is_available() else "cpu"
#initialize TTS
tortoise = TTS("tts_models/en/multi-dataset/tortoise-v2").to(device)

GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [24]:
voice_samples = [torch.tensor(audio_array)]
print(voice_samples)

[tensor([-0.0449, -0.0230,  0.1235,  ...,  0.2088,  0.2020,  0.1975],
       dtype=torch.float64)]


In [25]:
tortoise.tts_to_file(
    sample_text,
    file_path="trump_output_tortoise.wav",
    voice_samples=voice_samples
)

TypeError: TTS.tts.models.tortoise.Tortoise.inference_with_config() got multiple values for keyword argument 'voice_samples'

## Bark

In [None]:
import torch
from TTS.api import TTS

# get device
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/bark").to(device)
tts.tts_with_vc_to_file(
    sample_text,
    speaker_wav="trump_sample.wav",
    file_path="trump_output_bark.wav"
)

  WeightNorm.apply(module, name, dim)
100%|██████████| 100/100 [00:25<00:00,  3.89it/s]
100%|██████████| 32/32 [01:45<00:00,  3.31s/it]
100%|██████████| 100/100 [00:25<00:00,  3.88it/s]
100%|██████████| 33/33 [01:47<00:00,  3.27s/it]
100%|██████████| 100/100 [00:27<00:00,  3.62it/s]
100%|██████████| 36/36 [02:07<00:00,  3.55s/it]
100%|██████████| 100/100 [00:22<00:00,  4.45it/s]
100%|██████████| 30/30 [01:43<00:00,  3.44s/it]
Failed to deserialize field: test_sentences (list[str]) = [["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."], ['Be a voice, not an echo.'], ["I'm sorry Dave. I'm afraid I can't do that."], ["This cake is great. It's so delicious and moist."], ['Prior to November 22, 1963.']]
Replaced it with field's default value: []
  self.deserialize(data)


'trump_output.wav'

## XTTS

In [23]:
print(tts.models)

['tts_models/multilingual/multi-dataset/xtts_v2', 'tts_models/multilingual/multi-dataset/xtts_v1.1', 'tts_models/multilingual/multi-dataset/your_tts', 'tts_models/multilingual/multi-dataset/bark', 'tts_models/bg/cv/vits', 'tts_models/cs/cv/vits', 'tts_models/da/cv/vits', 'tts_models/et/cv/vits', 'tts_models/ga/cv/vits', 'tts_models/en/ek1/tacotron2', 'tts_models/en/ljspeech/tacotron2-DDC', 'tts_models/en/ljspeech/tacotron2-DDC_ph', 'tts_models/en/ljspeech/glow-tts', 'tts_models/en/ljspeech/speedy-speech', 'tts_models/en/ljspeech/tacotron2-DCA', 'tts_models/en/ljspeech/vits', 'tts_models/en/ljspeech/vits--neon', 'tts_models/en/ljspeech/fast_pitch', 'tts_models/en/ljspeech/overflow', 'tts_models/en/ljspeech/neural_hmm', 'tts_models/en/vctk/vits', 'tts_models/en/vctk/fast_pitch', 'tts_models/en/sam/tacotron-DDC', 'tts_models/en/blizzard2013/capacitron-t2-c50', 'tts_models/en/blizzard2013/capacitron-t2-c150_v2', 'tts_models/en/multi-dataset/tortoise-v2', 'tts_models/en/jenny/jenny', 'tts_m

In [13]:
from TTS.api import TTS

device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)


In [15]:
tts.tts_to_file(
    sample_text, 
    speaker_wav=["trump_sample.wav"],
    language="en",
    file_path="trump_output_xtts.wav"
)

'trump_output_xtts.wav'

## VITS 

In [19]:
from TTS.api import TTS
device = "cuda" if torch.cuda.is_available() else "cpu"
vits = TTS("tts_models/en/ljspeech/vits").to(device)

In [22]:
vits.tts_to_file(
    sample_text,
    speaker_wav="trump_sample.wav",
    file_path="trump_output_vits.wav"
)

'trump_output_vits.wav'