In [3]:
from bark.generation import load_codec_model, generate_text_semantic
from encodec.utils import convert_audio

import torchaudio
import torch

device = 'cpu' # or 'cpu'
model = load_codec_model(use_gpu=True if device == 'cuda' else False)

In [4]:
# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
from hubert.hubert_manager import HuBERTManager
hubert_manager = HuBERTManager()
hubert_manager.make_sure_hubert_installed()
hubert_manager.make_sure_tokenizer_installed()

'data\\models\\hubert\\tokenizer.pth'

In [5]:
# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer 
# Load HuBERT for semantic tokens
from hubert.pre_kmeans_hubert import CustomHubert
from hubert.customtokenizer import CustomTokenizer

# Load the HuBERT model
hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt').to(device)

# Load the CustomTokenizer model
tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth').to(device)  # Automatically uses the right layers

In [6]:
# Load and pre-process the audio waveform
audio_filepath = 'output.wav' # the audio you want to clone (under 13 seconds)
wav, sr = torchaudio.load(audio_filepath)
wav = convert_audio(wav, sr, model.sample_rate, model.channels)
wav = wav.to(device)

In [7]:
semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)
semantic_tokens = tokenizer.get_token(semantic_vectors)

In [8]:
# Extract discrete codes from EnCodec
with torch.no_grad():
    encoded_frames = model.encode(wav.unsqueeze(0))
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()  # [n_q, T]

In [9]:
# move codes to cpu
codes = codes.cpu().numpy()
# move semantic tokens to cpu
semantic_tokens = semantic_tokens.cpu().numpy()

In [10]:
import numpy as np
voice_name = 'output' # whatever you want the name of the voice to be
output_path = 'bark/assets/prompts/' + voice_name + '.npz'
np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)

In [11]:
# That's it! Now you can head over to the generate.ipynb and use your voice_name for the 'history_prompt'

In [12]:
# Heres the generation stuff copy-pasted for convenience

In [13]:
from bark.api import generate_audio
from transformers import BertTokenizer
from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic

# Enter your prompt and speaker here
text_prompt = "Hi there, everyone! Today we are going to talk about why you should think twice before eating burgers. Burgers might seem yummy and fun to eat, but they can be really bad for your health. Let's find out why..First, burgers are often made with lots of fatty meats. These fats, called saturated fats, can make your heart unhealthy. When you eat too many burgers, you can start to have problems with your heart. It’s like putting bad fuel into a car—the car won't run well and it might even break down. Your heart is the same way; it needs good fuel, like fruits and veggies, to work properly..Second, burgers usually come with lots of extra stuff that isn't good for you. Think about the cheese, bacon, and sauces that are often added. These toppings can have a lot of salt. Eating too much salt can make your blood pressure go up, which is not good for your body. It's like adding too much air to a balloon; one day, it might just pop..Another thing to consider is that burgers can make you gain weight. They usually have a lot of calories because of the fatty meat and other ingredients. Eating too many calories can add extra weight to your body, making it harder to run and play. This can also lead to other serious health issues like diabetes..It’s also important to remember that many burgers are made quickly and might not be fresh. They can have harmful bacteria that make you sick. You don’t want to end up spending all your fun time in bed with a stomach ache!.Instead of burgers, try eating foods that are good for you and taste great too. Make a homemade sandwich with fresh veggies, lean meat, and whole grain bread. Your body will thank you for it!.So, next time you think about grabbing a burger, remember these tips and make a better choice for your health. Stay happy and stay healthy!"
voice_name = "output" # use your custom voice name here if you have one

In [12]:
# Devide text prompts into short sentences.

text_prompts = text_prompt.split('.')
text_prompts



['Hi there, everyone! Today we are going to talk about why you should think twice before eating burgers',
 ' Burgers might seem yummy and fun to eat, but they can be really bad for your health',
 " Let's find out why",
 '',
 'First, burgers are often made with lots of fatty meats',
 ' These fats, called saturated fats, can make your heart unhealthy',
 ' When you eat too many burgers, you can start to have problems with your heart',
 " It’s like putting bad fuel into a car—the car won't run well and it might even break down",
 ' Your heart is the same way; it needs good fuel, like fruits and veggies, to work properly',
 '',
 "Second, burgers usually come with lots of extra stuff that isn't good for you",
 ' Think about the cheese, bacon, and sauces that are often added',
 ' These toppings can have a lot of salt',
 ' Eating too much salt can make your blood pressure go up, which is not good for your body',
 " It's like adding too much air to a balloon; one day, it might just pop",
 '',
 

In [15]:
# download and load all models
preload_models(
    text_use_gpu=True,
    text_use_small=False,
    coarse_use_gpu=True,
    coarse_use_small=False,
    fine_use_gpu=True,
    fine_use_small=False,
    codec_use_gpu=True,
    force_reload=False,
    path="models"
)

In [14]:
# simple generation
# audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)
# from IPython.display import Audio
# # play audio
# Audio(audio_array, rate=SAMPLE_RATE)

# save audio


100%|██████████| 100/100 [00:53<00:00,  1.85it/s]
100%|██████████| 35/35 [05:57<00:00, 10.21s/it]


In [15]:
from scipy.io.wavfile import write as write_wav
audios = []
for i, text in enumerate(text_prompts):
    text = text + '.'
    audio = generate_audio(text, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)
    audios.append(audio)
    write_wav('audio' + str(i) + '.wav', SAMPLE_RATE, audio)

100%|██████████| 100/100 [00:34<00:00,  2.91it/s]
100%|██████████| 24/24 [04:00<00:00, 10.04s/it]
100%|██████████| 100/100 [00:23<00:00,  4.23it/s]
100%|██████████| 17/17 [02:49<00:00,  9.96s/it]
100%|██████████| 100/100 [00:11<00:00,  8.42it/s]
100%|██████████| 9/9 [01:28<00:00,  9.80s/it]
100%|██████████| 100/100 [00:04<00:00, 21.11it/s]
100%|██████████| 3/3 [00:27<00:00,  9.25s/it]
100%|██████████| 100/100 [00:31<00:00,  3.14it/s]
100%|██████████| 23/23 [03:52<00:00, 10.09s/it]
100%|██████████| 100/100 [00:54<00:00,  1.85it/s]
100%|██████████| 35/35 [06:05<00:00, 10.43s/it]
100%|██████████| 100/100 [00:16<00:00,  6.24it/s]
100%|██████████| 12/12 [01:57<00:00,  9.80s/it]
100%|██████████| 100/100 [00:39<00:00,  2.53it/s]
100%|██████████| 28/28 [05:05<00:00, 10.91s/it]
100%|██████████| 100/100 [00:35<00:00,  2.84it/s]
100%|██████████| 25/25 [04:08<00:00,  9.93s/it]
100%|██████████| 100/100 [00:05<00:00, 17.24it/s]
100%|██████████| 3/3 [00:30<00:00, 10.13s/it]
100%|██████████| 100/100 [

In [None]:
# generation with more control
x_semantic = generate_text_semantic(
    text_prompt,
    history_prompt=voice_name,
    temp=0.7,
    top_k=50,
    top_p=0.95,
)

x_coarse_gen = generate_coarse(
    x_semantic,
    history_prompt=voice_name,
    temp=0.7,
    top_k=50,
    top_p=0.95,
)
x_fine_gen = generate_fine(
    x_coarse_gen,
    history_prompt=voice_name,
    temp=0.5,
)
audio_array = codec_decode(x_fine_gen)

100%|██████████| 100/100 [43:17<00:00, 25.97s/it]
 11%|█▏        | 4/35 [23:06<2:59:18, 347.04s/it]

In [None]:
from IPython.display import Audio
# play audio
Audio(audio_array, rate=SAMPLE_RATE)

In [None]:
from scipy.io.wavfile import write as write_wav
# save audio
filepath = "/output/res.wav" # change this to your desired output path
write_wav(filepath, SAMPLE_RATE, audio_array)