<a href="https://colab.research.google.com/github/meridyian/2D-Field-of-View/blob/master/VoiceCloningandTTS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# install required github repos
!pip install git+https://github.com/suno-ai/bark.git
!git clone https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer
!pip install -r ./bark-voice-cloning-HuBERT-quantizer/requirements.txt

In [2]:
import sys
sys.path.append('./bark-voice-cloning-HuBERT-quantizer')
import os
from scipy.io.wavfile import write as write_wav
import numpy as np
import torch
import torchaudio
from bark.api import generate_audio
from bark.generation import SAMPLE_RATE, preload_models, load_codec_model
from encodec.utils import convert_audio
from bark_hubert_quantizer.customtokenizer import CustomTokenizer
from bark_hubert_quantizer.hubert_manager import HuBERTManager
from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert

In [3]:
preload_models(
     text_use_gpu = False,
     text_use_small = False,
     coarse_use_gpu = False,
     coarse_use_small = False,
     fine_use_gpu = False,
     fine_use_small = False,
     codec_use_gpu = False,
     force_reload = False
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = load_codec_model(use_gpu = True if device == 'cuda' else  False)
hubert_manager = HuBERTManager()
hubert_manager.make_sure_hubert_installed()
hubert_manager.make_sure_tokenizer_installed()

hubert_model = CustomHubert(checkpoint_path = 'data/models/hubert/hubert.pt').to(device)
tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth', map_location =device).to(device)

In [4]:

  text_prompt = "ne türkçesi, böyle anlarsınız kızlar, kuyruk onunla böyle merdivende yukardan böyle inmek istiyorum aradım dedim böyle böyle kadrajı böyle böyle görüyorum "
audio_filepath ='/content/Karsu.wav'

if not os.path.isfile(audio_filepath):
  raise ValueError(f'Audio file not exist {audio_filepath}')

wav, sr = torchaudio.load(audio_filepath)
wav = convert_audio(wav, sr, model.sample_rate, model.channels)
wav = wav.to(device)

semantic_vectors = hubert_model.forward(wav, input_sample_hz = model.sample_rate)
semantic_tokens = tokenizer.get_token(semantic_vectors)

with torch.no_grad():
  encoded_frames = model.encode(wav.unsqueeze(0))
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim =-1).squeeze()
codes =codes.cpu().numpy()

semantic_tokens = semantic_tokens.cpu().numpy()
voice_filename = "output.npz"
current_path = '/content/output'
voice_name = os.path.join(current_path, voice_filename)
np.savez(voice_name, fine_prompt =codes, coarse_prompt =codes[:2, :],semantic_prompt = semantic_tokens)
audio_array = generate_audio(text_prompt, history_prompt = voice_name, text_temp  =0.7, waveform_temp = 0.7)
filepath = '/content/output/out.wav'
write_wav(filepath, SAMPLE_RATE, audio_array)

100%|██████████| 698/698 [02:04<00:00,  5.59it/s]
100%|██████████| 35/35 [11:35<00:00, 19.88s/it]


In [5]:
from bark.api import generate_audio
from transformers import BertTokenizer
from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic

text_prompt = "Zeytinleri Güler marketten, etleri Gökhan kasaptan alabilirsiniz"
preload_models(
     text_use_gpu = True,
     text_use_small = False,
     coarse_use_gpu = True,
     coarse_use_small = False,
     fine_use_gpu = True,
     fine_use_small = False,
     codec_use_gpu = True,
     force_reload = False
)

In [6]:
audio_array = generate_audio(text_prompt, history_prompt = voice_name, text_temp = 0.7, waveform_temp = 0.7)

x_semantic = generate_text_semantic(text_prompt,
                                    history_prompt = voice_name,
                                    temp = 0.7,
                                    top_k = 50,
                                    top_p = 0.95,
                                    )
x_coarse_gen = generate_coarse(x_semantic,
                                    history_prompt = voice_name,
                                    temp = 0.7,
                                    top_k = 50,
                                    top_p = 0.95,
                                    )
x_fine_gen = generate_fine(x_coarse_gen,
                                    history_prompt = voice_name,
                                    temp = 0.5,
                                    )
audio_array = codec_decode(x_fine_gen)

100%|██████████| 644/644 [00:08<00:00, 71.58it/s]
100%|██████████| 33/33 [00:32<00:00,  1.01it/s]
100%|██████████| 274/274 [00:27<00:00, 10.14it/s]
100%|██████████| 14/14 [02:54<00:00, 12.45s/it]


In [7]:
from IPython.display import Audio
Audio(audio_array, rate = SAMPLE_RATE)

In [19]:
from scipy.io.wavfile import write as write_wav
filepath = "/content/output/audio.wav"
write_wav(filepath, SAMPLE_RATE, audio_array)