In [None]:
from bark.api import generate_audio
from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic

In [None]:
import os

# Root folder that holds the fine-tuned checkpoints. The default is the
# original hard-coded location; set the BARK_MODEL_ROOT environment variable
# to run the notebook against another checkout without editing this cell.
model_root = os.environ.get("BARK_MODEL_ROOT", "E:/Python/bark-with-voice-clone").rstrip("/\\")

# One checkpoint file per model stage; these are handed to preload_models().
semantic_path = f"{model_root}/semantic_output/pytorch_model.bin"
coarse_path = f"{model_root}/coarse_output/pytorch_model.bin"
fine_path = f"{model_root}/fine_output/pytorch_model.bin"

In [None]:
# Options for loading every model stage up front. Each of the text, coarse
# and fine stages gets its fine-tuned checkpoint, requests the GPU and asks
# for the non-small variant.
preload_options = {
    # checkpoints
    "text_model_path": semantic_path,
    "coarse_model_path": coarse_path,
    "fine_model_path": fine_path,
    # device selection
    "text_use_gpu": True,
    "coarse_use_gpu": True,
    "fine_use_gpu": True,
    "codec_use_gpu": True,
    # model size
    "text_use_small": False,
    "coarse_use_small": False,
    "fine_use_small": False,
    # loading behaviour
    "force_reload": False,
    "path": "models",  # NOTE(review): presumably the model cache folder - confirm
}
preload_models(**preload_options)

In [None]:
# Simple generation: one call to the high-level API, no history prompt.
text_prompt = "I am Joe Biden... and this is the finetuned semantic, coarse and fine model! [laughs] A lot better than the original!"
audio_array = generate_audio(
    text_prompt,
    history_prompt=None,
    text_temp=0.7,
    waveform_temp=0.7,
)

In [None]:
from IPython.display import Audio

# Play audio: the player is the cell's final expression, so the notebook
# renders it as the cell output.
Audio(data=audio_array, rate=SAMPLE_RATE)

In [None]:
from pathlib import Path
from scipy.io.wavfile import write as write_wav

# save audio
filepath = "output/audio.wav" # change this to your desired output path
# Create the destination folder first: the WAV writer opens the file directly
# and fails with FileNotFoundError when "output/" does not exist yet.
Path(filepath).parent.mkdir(parents=True, exist_ok=True)
write_wav(filepath, SAMPLE_RATE, audio_array)

In [None]:
def generate_with_settings(
    text_prompt,
    semantic_temp=0.7,
    semantic_top_k=50,
    semantic_top_p=0.95,
    coarse_temp=0.7,
    coarse_top_k=50,
    coarse_top_p=0.95,
    fine_temp=0.5,
    voice_name=None,
    use_semantic_history_prompt=True,
    use_coarse_history_prompt=True,
    use_fine_history_prompt=True,
    output_full=False,
):
    """Generate audio stage by stage, with sampling settings for each stage.

    Runs text -> semantic -> coarse -> fine generation, then decodes the fine
    output with ``codec_decode``.

    Args:
        text_prompt: Text passed to the semantic stage.
        semantic_temp, semantic_top_k, semantic_top_p: Forwarded to
            ``generate_text_semantic`` as ``temp``, ``top_k`` and ``top_p``.
        coarse_temp, coarse_top_k, coarse_top_p: Forwarded to
            ``generate_coarse`` as ``temp``, ``top_k`` and ``top_p``.
        fine_temp: Forwarded to ``generate_fine`` as ``temp``.
        voice_name: History prompt offered to every stage; ``None`` means no
            stage receives one.
        use_semantic_history_prompt, use_coarse_history_prompt,
        use_fine_history_prompt: When falsy, the matching stage receives
            ``history_prompt=None`` instead of ``voice_name``.
        output_full: When truthy, also return the intermediate stage outputs.

    Returns:
        The decoded audio, or ``(full_generation, audio)`` when
        ``output_full`` is truthy. ``full_generation`` is a dict with the keys
        ``'semantic_prompt'``, ``'coarse_prompt'`` and ``'fine_prompt'``.
    """

    def _history_for(enabled):
        # A stage only sees the voice when its flag is switched on.
        return voice_name if enabled else None

    semantic_tokens = generate_text_semantic(
        text_prompt,
        history_prompt=_history_for(use_semantic_history_prompt),
        temp=semantic_temp,
        top_k=semantic_top_k,
        top_p=semantic_top_p,
    )
    coarse_tokens = generate_coarse(
        semantic_tokens,
        history_prompt=_history_for(use_coarse_history_prompt),
        temp=coarse_temp,
        top_k=coarse_top_k,
        top_p=coarse_top_p,
    )
    fine_tokens = generate_fine(
        coarse_tokens,
        history_prompt=_history_for(use_fine_history_prompt),
        temp=fine_temp,
    )

    audio = codec_decode(fine_tokens)
    if not output_full:
        return audio

    stage_outputs = {
        'semantic_prompt': semantic_tokens,
        'coarse_prompt': coarse_tokens,
        'fine_prompt': fine_tokens,
    }
    return stage_outputs, audio

In [None]:
# Prompt for the staged pipeline.
text_prompt = "I am Joe Biden... and this is the finetuned semantic, coarse and fine model! [laughs] A lot better than the original!"

# Per-stage sampling settings, kept in one place so they are easy to tweak.
# With voice_name=None the use_*_history_prompt flags have no effect: every
# stage receives history_prompt=None either way.
generation_settings = {
    "semantic_temp": 0.7,
    "semantic_top_k": 50,
    "semantic_top_p": 0.99,
    "coarse_temp": 0.7,
    "coarse_top_k": 50,
    "coarse_top_p": 0.99,
    "fine_temp": 0.5,
    "voice_name": None,
    "use_semantic_history_prompt": True,
    "use_coarse_history_prompt": True,
    "use_fine_history_prompt": True,
    "output_full": False,
}
audio_array = generate_with_settings(text_prompt, **generation_settings)

from IPython.display import Audio

# Play audio: the player is the cell's final expression, so the notebook
# renders it as the cell output.
Audio(data=audio_array, rate=SAMPLE_RATE)

In [None]:
from pathlib import Path
from scipy.io.wavfile import write as write_wav

# save audio
# NOTE: this is the same path the earlier save cell uses, so running both
# cells leaves only this later result on disk unless the path is changed.
filepath = "output/audio.wav" # change this to your desired output path
# Create the destination folder first: the WAV writer opens the file directly
# and fails with FileNotFoundError when "output/" does not exist yet.
Path(filepath).parent.mkdir(parents=True, exist_ok=True)
write_wav(filepath, SAMPLE_RATE, audio_array)