In [2]:
import os
import json

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

%pip install nltk
%pip install git+https://github.com/suno-ai/bark.git

from IPython.display import Audio
import nltk  # we'll use this to split into sentences
import numpy as np

from bark.generation import (
    generate_text_semantic,
    preload_models,
)
from bark.api import semantic_to_waveform
from bark import generate_audio, SAMPLE_RATE

Note: you may need to restart the kernel to use updated packages.
Collecting git+https://github.com/suno-ai/bark.git
  Cloning https://github.com/suno-ai/bark.git to /tmp/pip-req-build-yobod_vn
  Running command git clone --filter=blob:none --quiet https://github.com/suno-ai/bark.git /tmp/pip-req-build-yobod_vn
  Resolved https://github.com/suno-ai/bark.git to commit 1ad007171e0c46078eb6d3afb6db4daf0c4f41cd
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [3]:
preload_models()

In [4]:
import sys
print("Python version")
print (sys.version)
print("Version info.")
print (sys.version_info)

# Get the path of the active environment
env_path = os.environ['CONDA_PREFIX']

# Get the name of the active environment from the path
env_name = os.path.basename(env_path)

# Print the name of the active environment
print("Active conda environment:", env_name)

# Print the version of Python that is currently running
print("Python version:", sys.version) 


Python version
3.10.9 (main, Apr 26 2023, 19:01:06) [GCC 11.3.0]
Version info.
sys.version_info(major=3, minor=10, micro=9, releaselevel='final', serial=0)


KeyError: 'CONDA_PREFIX'

In [5]:
# Simple Long-Form Generation
# We split longer text into sentences using `nltk` and generate the sentences one by one.

In [6]:
script = """
Player Two is great, actually amazing. But do you know what's greater? 
You.
""".replace("\n", " ").strip()

In [7]:
nltk.download('punkt')
sentences = nltk.sent_tokenize(script)

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
SPEAKER = "v2/en_speaker_1"
silence = np.zeros(int(0.25 * SAMPLE_RATE))  # quarter second of silence
print('sample rate', SAMPLE_RATE)
pieces = []
piecesJSON = []
for sentence in sentences:
    audio_array = generate_audio(sentence, history_prompt=SPEAKER)
    pieces += [audio_array, silence.copy()]
    piecesJSON += [audio_array.tolist(), silence.copy().tolist()]
    print('done', sentence, pieces)
    
with open('readme.txt', 'w') as f:
    json.dump(piecesJSON, f)


In [None]:
Audio(np.concatenate(pieces), rate=SAMPLE_RATE)

# $ \\ $

# Advanced Long-Form Generation
Somtimes Bark will hallucinate a little extra audio at the end of the prompt.
We can solve this issue by lowering the threshold for bark to stop generating text. 
We use the `min_eos_p` kwarg in `generate_text_semantic`

In [8]:
GEN_TEMP = 0.6
SPEAKER = "v2/en_speaker_9"
silence = np.zeros(int(0.25 * SAMPLE_RATE))  # quarter second of silence

pieces = []
for sentence in sentences:
    semantic_tokens = generate_text_semantic(
        sentence,
        history_prompt=SPEAKER,
        temp=GEN_TEMP,
        min_eos_p=0.05,  # this controls how likely the generation is to end
    )

    audio_array = semantic_to_waveform(semantic_tokens, history_prompt=SPEAKER,)
    pieces += [audio_array, silence.copy()]



100%|██████████| 100/100 [00:01<00:00, 54.21it/s] 
100%|██████████| 11/11 [00:05<00:00,  1.84it/s]
100%|██████████| 100/100 [00:01<00:00, 89.59it/s]
100%|██████████| 7/7 [00:03<00:00,  1.95it/s]
100%|██████████| 100/100 [00:00<00:00, 120.51it/s]
100%|██████████| 5/5 [00:02<00:00,  1.89it/s]


In [9]:
Audio(np.concatenate(pieces), rate=SAMPLE_RATE)

# $ \\ $

# Make a Long-Form Dialog with Bark

### Step 1: Format a script and speaker lookup

In [None]:
speaker_lookup = {"Samantha": "v2/en_speaker_9", "John": "v2/en_speaker_2"}

# Script generated by chat GPT
script = """
Samantha: Hey, have you heard about this new text-to-audio model called "Bark"?

John: No, I haven't. What's so special about it?

Samantha: Well, apparently it's the most realistic and natural-sounding text-to-audio model out there right now. People are saying it sounds just like a real person speaking.

John: Wow, that sounds amazing. How does it work?

Samantha: I think it uses advanced machine learning algorithms to analyze and understand the nuances of human speech, and then replicates those nuances in its own speech output.

John: That's pretty impressive. Do you think it could be used for things like audiobooks or podcasts?

Samantha: Definitely! In fact, I heard that some publishers are already starting to use Bark to create audiobooks. And I bet it would be great for podcasts too.

John: I can imagine. It would be like having your own personal voiceover artist.

Samantha: Exactly! I think Bark is going to be a game-changer in the world of text-to-audio technology."""
script = script.strip().split("\n")
script = [s.strip() for s in script if s]
script

### Step 2: Generate the audio for every speaker turn

In [None]:
pieces = []
silence = np.zeros(int(0.5*SAMPLE_RATE))
for line in script:
    speaker, text = line.split(": ")
    audio_array = generate_audio(text, history_prompt=speaker_lookup[speaker], )
    pieces += [audio_array, silence.copy()]

### Step 3: Concatenate all of the audio and play it

In [None]:
Audio(np.concatenate(pieces), rate=SAMPLE_RATE)