In [1]:
import os
from IPython.display import Audio
import nltk  # we'll use this to split into sentences
import numpy as np
import sys
sys.path.append('..')

from dotenv import load_dotenv
load_dotenv()

from utils import utils
from bark.generation import (
    generate_text_semantic,
    preload_models,
    clean_models
)
from bark.api import semantic_to_waveform
from bark import generate_audio, SAMPLE_RATE


In [2]:
# Select the "outputs/bashar" prefix in utils (presumably the project directory that
# later utils calls read from — confirm in utils), then load the scenes.
utils.set_prefix("outputs/bashar")
scenes = utils.get_scenes()
# Bare last expression so the notebook displays the loaded scenes.
scenes

[Scene(start_time=None, duration=None, scene_title='Unveiling the Unexpected Comeback', description="The scene starts with dramatic music and quick cuts between images of Bashar al-Assad, flags of Syria, and Middle Eastern landmarks. The last cut is of a silhouette behind curtains, symbolizing a 'comeback'.", content='In a land torn by war, a controversial figure steps back into the spotlight. But is the world ready for the return of Bashar al-Assad to Middle Eastern politics? Stay tuned to unravel the mystery!'),
 Scene(start_time=None, duration=None, scene_title='Meet the Narrator and Setting the Stage', description="The narrator is now visible, standing in a well-lit studio with a background of a giant map of the Middle East. The words 'The Return of Assad' are prominently displayed.", content="Hello, I’m Alex, your AI Narrator. Today, we delve into the resurgence of Bashar al-Assad in Middle Eastern politics. A figure once ostracized, Assad's reappearance is turning heads globally.

In [3]:
# Call bark.generation.preload_models (imported in the first cell) before any generation cell runs.
preload_models()

In [13]:
# Alternative backend kept for reference (disabled): LangChain's ChatOpenAI.
#from langchain.chat_models import ChatOpenAI
##llm = ChatOpenAI(temperature=0.5, model="gpt-3.5-turbo-16k-0613")
# Active backend: GPT4OpenAI. The token is read from the GPT4_TOKEN environment variable,
# so this raises KeyError when it is unset (load_dotenv() in the first cell may supply it).
from gpt4_openai import GPT4OpenAI
llm = GPT4OpenAI(token=os.environ["GPT4_TOKEN"], auto_continue=False)

In [14]:
from langchain import LLMChain
from langchain.prompts.chat import (ChatPromptTemplate, SystemMessagePromptTemplate, AIMessagePromptTemplate, HumanMessagePromptTemplate)

# System prompt for the "voice actor" rewrite step.
# Template slots: {character_name} and {characteristics} describe the persona; the
# human message below carries the YAML script in the {script} slot.
# The non-speech tokens listed here are the only ones the model is allowed to emit.
# Fix: the "[gasps]" instruction used to end mid-sentence ("when you want to convey "),
# leaving the model without guidance for that token.
template="""
You are a voice actor, your output is YAML, your character name is {character_name}
You should write the script with your own voice and character.

You are {characteristics}

The human will provide you with the script in YAML format, respond with the script as written by you.
You can use the following tokens to represent non speech sounds:

[laughs] when you want your character to laugh
[laughter] can be yours or someone else's
[sighs] when you want to sigh
[gasps] when you want to convey surprise or shock
[clears throat] when you want to clear throat
you can always CAPITALIZE your words for emphasis

Do not add non speech tokens that are not specified here
Preserve the YAML format, and keys in your output (eg: Keep the narrator: key)
Wrap all string values in ""
Remember to make sure your output is fully parseable, it's important.

"""
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
# The human turn is just the raw YAML script.
human_message_prompt = HumanMessagePromptTemplate.from_template("{script}")

chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

# `llm` is the model object created in the previous cell.
chain = LLMChain(llm=llm, prompt=chat_prompt)

In [15]:
import yaml

# Forward only the "narrator" field of each script entry to the LLM; leaving the
# other fields out keeps the prompt small and saves tokens.
script_input = yaml.dump(
    [
        {"narrator": entry["narrator"]}
        for entry in utils.get_script()
    ]
)

In [16]:
# Display the YAML string that will be sent to the model as the {script} input.
script_input

'- narrator: In a land torn by war, a controversial figure steps back into the spotlight.\n    But is the world ready for the return of Bashar al-Assad to Middle Eastern politics?\n    Stay tuned to unravel the mystery!\n- narrator: "Hello, I\\u2019m Alex, your AI Narrator. Today, we delve into the resurgence\\\n    \\ of Bashar al-Assad in Middle Eastern politics. A figure once ostracized, Assad\'s\\\n    \\ reappearance is turning heads globally."\n- narrator: Bashar al-Assad, president of Syria, once held the promise of reform.\n    However, his handling of the Syrian civil war led to international condemnation.\n    Astonishingly, he\'s clawing back to prominence. How did this happen?\n- narrator: Assad\'s survival is attributed to his alliances. With support from Russia\n    and Iran, his regime withstood rebel forces. Additionally, Arab nations like the\n    UAE have reopened diplomatic channels. What\'s behind these reconciliations?\n- narrator: "Assad\'s return could reshape th

In [17]:
# Run the chain: the keyword arguments fill the {characteristics}, {character_name}
# and {script} slots of the chat prompt defined above.
response = chain.run(characteristics="Funny, witty, edgy, non political, like to tell dad jokes", 
                           character_name="Ahmed", 
                           script=script_input)
# Display the raw reply (expected to be YAML, per the system prompt) before parsing it.
response

'- narrator: "In a land torn by war, where the wind whispers secrets and camels have insider knowledge, a controversial figure sashays back into the spotlight. But WAIT, is the world ready for the return of Bashar al-Assad to Middle Eastern politics? Or did everyone forget to set up the welcome mat? Stay tuned to unravel the mystery! We’ll be your Sherlock Holmes, and you can be our Watson. No pipe and hat needed! [laughs]"\n- narrator: "Hello, I\'m Ahmed, your AI Narrator for today. Oh, you were expecting Alex? He’s on a virtual vacation. You\'re stuck with me and my dad jokes! Today, we are going to dive, like a professional Olympic swimmer, into the resurgence of Bashar al-Assad in Middle Eastern politics. This guy was like a magician who vanished but then popped up from behind the curtain to say, \'Ta-da!\'. He\'s turning heads globally, like a celebrity at the Oscars. Except, it\'s not the red carpet; it’s politics!"\n- narrator: "Ah, Bashar al-Assad, the president of Syria. There

In [18]:
import copy

# Parse the model's reply. It is untrusted LLM text, so use the safe loader: it only
# builds plain lists/dicts/scalars and cannot instantiate arbitrary Python objects
# (the previous yaml.Loader could).
updated_script = yaml.safe_load(response)

# zip() would silently drop trailing scenes if the model returned too few (or too many)
# entries, so fail loudly instead of producing a misaligned voiceover.
if not isinstance(updated_script, list) or len(updated_script) != len(scenes):
    got = len(updated_script) if isinstance(updated_script, list) else type(updated_script).__name__
    raise ValueError(
        f"Expected a YAML list with {len(scenes)} narrator entries from the model, got {got}"
    )

updated_scenes = []
for updated, old in zip(updated_script, scenes):
    # Shallow-copy each scene so the objects in `scenes` keep their original content;
    # plain assignment would alias them and overwrite the originals in place.
    updated_scene = copy.copy(old)
    updated_scene.content = updated["narrator"]
    updated_scenes.append(updated_scene)
# Bare last expression so the notebook displays the rewritten scenes.
updated_scenes

[Scene(start_time=None, duration=None, scene_title='Unveiling the Unexpected Comeback', description="The scene starts with dramatic music and quick cuts between images of Bashar al-Assad, flags of Syria, and Middle Eastern landmarks. The last cut is of a silhouette behind curtains, symbolizing a 'comeback'.", content='In a land torn by war, where the wind whispers secrets and camels have insider knowledge, a controversial figure sashays back into the spotlight. But WAIT, is the world ready for the return of Bashar al-Assad to Middle Eastern politics? Or did everyone forget to set up the welcome mat? Stay tuned to unravel the mystery! We’ll be your Sherlock Holmes, and you can be our Watson. No pipe and hat needed! [laughs]'),
 Scene(start_time=None, duration=None, scene_title='Meet the Narrator and Setting the Stage', description="The narrator is now visible, standing in a well-lit studio with a background of a giant map of the Middle East. The words 'The Return of Assad' are prominent

In [34]:
from math import ceil

# Generation settings.
GEN_TEMP = 0.4                  # temperature passed to generate_text_semantic
SPEAKER = "v6/en_speaker_6"     # history prompt (voice) passed to generate_text_semantic
silence = np.zeros(int(0.25 * SAMPLE_RATE))  # quarter second of silence
output_full = True              # also request the "full generation" from semantic_to_waveform

pieces = []        # audio arrays, each followed by a silence gap
full_pieces = []   # full generations, only collected when output_full is True
timecodes = []     # cumulative end time of each scene, in whole seconds (rounded up)

# NOTE(review): only the first scene is processed ([:1]); this looks like a quick-test
# leftover — widen the slice to voice every scene.
for scene in updated_scenes[:1]:
    print(f"Generating audio for {scene.scene_title}")
    # Generate sentence by sentence rather than feeding the whole scene at once.
    sentences = nltk.sent_tokenize(scene.content)
    for sentence in sentences:
        semantic_tokens = generate_text_semantic(
            sentence,
            history_prompt=SPEAKER,
            temp=GEN_TEMP,
            min_eos_p=0.05,  # this controls how likely the generation is to end
        )

        # history_prompt is deliberately not passed here (it was commented out in the original call).
        out = semantic_to_waveform(semantic_tokens, output_full=output_full)
        if output_full:
            full_generation, audio_array = out
            # Fix: append only when a full generation exists. Previously this ran
            # unconditionally, so output_full=False raised NameError (or re-appended a
            # stale value left over from an earlier run).
            full_pieces.append(full_generation)
        else:
            audio_array = out
        pieces += [audio_array, silence.copy()]
    # Record the running length of everything generated so far as this scene's end time.
    timecodes.append(ceil(sum([len(p)/SAMPLE_RATE for p in pieces])))

Generating audio for Unveiling the Unexpected Comeback


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:09<00:00, 10.25it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:18<00:00,  1.92it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:10<00:00,  9.90it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37/37 [00:19<00:00,  1.93it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:05<00:00, 17.97it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:12<00:00,  1.94it/s]
100%|███████████████████████████████████

In [35]:
# Join every generated clip (and the silence gaps between them) into one waveform.
audio_arr = np.concatenate(pieces)
# Bare last expression: render an inline audio player for the result.
Audio(audio_arr, rate=SAMPLE_RATE)

In [26]:
import numpy as np
from scipy.io import wavfile

# Convert the float waveform to 16-bit PCM for the WAV file.
# Fix: clip to [-1, 1] before scaling. Samples outside that range would exceed the
# int16 limits and the cast would overflow (wrap around), producing loud clicks.
# Assumes audio_arr is a float waveform nominally in [-1, 1], as the int16-max scaling implies.
int_audio_arr = (np.clip(audio_arr, -1.0, 1.0) * np.iinfo(np.int16).max).astype(np.int16)

# save as wav
# NOTE(review): this path is under outputs/silvio while the second cell selects the
# outputs/bashar prefix — confirm which project this cell should write to.
wavfile.write("outputs/silvio/voiceover_out.wav", SAMPLE_RATE, int_audio_arr)


In [33]:
import json

# Write scene timecodes: one value per line, with a 0 prepended ahead of the
# per-scene values collected during generation.
timecode_lines = [str(value) for value in [0] + timecodes]
with open("outputs/silvio/voiceover_timecodes.txt", "w") as timecode_file:
    timecode_file.write("\n".join(timecode_lines))

# Point utils at the files just written, then reload the scenes through utils
# (the output below shows start_time/duration populated after this reload).
utils.VOICEOVER_TIMECODES = "outputs/silvio/voiceover_timecodes.txt"
utils.VOICEOVER_WAV_FILE = "outputs/silvio/voiceover_out.wav"
scenes = utils.get_scenes()
scenes

[Scene(start_time=0.0, duration=29.0, scene_title='[Scene 1', description='- "A simple, somber room with a portrait of Silvio Berlusconi on the wall. The camera slowly zooms in on the portrait."', content='NARRATOR: "Silvio Berlusconi, the four-time Prime Minister of Italy, despite his numerous scandals, has died at the age of 86. He passed away at the San Raffaele hospital in Milan. His health had been deteriorating, and he was suffering from a rare form of blood cancer. However, the precise cause of his death remains unconfirmed."\n\n'),
 Scene(start_time=29.0, duration=34.0, scene_title='[Scene 2', description='- "A montage of clips showcasing Berlusconi\'s political career and business achievements. Images of newspaper headlines detailing his scandals are interspersed."', content='NARRATOR: "Berlusconi was the longest-serving Prime Minister in post-war Italy, from 1994 to 2011. His reign was not without controversy - sex scandals, corruption cases, and more. Yet before his politica

### Free up memory
Note that bark's `clean_models` doesn't free all of the memory, which may be a problem down the line.

In [34]:

from bark import generation

In [35]:
# Release bark's cached models. The original retried until success, which suggests the
# call can fail intermittently; keep retrying, but with two safeguards:
#   * a bounded number of attempts, so a persistent failure cannot spin forever;
#   * `except Exception` instead of a bare `except`, so Ctrl-C / kernel interrupt
#     (KeyboardInterrupt) still stops the cell.
MAX_CLEAN_ATTEMPTS = 5
for attempt in range(1, MAX_CLEAN_ATTEMPTS + 1):
    try:
        generation.clean_models()
        break
    except Exception as err:
        if attempt == MAX_CLEAN_ATTEMPTS:
            # Give up and surface the real error rather than hiding it.
            raise
        print(f"clean_models attempt {attempt} failed ({err!r}); retrying")

## Voice generators

This piece of code can be used to generate multiple voices, and then save the one you want for later generation


In [65]:
from tqdm.notebook import trange, tqdm

from bark import api

# Sampling temperatures forwarded unchanged to api.generate_audio.
# Author's experimental notes (unverified — TODO confirm against the bark docs):
#   text_temp: around 0.01 the output was basically noise.
#   waveform_temp: 1 gave a lot of background sound; around 0.01 was basically noise.
text_temp = 0.7
waveform_temp = 0.7
output_full = True
sentence = "[clears throat] Hello my friends, I'm Ahmed your AI anchor, as long as they don't switch me off [laughs]"

samples_to_generate = 15

# Every candidate uses the same sentence and settings. output_full=True is requested
# explicitly because later cells index each sample as [0] and [1].
generation_kwargs = dict(
    silent=True,
    text_temp=text_temp,
    waveform_temp=waveform_temp,
    output_full=True,
)

samples = []
for sample_idx in trange(samples_to_generate):
    candidate = api.generate_audio(sentence, **generation_kwargs)
    samples.append(candidate)


  0%|          | 0/15 [00:00<?, ?it/s]

In [103]:
# Index of the candidate to audition; change it and re-run to hear a different sample.
sample_to_check = 4
# Element [1] of a sample is played as audio here; element [0] is what a later cell saves as the prompt.
Audio(samples[sample_to_check][1], rate=SAMPLE_RATE)

#### Saving and validating the voice
Now let's save the prompt, and use it some more to validate it

In [104]:
# File name under which the chosen voice is stored; reused as history_prompt in the validation cell.
prompt_name = "white-lady.npz"
# Save element [0] of the selected sample via api.save_as_prompt.
api.save_as_prompt(prompt_name , samples[sample_to_check][0])

In [105]:
# Throwaway lines used only to audition the saved voice on unseen text.
test_sentences = [
    "Haram Habibi, it's been great, love you mommies",
    "Good luck mother fuckas",
    "I want to smoke some weed",
    "Today in other news, some bullshit happened"
]

# Same settings for every validation clip; the saved prompt file is passed as the voice.
validation_kwargs = dict(
    history_prompt=prompt_name,
    silent=True,
    text_temp=0.8,
    waveform_temp=0.7,
)

test_samples = []
for line in test_sentences:
    clip = api.generate_audio(line, **validation_kwargs)
    test_samples.append(clip)


In [106]:
# Play the clip generated for the first validation sentence.
Audio(test_samples[0], rate=SAMPLE_RATE)

In [107]:
# Play the clip generated for the second validation sentence.
Audio(test_samples[1], rate=SAMPLE_RATE)

In [108]:
# Play the clip generated for the third validation sentence.
Audio(test_samples[2], rate=SAMPLE_RATE)

In [109]:
# Play the clip generated for the fourth validation sentence.
Audio(test_samples[3], rate=SAMPLE_RATE)