In [1]:
import sys
sys.path.append('..')
from audiocraft.models import MusicGen
from utils.utils import get_scenes
from utils import utils


In [2]:
# Using small model, better results would be obtained with `medium` or `large`.
model = MusicGen.get_pretrained('large')

In [52]:
utils.SCRIPT = "outputs/ted/scriptwriter_out.txt"
script = utils.get_script()
script

'[Scene 1] - "Aerial shot of the federal prison medical center in Butner, North Carolina. The sun is setting, casting long shadows across the prison complex. The mood is somber."\n\nNARRATOR: "Theodore \'Ted\' Kaczynski, the man infamously known as the Unabomber, passed away today in his cell at the federal prison medical center in Butner, North Carolina. He was 81 years old. The cause of his death, as of now, remains unclear."\n\n[Scene 2] - "Cut to archival footage of Kaczynski’s Montana shack. The camera zooms in on the remnants of the solitary building, surrounded by wilderness."\n\nNARRATOR: "Kaczynski, a Harvard-educated mathematician, chose seclusion in a remote Montana shack from where he orchestrated a 17-year bombing campaign that terrorized the nation. The bombings, which spanned from 1978 to 1995, resulted in the death of three people and injuries to 23 others."\n\n[Scene 3] - "Transition to scenes of Americans mailing packages and boarding airplanes, hinting at the substan

In [53]:
# Use GPT To generate prompts the script
from dotenv import load_dotenv
load_dotenv()
from langchain import OpenAI


In [54]:
llm = OpenAI(temperature=0.7)

In [55]:
llm(f"""
What is this script about:
{script}
""")

'\nThis script is about the life of Theodore Kaczynski, commonly known as the Unabomber. It recounts his bombing campaign and the impact of his manifesto on society. It also delves into his personal life, exploring the motives behind his actions. The script ends with a reflection on the consequences of his life, and the power of a single individual to shape the world.'

In [63]:
response = llm(f"""
Given the following script,
your task is to analyze the general mood of the script and create appropriate 
background music for it.
You should generate one description per scene in the script.

Respond in the following format (JSON array) and one prompt per scene.

[
    "Upbeat rock guitar solo, with modern electronic music elements",
    "Slow classical cello playing with city scape background",
    ...
]


Here is the script, make sure you respond in valid JSON, and only respond with the music descriptions:
{script}
""")
response

'\n["Soft piano playing with layers of ambient sound", \n"Haunting strings and a deep drum beat that builds throughout the scene", \n"Slow, somber string orchestra with a quiet drone in the background", \n"Mournful woodwinds with sparse electronic elements", \n"Suspenseful music with pulsing drums and eerie synths", \n"Dark, distorted electric guitar solo with a brooding bassline", \n"Atmospheric strings and synths with a distant choir in the background"]'

In [64]:
import json
music_prompts = json.loads(response)
music_prompts

['Soft piano playing with layers of ambient sound',
 'Haunting strings and a deep drum beat that builds throughout the scene',
 'Slow, somber string orchestra with a quiet drone in the background',
 'Mournful woodwinds with sparse electronic elements',
 'Suspenseful music with pulsing drums and eerie synths',
 'Dark, distorted electric guitar solo with a brooding bassline',
 'Atmospheric strings and synths with a distant choir in the background']

In [None]:
# Regenerate music prompts if dumb AI didn't generate things properly
if len(music_prompts) < len(scenes):
    pass


In [47]:
import sys
sys.path.append('..')
from audiocraft.models import MusicGen
from utils.utils import get_scenes
from utils import utils
utils.SCRIPT = "outputs/ted/scriptwriter_out.txt"
utils.VOICEOVER_TIMECODES = "outputs/ted/script_timecodes.txt"
utils.VOICEOVER_WAV_FILE = "outputs/ted/script.wav"

In [48]:
scenes = get_scenes()

In [49]:
scenes

[Scene(start_time=None, duration=None, scene_title='[Scene 1', description='- "A simple, somber room with a portrait of Silvio Berlusconi on the wall. The camera slowly zooms in on the portrait."', content='NARRATOR: "Silvio Berlusconi, the four-time Prime Minister of Italy, despite his numerous scandals, has died at the age of 86. He passed away at the San Raffaele hospital in Milan. His health had been deteriorating, and he was suffering from a rare form of blood cancer. However, the precise cause of his death remains unconfirmed."\n\n'),
 Scene(start_time=None, duration=None, scene_title='[Scene 2', description='- "A montage of clips showcasing Berlusconi\'s political career and business achievements. Images of newspaper headlines detailing his scandals are interspersed."', content='NARRATOR: "Berlusconi was the longest-serving Prime Minister in post-war Italy, from 1994 to 2011. His reign was not without controversy - sex scandals, corruption cases, and more. Yet before his politic

In [51]:
scenes_outputs = []
for scene, mp in zip(scenes, music_prompts):
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=scene.duration
    )    
    scenes_outputs.append(model.generate(
        descriptions=[
            mp,
        ],
        progress=True
    ))


TypeError: '<=' not supported between instances of 'NoneType' and 'int'

In [None]:
from audiocraft.utils.notebook import display_audio

display_audio(scenes_outputs[0], sample_rate=32000)


In [None]:
import subprocess

def combine_music_with_crossfade(music_paths, output_path, crossfade_duration=3):
    """
    Combine multiple music clips into a single file with crossfade.

    :param music_paths: List of paths to the music clips.
    :param output_path: Path to the output file.
    :param crossfade_duration: Duration of the crossfade in seconds (default is 3 seconds).
    """

    # Basic input string
    input_str = ""
    
    # Building the input string
    for music_path in music_paths:
        input_str += f" -i {music_path}"

    # Building filter_complex string
    filter_complex_str = ""
    for i in range(1, len(music_paths)):
        if i == 1:
            filter_complex_str += f"[0:a][{i}:a]acrossfade=d={crossfade_duration}:c1=tri:c2=squ[ac{i}];"
        else:
            filter_complex_str += f"[ac{i-1}][{i}:a]acrossfade=d={crossfade_duration}:c1=tri:c2=squ[ac{i}];"
    
    # Removing last semicolon
    filter_complex_str = filter_complex_str.rstrip(';')

    # Building the final FFmpeg command
    cmd = f"ffmpeg {input_str} -filter_complex \"{filter_complex_str}\" -map \"[ac{len(music_paths) - 1}]\" {output_path}"

    # Run the FFmpeg command
    subprocess.run(cmd, shell=True)

In [None]:
from audiocraft.data.audio import audio_write
music_files = []
for i, s in enumerate(scenes_outputs):
    filename = f"music-{i}.wav"
    music_files.append(filename)
    audio_write(
            filename, s[0].to("cpu"), model.sample_rate, strategy="loudness",
            loudness_headroom_db=16, add_suffix=False)

In [None]:
combine_music_with_crossfade(music_files, "background-music.wav")

In [None]:
from pydub import AudioSegment

def duck(voiceover_path, music_path, output_path, duck_dB=-10.0, threshold=-40, chunk_duration=100, fade_duration=500, hold_duration=500):
    """
    Perform audio ducking: lower the volume of the background music during speech segments in the voiceover.

    :param voiceover_path: Path to the voiceover audio file.
    :param music_path: Path to the background music file.
    :param output_path: Path where the output file should be saved.
    :param duck_dB: The level to duck the background music in dB (default is -10.0 dB).
    :param threshold: Threshold for detecting speech in dB (default is -40 dB).
    :param chunk_duration: Chunk duration in milliseconds used for speech detection (default is 100 ms).
    :param fade_duration: Duration in milliseconds for fade-in and fade-out (default is 500 ms).
    :param hold_duration: Duration in milliseconds to keep the ducking after speech ends (default is 500 ms).
    """

    # Load the audio files
    voiceover = AudioSegment.from_file(voiceover_path)
    background_music = AudioSegment.from_file(music_path)

    # Analyze voiceover to find segments of speech
    num_chunks = len(voiceover) // chunk_duration
    is_speech = [voiceover[i * chunk_duration : (i + 1) * chunk_duration].dBFS > threshold for i in range(num_chunks)]

    # Duck the background music during speech
    output = AudioSegment.empty()
    last_chunk_was_speech = False
    hold_counter = 0
    for i, speech in enumerate(is_speech):
        chunk_start = i * chunk_duration
        chunk_end = (i + 1) * chunk_duration
        chunk = background_music[chunk_start : chunk_end]

        if speech:
            ducked_chunk = chunk + duck_dB
            if not last_chunk_was_speech:
                ducked_chunk = ducked_chunk.fade_in(fade_duration)
            output += ducked_chunk
            hold_counter = hold_duration // chunk_duration
        else:
            if last_chunk_was_speech or hold_counter > 0:
                output += (chunk + duck_dB).fade_out(fade_duration) if hold_counter == 0 else chunk + duck_dB
                hold_counter = max(0, hold_counter - 1)
            else:
                output += chunk

        last_chunk_was_speech = speech

    # Combine the voiceover and background music
    combined = voiceover.overlay(output)

    # Save the final output
    combined.export(output_path, format="wav")

In [None]:
duck("ted/script.wav", "background-music.wav", "output-audio.wav",chunk_duration=1000, hold_duration=1500)