In [1]:
import sys
sys.path.append('..')
from audiocraft.models import MusicGen
from utils.utils import get_scenes
from utils import utils
from audiocraft.data.audio import audio_write

In [2]:
model = MusicGen.get_pretrained('medium')

In [3]:
utils.SCRIPT = "outputs/silvio/scriptwriter_out.txt"
script = utils.get_script()
script

'[Scene 1] - "A simple, somber room with a portrait of Silvio Berlusconi on the wall. The camera slowly zooms in on the portrait."\n\nNARRATOR: "Silvio Berlusconi, the four-time Prime Minister of Italy, despite his numerous scandals, has died at the age of 86. He passed away at the San Raffaele hospital in Milan. His health had been deteriorating, and he was suffering from a rare form of blood cancer. However, the precise cause of his death remains unconfirmed."\n\n[Scene 2] - "A montage of clips showcasing Berlusconi\'s political career and business achievements. Images of newspaper headlines detailing his scandals are interspersed."\n\nNARRATOR: "Berlusconi was the longest-serving Prime Minister in post-war Italy, from 1994 to 2011. His reign was not without controversy - sex scandals, corruption cases, and more. Yet before his political career, he was a successful businessman, owning television networks, publishing companies, and even saving the legendary football club, AC Milan, fr

In [4]:
# Use GPT To generate prompts the script
from dotenv import load_dotenv
load_dotenv()
from langchain import OpenAI


In [5]:
llm = OpenAI(temperature=0.7)

In [8]:
response = llm(f"""
Given the following script,
your task is to analyze the general mood of the script and create appropriate 
background music for it.
You should generate one description per scene in the script.

Respond in the following format (JSON array) and one prompt per scene.

[
    "Upbeat rock guitar solo, with modern electronic music elements",
    "Slow classical cello playing with city scape background",
    ...
]


Here is the script, make sure you respond in valid JSON, and only respond with the music descriptions:
{script}
""")
response

'\n[\n    "Soft piano playing with a somber undertone",\n    "A reflective instrumental piece with tense strings and a subtle beat",\n    "Mournful strings playing alongside a slow drum beat",\n    "Energetic orchestral piece with bright brass and a strong beat",\n    "A solemn piano playing with a reflective melody"\n]'

In [9]:
import json
music_prompts = json.loads(response)
music_prompts

['Soft piano playing with a somber undertone',
 'A reflective instrumental piece with tense strings and a subtle beat',
 'Mournful strings playing alongside a slow drum beat',
 'Energetic orchestral piece with bright brass and a strong beat',
 'A solemn piano playing with a reflective melody']

In [13]:
import sys
sys.path.append('..')
from audiocraft.models import MusicGen
from utils.utils import get_scenes
from utils import utils
utils.SCRIPT = "outputs/silvio/scriptwriter_out.txt"
utils.VOICEOVER_WAV_FILE="outputs/silvio/voiceover_out.wav"
utils.VOICEOVER_TIMECODES="outputs/silvio/voiceover_timecodes.txt"

In [14]:
scenes = get_scenes()

In [15]:
scenes

[Scene(start_time=0.0, duration=29.0, scene_title='[Scene 1', description='- "A simple, somber room with a portrait of Silvio Berlusconi on the wall. The camera slowly zooms in on the portrait."', content='NARRATOR: "Silvio Berlusconi, the four-time Prime Minister of Italy, despite his numerous scandals, has died at the age of 86. He passed away at the San Raffaele hospital in Milan. His health had been deteriorating, and he was suffering from a rare form of blood cancer. However, the precise cause of his death remains unconfirmed."\n\n'),
 Scene(start_time=29.0, duration=34.0, scene_title='[Scene 2', description='- "A montage of clips showcasing Berlusconi\'s political career and business achievements. Images of newspaper headlines detailing his scandals are interspersed."', content='NARRATOR: "Berlusconi was the longest-serving Prime Minister in post-war Italy, from 1994 to 2011. His reign was not without controversy - sex scandals, corruption cases, and more. Yet before his politica

In [17]:
scenes_outputs = []
for i, (scene, mp) in enumerate(zip(scenes, music_prompts)):
    print(f"Generating music for scene: {scene.scene_title} / Duration: {scene.duration}")
    model.set_generation_params(
        use_sampling=True,
        top_k=250,
        duration=min(30, scene.duration)
    )    
    output = model.generate(
        descriptions=[
            mp,
        ],
        progress=True
    )
    filename = f"music-{i}.wav"
    audio_write(filename,
               output[0].to("cpu"),
               model.sample_rate,
               strategy="loudness",
               loudness_headroom_db=16,
               add_suffix=False)
    
    


Generating music for scene: [Scene 1 / Duration: 29.0
Generating music for scene: [Scene 2 / Duration: 34.0
  1503 /   1503

CLIPPING music-1.wav happening with proba (a bit of clipping is okay): 0.0009791667107492685 maximum scale:  2.3683416843414307


Generating music for scene: [Scene 3 / Duration: 18.0
   903 /    903

CLIPPING music-2.wav happening with proba (a bit of clipping is okay): 1.0416666555101983e-05 maximum scale:  1.0105066299438477


Generating music for scene: [Scene 4 / Duration: 19.0
Generating music for scene: [Scene 5 / Duration: 48.0


CLIPPING music-3.wav happening with proba (a bit of clipping is okay): 3.2894736250455026e-06 maximum scale:  1.0211342573165894


  1503 /   1503

In [12]:
from audiocraft.utils.notebook import display_audio

display_audio(scenes_outputs[0], sample_rate=32000)


## The stuff below will be used in a Sound Engineer Agent I think

In [18]:
from pydub import AudioSegment

def pad_audio_with_fade(input_file, output_file, fade_duration=2000, silence_duration=0):
    # Load the wave file
    audio = AudioSegment.from_wav(input_file)

    # Create a fade in segment
    fade_segment = audio[:fade_duration].fade_in(fade_duration)

    # Create a silence segment with the specified duration
    silence_segment = AudioSegment.silent(duration=silence_duration)

    # Concatenate the fade in segment with the silence segment and the original audio
    padded_audio = fade_segment + silence_segment + audio

    # Export the padded audio to a new wave file
    padded_audio.export(output_file, format="wav")

In [19]:
from audiocraft.data.audio import audio_write
music_files = []
for i, s in enumerate(scenes):
    filename = f"music-{i}.wav"
    if s.duration > 30:
        pad_audio_with_fade(filename, filename, s.duration - 30)
    music_files.append(filename)

In [16]:
import subprocess

def combine_music_with_crossfade(music_paths, output_path, crossfade_duration=3):
    """
    Combine multiple music clips into a single file with crossfade.

    :param music_paths: List of paths to the music clips.
    :param output_path: Path to the output file.
    :param crossfade_duration: Duration of the crossfade in seconds (default is 3 seconds).
    """

    # Basic input string
    input_str = ""
    
    # Building the input string
    for music_path in music_paths:
        input_str += f" -i {music_path}"

    # Building filter_complex string
    filter_complex_str = ""
    for i in range(1, len(music_paths)):
        if i == 1:
            filter_complex_str += f"[0:a][{i}:a]acrossfade=d={crossfade_duration}:c1=tri:c2=squ[ac{i}];"
        else:
            filter_complex_str += f"[ac{i-1}][{i}:a]acrossfade=d={crossfade_duration}:c1=tri:c2=squ[ac{i}];"
    
    # Removing last semicolon
    filter_complex_str = filter_complex_str.rstrip(';')

    # Building the final FFmpeg command
    cmd = f"ffmpeg {input_str} -filter_complex \"{filter_complex_str}\" -map \"[ac{len(music_paths) - 1}]\" {output_path}"
    print(cmd)
    # Run the FFmpeg command
    subprocess.run(cmd, shell=True)

In [31]:
from audiocraft.data.audio import audio_write
music_files = []
for i, s in enumerate(scenes):
    filename = f"music-{i}.wav"
    if s.duration > 30:
        pad_audio_with_fade(filename, filename, s.duration - 30)
    music_files.append(filename)

In [32]:
music_files

['music-0.wav', 'music-1.wav', 'music-2.wav', 'music-3.wav', 'music-4.wav']

In [33]:
combine_music_with_crossfade(music_files, "background-music.wav")

ffmpeg  -i music-0.wav -i music-1.wav -i music-2.wav -i music-3.wav -i music-4.wav -filter_complex "[0:a][1:a]acrossfade=d=3:c1=tri:c2=squ[ac1];[ac1][2:a]acrossfade=d=3:c1=tri:c2=squ[ac2];[ac2][3:a]acrossfade=d=3:c1=tri:c2=squ[ac3];[ac3][4:a]acrossfade=d=3:c1=tri:c2=squ[ac4]" -map "[ac4]" background-music.wav


ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [34]:
from pydub import AudioSegment

def duck(voiceover_path, music_path, output_path, duck_dB=-10.0, threshold=-40, chunk_duration=100, fade_duration=500, hold_duration=500):
    """
    Perform audio ducking: lower the volume of the background music during speech segments in the voiceover.

    :param voiceover_path: Path to the voiceover audio file.
    :param music_path: Path to the background music file.
    :param output_path: Path where the output file should be saved.
    :param duck_dB: The level to duck the background music in dB (default is -10.0 dB).
    :param threshold: Threshold for detecting speech in dB (default is -40 dB).
    :param chunk_duration: Chunk duration in milliseconds used for speech detection (default is 100 ms).
    :param fade_duration: Duration in milliseconds for fade-in and fade-out (default is 500 ms).
    :param hold_duration: Duration in milliseconds to keep the ducking after speech ends (default is 500 ms).
    """

    # Load the audio files
    voiceover = AudioSegment.from_file(voiceover_path)
    background_music = AudioSegment.from_file(music_path)

    # Analyze voiceover to find segments of speech
    num_chunks = len(voiceover) // chunk_duration
    is_speech = [voiceover[i * chunk_duration : (i + 1) * chunk_duration].dBFS > threshold for i in range(num_chunks)]

    # Duck the background music during speech
    output = AudioSegment.empty()
    last_chunk_was_speech = False
    hold_counter = 0
    for i, speech in enumerate(is_speech):
        chunk_start = i * chunk_duration
        chunk_end = (i + 1) * chunk_duration
        chunk = background_music[chunk_start : chunk_end]

        if speech:
            ducked_chunk = chunk + duck_dB
            if not last_chunk_was_speech:
                ducked_chunk = ducked_chunk.fade_in(fade_duration)
            output += ducked_chunk
            hold_counter = hold_duration // chunk_duration
        else:
            if last_chunk_was_speech or hold_counter > 0:
                output += (chunk + duck_dB).fade_out(fade_duration) if hold_counter == 0 else chunk + duck_dB
                hold_counter = max(0, hold_counter - 1)
            else:
                output += chunk

        last_chunk_was_speech = speech

    # Combine the voiceover and background music
    combined = voiceover.overlay(output)

    # Save the final output
    combined.export(output_path, format="wav")

In [35]:
duck(utils.VOICEOVER_WAV_FILE, "background-music.wav", "output-audio.wav",duck_dB=-20, threshold=-40,chunk_duration=1000, hold_duration=1500)