In [None]:
!pip install pydub librosa 

In [None]:
!pip install openunmix soundfile

In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [4]:
from pydub import AudioSegment
import librosa

In [None]:
# Sanity check that the ffmpeg binary is reachable from the notebook's shell.
# NOTE(review): pydub presumably needs ffmpeg to decode/encode the mp3/mp4 files used below - confirm.
!ffmpeg -version 

In [6]:
def calculate_bpm(file_path):
    """
    Calculate the BPM of an audio file using librosa.

    Args:
    file_path (str): Path to the audio file.

    Returns:
    float: Estimated BPM of the audio, always as a plain Python float.
    """
    # Imported locally so this cell does not rely on a later cell's imports.
    import numpy as np

    # sr=None asks librosa to keep the file's own sampling rate.
    y, sr = librosa.load(file_path, sr=None)

    # Onset-strength envelope that the beat tracker works from.
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)

    # Estimate the tempo; the beat positions are not needed here.
    tempo, _ = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)

    # Depending on the librosa version, tempo is either a scalar or a
    # one-element NumPy array. Collapse both cases to a plain float so the
    # value can be used directly in arithmetic such as target_bpm / original_bpm.
    return float(np.asarray(tempo).reshape(-1)[0])

In [7]:
def change_bpm(audio, original_bpm, target_bpm):
    """
    Changes the BPM of an audio segment by adjusting its playback speed.

    Speeding up (target_bpm > original_bpm) uses pydub's ``speedup`` effect,
    which shortens the audio by removing small chunks, so the pitch is
    largely preserved. ``speedup`` cannot slow audio down, so slowing down
    (target_bpm < original_bpm) is done by reinterpreting the samples at a
    lower frame rate, which also lowers the pitch.

    Args:
    audio (AudioSegment): The audio segment to adjust.
    original_bpm (float): The original BPM of the audio. Must be > 0.
    target_bpm (float): The target BPM to achieve. Must be > 0.

    Returns:
    AudioSegment: The modified audio segment with new BPM. The input segment
    itself is returned when both BPM values are equal.

    Raises:
    ValueError: If either BPM value is not strictly positive.
    """
    if original_bpm <= 0 or target_bpm <= 0:
        raise ValueError(
            f"BPM values must be positive (original_bpm={original_bpm}, target_bpm={target_bpm})"
        )

    ratio = target_bpm / original_bpm

    # Nothing to do; also avoids calling speedup() with a factor of 1.0.
    if ratio == 1:
        return audio

    if ratio > 1:
        return audio.speedup(playback_speed=ratio)

    # Slow down: keep the raw samples but declare a lower frame rate, then
    # resample back to the original rate so the result stays compatible with
    # other segments.
    slowed_rate = max(1, int(audio.frame_rate * ratio))
    slowed = audio._spawn(audio.raw_data, overrides={"frame_rate": slowed_rate})
    return slowed.set_frame_rate(audio.frame_rate)

In [8]:
import torch
import librosa
import soundfile as sf
from openunmix import predict
import numpy as np

def separate_vocals(input_file, output_dir):
    """
    Split an audio file into stems with Open-Unmix and save each stem as MP3.

    One file named ``<source>.mp3`` is written into output_dir for every
    source returned by ``predict.separate``.

    Args:
    input_file (str): Path to the audio file to separate.
    output_dir (str): Directory the stems are written to. It is created if
        it does not exist yet.

    Returns:
    None
    """
    # Imported locally so this cell does not rely on a later cell's imports.
    import os

    # Load the audio file at 44.1 kHz, keeping the channels separate
    audio, rate = librosa.load(input_file, sr=44100, mono=False)

    # If the audio is mono, duplicate the single channel to get two channels
    if audio.ndim == 1:
        audio = np.repeat(audio.reshape(1, -1), 2, axis=0)

    # Separate the audio; audio[None] adds a leading axis in front of the
    # channel axis (presumably the batch axis expected by openunmix).
    estimates = predict.separate(torch.tensor(audio[None]), rate=rate)

    # sf.write does not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)

    # Save the separated tracks
    for source, estimate in estimates.items():
        # Drop the leading axis, move the data to a CPU NumPy array and
        # transpose it, since soundfile expects an array rather than a tensor.
        stem = estimate[0].detach().cpu().numpy().T
        sf.write(f"{output_dir}/{source}.mp3", stem, rate)

    print(f"Separated tracks saved in {output_dir}")

In [9]:
def overlay(trackList: list):
    """
    Layer several MP3 files on top of each other.

    Args:
    trackList (list): Paths of the MP3 files to combine. The first entry is
        the base layer; every following entry is overlaid onto it in order.

    Returns:
    AudioSegment: The base track with all remaining tracks overlaid.
    """
    def load_mp3(path):
        # Every input is decoded as MP3.
        return AudioSegment.from_file(path, "mp3")

    mixed = load_mp3(trackList[0])

    for layer_path in trackList[1:]:
        layer = load_mp3(layer_path)
        mixed = mixed.overlay(layer)

    return mixed

In [10]:
import os 

def mix(track1, track2, track1_start, track1_duration, track2_mix_entry, track2_start, track2_duration, overlap_time, offset):
    """
    Mixes two audio tracks together with a vocal-over-instrumental transition.

    The result is ``rampup + mix_audio + rampdown``:
      * rampup    - track1 on its own, from track1_start up to the start of
                    the overlap.
      * mix_audio - track1's vocal stem laid over track2's instrumental stems
                    (bass + drums + other) for overlap_time; the instrumental
                    is preceded by ``offset`` of silence.
      * rampdown  - track2 on its own, track2_duration long, from track2_start.

    All positions and durations are used directly as pydub slice indices /
    silence durations, so they are in MILLISECONDS, not seconds.

    Stems are produced with separate_vocals() and cached in the directories
    "t1" and "t2" relative to the working directory.
    NOTE: the cache is keyed only by directory name, not by input file, so
    stale stems are reused when track1/track2 change. Delete "t1" and "t2"
    when switching songs.

    Args:
    track1 (str): Path to the first audio track.
    track2 (str): Path to the second audio track.
    track1_start (float): Start position of the snippet in the first track (ms).
    track1_duration (float): Length of the snippet from the first track (ms).
        Includes the part used in the overlap.
    track2_mix_entry (float): Position in the second track's instrumental at
        which the overlap section starts (ms).
    track2_start (float): Start position of the closing snippet in the second
        track (ms).
    track2_duration (float): Length of the closing snippet from the second
        track (ms).
    overlap_time (float): Length of the overlap section (ms).
    offset (float): Silence inserted before the second track's instrumental
        inside the overlap section (ms).

    Returns:
    AudioSegment: The mixed audio segment.

    Raises:
    ValueError: If not ``0 <= offset <= overlap_time <= track1_duration``;
        such values would produce negative or inverted slices.
    """
    if not 0 <= offset <= overlap_time <= track1_duration:
        raise ValueError(
            "expected 0 <= offset <= overlap_time <= track1_duration, got "
            f"offset={offset}, overlap_time={overlap_time}, track1_duration={track1_duration}"
        )

    # No explicit format: pydub/ffmpeg then detects the container itself.
    # Forcing "wav" here fails for non-WAV inputs such as .mp4 files.
    audio1 = AudioSegment.from_file(track1)
    audio2 = AudioSegment.from_file(track2)

    overlap_end = track1_start + track1_duration
    overlap_begin = overlap_end - overlap_time

    rampup = audio1[track1_start: overlap_begin]

    # Only the vocal stem of track1 is needed; separate it once and reuse it.
    if not os.path.exists("t1/vocals.mp3"):
        separate_vocals(track1, "t1")

    # Track2 contributes its instrumental stems, so check for exactly those
    # files rather than for its vocal stem.
    t2_stems = ['t2/bass.mp3', 't2/drums.mp3', 't2/other.mp3']
    if not all(os.path.exists(stem) for stem in t2_stems):
        separate_vocals(track2, "t2")

    t1_vocals = AudioSegment.from_file("t1/vocals.mp3", "mp3")
    t2_music = overlay(t2_stems)

    rampdown = audio2[track2_start: track2_start + track2_duration]

    # Overlap section: silence + instrumental add up to overlap_time, the
    # same length as the vocal slice laid on top.
    mix_vocals = t1_vocals[overlap_begin: overlap_end]
    mix_music = AudioSegment.silent(duration=offset) + t2_music[track2_mix_entry: track2_mix_entry + overlap_time - offset]
    mix_audio = mix_music.overlay(mix_vocals)

    combined_mix = rampup + mix_audio + rampdown

    return combined_mix

In [None]:
# TODO: Change these values to match the input audio files. Might need some finetuning to get right.
# The numeric values below are passed straight to mix(), which uses them as
# AudioSegment slice indices / silence duration. pydub measures those in
# milliseconds, so e.g. 2200 means 2.2 s.
track1_file = "baby.mp4"   # first song: heard alone first, then only its vocals during the overlap
track2_file = "dk.mp4"     # second song: its instrumental is used during the overlap, then it plays alone
track1_start = 2200        # where the snippet of the first song starts
track1_duration = 20100    # snippet length of the first song, including the overlap part
track2_mix_entry = 6100    # position in the second song's instrumental used at the start of the overlap
track2_start = 20800       # where the closing snippet of the second song starts
track2_duration = 8500     # length of the closing snippet of the second song
overlap_time = 15450       # length of the vocals-over-instrumental section
offset = 800               # silence placed before the second song's instrumental inside the overlap

# Build the transition and write it next to the notebook as an MP3 file.
final_mix = mix(track1_file, track2_file, track1_start, track1_duration, track2_mix_entry, track2_start, track2_duration, overlap_time, offset)
final_mix.export("mix.mp3", format="mp3")