# In [11]:
import librosa
import torchaudio
import numpy as np
import random
# Audio configuration.
# Target sampling rate in Hz; both files are resampled to it when loaded.
sampling_rate = 16000
# Number of samples in the mixed clip: 32000 samples, i.e. 2 seconds at 16 kHz.
frame_size = 2 * sampling_rate

# Input recordings from the MUSAN corpus directory layout.
speech_audio_path = "data/musan/musan/speech/librivox/speech-librivox-0004.wav"
music_audio_path = "data/musan/musan/music/fma/music-fma-0006.wav"
def change_volume(audio_signal, sr, change_db):
    """Scale an audio signal by a gain expressed in decibels.

    Args:
        audio_signal: Samples to scale; any object that supports
            multiplication by a float (e.g. a NumPy array).
        sr: Sampling rate. Accepted for call compatibility but not used.
        change_db: Gain in dB; positive values make the signal louder,
            negative values make it quieter.

    Returns:
        ``audio_signal`` multiplied by ``10 ** (change_db / 20)``.
    """
    # Amplitude (as opposed to power) changes by a factor of 10 per 20 dB.
    gain = 10 ** (change_db / 20)
    scaled = audio_signal * gain
    return scaled

def normalize_audio(audio_signal):
    """Peak-normalize a signal so its largest absolute sample is 1.0.

    Args:
        audio_signal: Array-like of audio samples.

    Returns:
        A new NumPy array equal to ``audio_signal`` divided by its peak
        absolute value. An empty or all-zero signal has no peak to scale
        by and is returned as an unscaled copy.
    """
    signal = np.asarray(audio_signal)
    # np.max raises on an empty array, so there is nothing to normalize.
    if signal.size == 0:
        return signal.copy()

    peak = np.max(np.abs(signal))
    # A silent signal would otherwise be divided by zero and turn into NaNs.
    if peak == 0:
        return signal.copy()

    return signal / peak
# Load the music and speech files as mono signals resampled to `sampling_rate`.
music, sr_music = librosa.load(music_audio_path, sr=sampling_rate, mono=True)
speech, sr_speech = librosa.load(speech_audio_path, sr=sampling_rate, mono=True)

# Peak-normalize BEFORE applying the random gain. Normalizing afterwards
# divides the gain straight back out, so the level variation would be lost.
music = normalize_audio(music)
speech = normalize_audio(speech)

# Randomly choose a decibel change between -5 dB and 5 dB for variation
db_change_music = random.uniform(-5, 5)
db_change_speech = random.uniform(-5, 5)

# Change the volume of music and speech
music = change_volume(music, sr_music, db_change_music)
speech = change_volume(speech, sr_speech, db_change_speech)

# Zero-pad whichever signal is shorter than one frame. Without padding the
# music, random.randint below would be given a negative upper bound and raise.
if len(music) < frame_size:
    music = np.pad(music, (0, frame_size - len(music)), 'constant')
if len(speech) < frame_size:
    speech = np.pad(speech, (0, frame_size - len(speech)), 'constant')

# Select a random frame from the music file
start_frame = random.randint(0, len(music) - frame_size)
music_frame = music[start_frame:start_frame + frame_size]

# Mix the music frame with the start of the speech signal
mixed_signal = (music_frame + speech[:frame_size]) / 2

# A positive gain can push the mix above full scale; rescale only in that
# case so the relative speech/music levels are kept but the peak stays at 1.0.
mix_peak = np.max(np.abs(mixed_signal))
if mix_peak > 1.0:
    mixed_signal = mixed_signal / mix_peak

# Both inputs were loaded with mono=True, so mixed_signal is always 1-D and
# no stereo-to-mono downmix step is needed here.

import soundfile as sf
# Save the mixed signal to a new audio file
sf.write('mixed_audio_file2.wav', mixed_signal, sampling_rate)