In [2]:
import os
import random
import numpy as np
import librosa
import librosa.display
import soundfile as sf

from pydub import AudioSegment, silence
from dotenv import load_dotenv
from pathlib import Path

# Load environment variables
load_dotenv()

def load_wav(input_file, return_file=True):
    """
    Converts an audio file to a .wav file, if needed. Stores the output wav
    file with the same name and directory as the input file.

    Parameters
    ----------
    input_file : str
        Path to the input audio file.
    output_file : str
        Path to store the output .wav file.
    """

    input_file = 'argh.mp3'

    # Get the file extension of the input file
    file_path, file_type = input_file.split('.')[0], \
        input_file.split('.')[-1]

     # Load the audio file
    file_nonwav = AudioSegment.from_file(input_file, format=file_type)
    # Convert the input file to a .wav file
    file_wav = file_nonwav.export(
        Path(f"{file_path}.wav"),
        format="wav"
    )

    if return_file:
        return file_wav


def overlay_audio(audio1, audio2, SNR=None, overlay_dir=None):
    """
    Overlay audio2 over audio1 at a specified SNR,
    that is dB, either same, quieter or louder.
    Both audio files must be in the same format.

    Parameters
    ----------
    audio1 : str
        Path to the first audio file.
    audio2 : str
        Path to the second audio file. The file that will be overlayed.
    SNR : int
        Signal-to-Noise Ratio in dB. Default is None.
        Positive integers will make audio2 louder, negative integers will
        make it quieter than audio1.

    Returns
    -------
    file_handle : AudioSegment
        AudioSegment object of the overlayed audio.
    """

    # Load the audio files
    sound1 = load_wav(audio1, return_file=True)
    sound2 = load_wav(audio2, return_file=True)

    # Check if SNR is specified
    if SNR is not None:

        # Check if SNR is an integer
        if not isinstance(SNR, int):
            raise ValueError("SNR must be an integer.")
        else:
            # Make audio2 louder or quieter
            sound2 = sound2 + 6
            # set SNR
            SNR_overlay = SNR
    else:
        # Default SNR is 0
        SNR_overlay = 0

    # Overlay sound2 over sound1 at the specified position
    overlay = sound1.overlay(sound2, position=0)

    # Save the overlayed audio
    if overlay_dir is not None:

        # Set the names of the audio file
        audio1_name = audio1.split('/')[-1].split('.')[0]
        audio2_name = audio2.split('/')[-1].split('.')[0]
        overlay_name = (f"{audio1_name}_{audio2_name}_"
                        f"{SNR_overlay}.wav")

        # Export the overlayed audio
        overlay.export(overlay_dir + overlay_name, format="wav")

    return overlay


def extract_audio_excerpts(audio_file, output_path, clip_length_sec):
    sound = AudioSegment.from_file(audio_file)
    duration_ms = len(sound)
    clip_length_ms = int(clip_length_sec * 1000)

    os.makedirs(output_path, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(audio_file))[0]
    clips = []
    print(clips)

    # Extract all possible non-overlapping clips
    for i, start_ms in enumerate(range(0, duration_ms - clip_length_ms + 1, clip_length_ms)):
        end_ms = start_ms + clip_length_ms
        clip = sound[start_ms:end_ms]
        file_name = f"{base_name}_{i:04d}_{start_ms}_{end_ms}.wav"
        output_file = os.path.join(output_path, file_name)
        clip.export(output_file, format="wav")
        clips.append(output_file)

    print(clips)
    return clips

extract_audio_excerpts(
    audio_file="bus09.wav",
    output_path="./",
    clip_length_sec=6
)

[]
['./bus09_0000_0_6000.wav', './bus09_0001_6000_12000.wav', './bus09_0002_12000_18000.wav', './bus09_0003_18000_24000.wav', './bus09_0004_24000_30000.wav']


['./bus09_0000_0_6000.wav',
 './bus09_0001_6000_12000.wav',
 './bus09_0002_12000_18000.wav',
 './bus09_0003_18000_24000.wav',
 './bus09_0004_24000_30000.wav']

In [None]:
import os
import numpy as np
import librosa
import glob
import soundfile as sf
import tempfile
from audio_rep_functions import overlay_audio, extract_audio_excerpts
import matplotlib.pyplot as plt

# --- PARAMETERS ---
target_sr = 16000
duration_sec = 6
n_fft = 1024
win_length = 1024
hop_length = 320
n_mels = 128
f_min = 0
f_max = 8000
SNR = 15
target_len = duration_sec * target_sr


def normalize_rms(y, target_rms=0.1):
    rms = np.sqrt(np.mean(y**2))
    if rms == 0:
        return y
    return y * (target_rms / rms)


def fix_length(y, target_len):
    if len(y) > target_len:
        return y[:target_len]
    else:
        return np.pad(y, (0, target_len - len(y)), mode='constant')


def compute_log_mel_spectrogram(audio_path, noise_path=None, visualize=False):
    # Load and resample audio
    y, sr = librosa.load(audio_path, sr=target_sr)
    if len(y) > target_len:
        y = y[:target_len]
    else:
        y = np.pad(y, (0, max(0, target_len - len(y))), mode='constant')

    # Overlay environmental noise if provided
    if noise_path:
        y_noisy = overlay_audio(audio_path, noise_path, SNR=SNR)
        # Convert to float32 if needed
        y_noisy = np.array(y_noisy.get_array_of_samples(), dtype=np.float32) / 32768.0
    else:
        y_noisy = y

    # Compute log mel spectrogram
    S = librosa.feature.melspectrogram(
        y=y_noisy,
        sr=target_sr,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mels=n_mels,
        fmin=f_min,
        fmax=f_max
    )
    log_S = librosa.power_to_db(S, ref=np.max)

    # Normalize
    mean = np.mean(log_S)
    std = np.std(log_S)
    log_S_norm = (log_S - mean) / std

    # Optional visualization
    if visualize:
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(log_S_norm, sr=target_sr, hop_length=hop_length, x_axis='time', y_axis='mel', fmax=fmax)
        plt.title('Normalized Log Mel Spectrogram')
        plt.colorbar(format='%+2.0f dB')
        plt.tight_layout()
        plt.show()

    return log_S_norm

# '/home/melan/supervised-vs-SSL/data/fma_data/fma_small/**/*.mp3'
main_files = glob.glob('/home/melan/supervised-vs-SSL/speech_data/*.mp3', recursive=True)
noise_files = glob.glob('/home/melan/supervised-vs-SSL/data/noise_data/scenes_stereo/*.wav', recursive=True)
output_dir = '/home/melan/supervised-vs-SSL/data/preprocessed/noisy_music/'
os.makedirs(output_dir, exist_ok=True)

noise_idx = 0

for main_file in main_files:
    # Extract 6s clips from main file
    main_clips = extract_audio_excerpts(main_file, '/tmp/main_clips/', clip_length_sec=6)
    for clip_path in main_clips:
        # Load and normalize main clip
        y_main, sr_main = librosa.load(clip_path, sr=target_sr)
        y_main_norm = y_main / (np.sqrt(np.mean(y_main**2)) + 1e-8) * 0.1
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_main:
            sf.write(tmp_main.name, y_main_norm, sr_main)
            main_norm_path = tmp_main.name

        # Pick noise file (loop if needed)
        noise_file = noise_files[noise_idx % len(noise_files)]
        noise_idx += 1

        # Load and normalize first 6s of noise
        y_noise, sr_noise = librosa.load(noise_file, sr=target_sr)
        y_noise = y_noise[:target_len] if len(y_noise) >= target_len else np.pad(y_noise, (0, target_len - len(y_noise)), mode='constant')
        y_noise_norm = y_noise / (np.sqrt(np.mean(y_noise**2)) + 1e-8) * 0.1
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_noise:
            sf.write(tmp_noise.name, y_noise_norm, sr_noise)
            noise_norm_path = tmp_noise.name

        # Overlay and save
        overlay_audio(
            audio1=main_norm_path,
            audio2=noise_norm_path,
            SNR=SNR,
            overlay_dir=output_dir
        )

        # Clean up temp files
        os.remove(main_norm_path)
        os.remove(noise_norm_path)

In [1]:
import torch

print(f"Is GPU available: {torch.cuda.is_available()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
print(f"GPU name: {torch.cuda.get_device_name(0)}")

Is GPU available: True
Number of GPUs: 1
GPU name: NVIDIA GeForce GTX 1060 3GB
