In [None]:
!pip install datasets tqdm huggingface_hub

In [None]:
from datasets import load_dataset, concatenate_datasets
import torch
import torchaudio
import numpy as np
import torchaudio.transforms as T
import random
import IPython.display as ipd
from scipy import signal
import os
from huggingface_hub import HfApi
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Audio, display
from huggingface_hub import login, upload_folder
from datasets.features import Audio  # Use this import

In [None]:
# Mount Google Drive under /content/drive so files under it are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder; every relative path used in
# later cells (e.g. "data/...", "../data/...") is resolved against this directory.
%cd /content/drive/MyDrive/cs566

## Load Background Sounds

In [None]:
import os
import urllib.request
import zipfile

# Download and extraction path. It has to match the location the metadata-loading
# cell reads from ("data/esc50/ESC-50-master"); otherwise a fresh run downloads the
# archive into one directory and then looks for it in another.
download_path = 'data/esc50'
zip_path = os.path.join(download_path, 'esc50.zip')

# Marker file used to decide whether the archive has already been extracted
extracted_meta = os.path.join(download_path, 'ESC-50-master', 'meta', 'esc50.csv')

# Create directory for the dataset
os.makedirs(download_path, exist_ok=True)

if os.path.isfile(extracted_meta):
    # A previous run already extracted the dataset: avoid downloading it again
    print(f"ESC-50 dataset already present in {download_path}; skipping download")
else:
    # Download the zip file
    urllib.request.urlretrieve(
        'https://github.com/karolpiczak/ESC-50/archive/master.zip',
        zip_path
    )

    # Extract the contents to the same directory
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(download_path)

    print(f"ESC-50 dataset downloaded and extracted to {download_path}")

In [None]:
# Location of the extracted ESC-50 dataset, relative to the current working directory.
# audio_path and metadata are module-level names that later cells rely on.
base_path = "data/esc50/ESC-50-master"
audio_path = os.path.join(base_path, 'audio')
meta_path = os.path.join(base_path, 'meta', 'esc50.csv')

# Load the metadata table (read below via its 'category' column, and via 'filename'
# when a background clip is loaded)
metadata = pd.read_csv(meta_path)
# Show the first rows and the distinct category names so the class lists chosen in
# the next cell can be checked against what the file actually contains
print(metadata.head())
print("Available categories in the metadata:")
print(metadata['category'].unique())

In [None]:
# ESC-50 categories used as natural soundscapes / water sounds
natural_sound_classes = [
    'rain',
    'sea_waves',
    'crackling_fire',
    'thunderstorm',
    'wind',
    'water_drops',
    'pouring_water',
]

# ESC-50 categories used as exterior / urban noise
urban_sound_classes = [
    'train',
    'siren',
    'airplane',
    'engine',
    'helicopter',
]

# Keep only the metadata rows whose category belongs to one of the two groups;
# the combined table lists the natural rows first, then the urban rows.
natural_sounds = metadata.loc[metadata['category'].isin(natural_sound_classes)]
urban_sounds = metadata.loc[metadata['category'].isin(urban_sound_classes)]
selected_sounds = pd.concat([natural_sounds, urban_sounds])

print(f"Found {len(selected_sounds)} files in the selected categories")

In [None]:
# Pick one background clip at random and load it from disk
def load_random_background(audio_path, selected_sounds, metadata_df=None):
    """
    Draws one row at random from ``selected_sounds`` and loads the audio file it names.

    Parameters:
        audio_path (str): Directory that contains the audio files
        selected_sounds (pd.DataFrame): Candidate clips; must be non-empty and provide
            the 'filename' and 'category' columns
        metadata_df (pd.DataFrame, optional): Accepted for interface compatibility;
            the current implementation does not use it

    Returns:
        tuple: (waveform, sample_rate, info_string) where waveform and sample_rate are
            what torchaudio.load returns and info_string is "<filename> (<category>)"

    Raises:
        AssertionError: If ``selected_sounds`` has no rows
    """
    assert len(selected_sounds) > 0

    # sample(1) yields a one-row frame; iloc[0] turns it into a row we can index by column
    chosen = selected_sounds.sample(1).iloc[0]
    clip_name, clip_category = chosen['filename'], chosen['category']

    waveform, sample_rate = torchaudio.load(os.path.join(audio_path, clip_name))
    info = f"{clip_name} ({clip_category})"

    return waveform, sample_rate, info

In [None]:
def mix_with_background(query, query_sample_rate, background, background_sample_rate, mix_ratio=0.3):
    """
    Mixes query audio with background audio at the specified ratio.

    The background is resampled to the query's sample rate, then cropped (random
    segment) or looped so that it has exactly as many samples as the query.
    The result is ``(1 - mix_ratio) * query + mix_ratio * background``, scaled down
    only when its peak magnitude exceeds 1.0.

    Parameters:
        query (torch.Tensor or array-like): The main audio, 1D (samples) or 2D (channels, samples)
        query_sample_rate (int): Sample rate of the main audio
        background (torch.Tensor or array-like): The background audio, 1D or 2D (channels, samples)
        background_sample_rate (int): Sample rate of the background audio
        mix_ratio (float): Ratio of background to main audio (0.0 to 1.0)

    Returns:
        torch.Tensor: Mixed audio tensor. For a single-channel query this is a 1D tensor;
            a multi-channel query keeps its (channels, samples) shape.

    Raises:
        ValueError: If the background contains no samples
    """
    # Ensure query is a tensor with proper dimensions
    if not isinstance(query, torch.Tensor):
        query = torch.tensor(query, dtype=torch.float32)
    if query.dim() == 1:
        query = query.unsqueeze(0)  # Add channel dimension

    # Ensure background is a tensor with proper dimensions
    if not isinstance(background, torch.Tensor):
        background = torch.tensor(background, dtype=torch.float32)
    if background.dim() == 1:
        background = background.unsqueeze(0)  # Add channel dimension

    # An empty background cannot be looped to the query length
    if background.shape[1] == 0:
        raise ValueError("Background audio is empty")

    # A multi-channel background whose channel count differs from the query's would
    # either fail to broadcast or silently turn a mono query into a multi-channel
    # result; average it down to one channel so it broadcasts against any query.
    if background.shape[0] > 1 and background.shape[0] != query.shape[0]:
        if not background.is_floating_point():
            background = background.to(torch.float32)
        background = background.mean(dim=0, keepdim=True)

    # Ensure both have the same sample rate
    if query_sample_rate != background_sample_rate:
        background = torchaudio.functional.resample(
            background, orig_freq=background_sample_rate, new_freq=query_sample_rate
        )

    query_length = query.shape[1]
    bg_length = background.shape[1]

    if bg_length > query_length:
        # If background is longer, extract a random segment
        start = random.randint(0, bg_length - query_length)
        background = background[:, start:start + query_length]
    elif bg_length < query_length:
        # If background is shorter, loop it and trim the overshoot
        num_repeats = (query_length // bg_length) + 1
        background = background.repeat(1, num_repeats)
        background = background[:, :query_length]

    # Mix the audio
    mixed = (1.0 - mix_ratio) * query + mix_ratio * background

    # Normalize to prevent clipping (only when the peak actually exceeds full scale)
    max_val = torch.max(torch.abs(mixed))
    if max_val > 1.0:
        mixed = mixed / max_val

    # Drop the channel dimension when there is a single channel, giving a 1D tensor
    mixed = mixed.squeeze(0)

    return mixed

In [None]:
fma_dataset = load_dataset("benjamin-paine/free-music-archive-small", split="train")
fma_dataset

In [None]:
fma_dataset = fma_dataset.select_columns(['audio','title','genres'])
fma_dataset

In [None]:
def extract_random_segment(audio_dict, segment_duration=5):
    """
    Extracts a random segment of the specified duration from an audio dictionary.

    Args:
        audio_dict (dict): Dictionary containing 'array' (audio samples, indexed along
                           its first axis) and 'sampling_rate' (int).
        segment_duration (int or float): Duration of the segment to extract in seconds.
                           Fractional durations are rounded to a whole number of samples.

    Returns:
        torch.Tensor: A float32 tensor containing the extracted segment.

    Raises:
        ValueError: If the duration is negative or the audio is shorter than the
                    requested segment.
    """
    audio_array = audio_dict['array']
    sampling_rate = audio_dict['sampling_rate']

    total_samples = len(audio_array)
    # Slice bounds must be integers; a float duration (e.g. 2.5 s) would otherwise
    # produce a float sample count and break the slicing below
    segment_samples = int(round(segment_duration * sampling_rate))

    if segment_samples < 0:
        raise ValueError("segment_duration must not be negative")
    if segment_samples > total_samples:
        raise ValueError("Audio is shorter than the requested segment duration")

    # Select a random starting index (upper bound is exclusive, hence the +1)
    start_sample = int(np.random.randint(0, total_samples - segment_samples + 1))
    end_sample = start_sample + segment_samples

    # Extract segment
    segment = audio_array[start_sample:end_sample]

    # Convert to PyTorch tensor
    return torch.tensor(segment, dtype=torch.float32)

In [None]:
def apply_noise(audio_tensor, noise_level=0.02):
    """
    Returns the input with zero-mean Gaussian noise added to every element.

    Parameters:
        audio_tensor (torch.Tensor): The input audio tensor
        noise_level (float): Standard deviation of the added noise, in the same
                            amplitude units as the audio. Larger values make the
                            noise more audible; 0.0 leaves the values unchanged.

    Returns:
        torch.Tensor: New tensor (same shape as the input) with the noise added
    """
    # Unit-variance samples with the input's shape, dtype and device
    gaussian = torch.randn_like(audio_tensor)
    return audio_tensor + noise_level * gaussian

In [None]:
def apply_pitch_shift_optimized(audio_tensor, sampling_rate, min_semitones=0.17, max_semitones=1.5):
    """
    Pitch-shifts a short clip by a random amount, releasing memory afterwards.

    The shift magnitude is drawn uniformly from [min_semitones, max_semitones] and
    its direction (up or down) is chosen with equal probability.

    Parameters:
        audio_tensor (torch.Tensor): Clip to shift; a leading dimension is added
            before the transform and removed again afterwards
        sampling_rate (int): Sample rate passed to the pitch-shift transform
        min_semitones (float): Smallest shift magnitude, in semitones
        max_semitones (float): Largest shift magnitude, in semitones

    Returns:
        torch.Tensor: The pitch-shifted clip
    """
    import gc

    # Draw the magnitude first, then the direction (keeps the RNG call order stable)
    magnitude = random.uniform(min_semitones, max_semitones)
    shift_down = random.choice([True, False])
    n_steps = -magnitude if shift_down else magnitude

    # No gradients are needed for data augmentation
    with torch.no_grad():
        shifter = torchaudio.transforms.PitchShift(
            sample_rate=sampling_rate, n_steps=n_steps
        )
        batched = audio_tensor.unsqueeze(0)
        result = shifter(batched).squeeze(0)

    # Release Python garbage and, when a GPU is present, cached CUDA memory
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return result

In [None]:
def eq_filter_channel(audio_np, sampling_rate, bass_gain, mid_gain, treble_gain):
    """
    Three-band equalizer for one channel; helper for apply_eq_masking.

    The signal is split with 4th-order Butterworth designs applied forwards and
    backwards (filtfilt) into a low band (below 250 Hz), a middle band
    (250 Hz to 4 kHz) and a high band (above 4 kHz). Each band is scaled by its
    gain, given in dB, and the three bands are summed.

    Parameters:
        audio_np (np.ndarray): Samples to filter
        sampling_rate (int): Sample rate of the audio, in Hz
        bass_gain (float): Gain in dB for the low band
        mid_gain (float): Gain in dB for the middle band
        treble_gain (float): Gain in dB for the high band

    Returns:
        np.ndarray: Sum of the three gain-scaled bands
    """
    # Band edges in Hz, expressed as fractions of the Nyquist frequency
    bass_cutoff = 250
    treble_cutoff = 4000
    nyquist = sampling_rate / 2
    low_edge = bass_cutoff / nyquist
    high_edge = treble_cutoff / nyquist

    def _band(edges, kind, gain_db):
        # Design the filter, run it zero-phase, and apply the dB gain as a linear factor
        b, a = signal.butter(4, edges, kind)
        return signal.filtfilt(b, a, audio_np) * (10 ** (gain_db / 20))

    bass = _band(low_edge, 'lowpass', bass_gain)
    mid = _band([low_edge, high_edge], 'bandpass', mid_gain)
    treble = _band(high_edge, 'highpass', treble_gain)

    return bass + mid + treble

def apply_eq_masking(audio_tensor, sampling_rate, bass_gain=6.0, mid_gain=-3.0, treble_gain=2.0):
    """
    Re-balances the bass, mid and treble bands of a clip via eq_filter_channel.

    Parameters:
        audio_tensor (torch.Tensor): The input audio tensor. A tensor with more than
            one dimension is processed entry by entry along its first axis
            (one entry per channel); otherwise it is filtered as a single channel.
        sampling_rate (int): The sampling rate of the audio
        bass_gain (float): Gain in dB for bass frequencies (< 250Hz)
        mid_gain (float): Gain in dB for mid frequencies (250Hz-4kHz)
        treble_gain (float): Gain in dB for treble frequencies (> 4kHz)

    Returns:
        torch.Tensor: Equalized audio as a float32 tensor
    """
    # The filtering runs on NumPy arrays
    audio_np = audio_tensor.numpy()
    gains = (bass_gain, mid_gain, treble_gain)

    if audio_tensor.dim() > 1:
        # Filter each channel independently into a buffer shaped like the input
        result = np.zeros_like(audio_np)
        for channel_idx in range(audio_tensor.shape[0]):
            result[channel_idx] = eq_filter_channel(audio_np[channel_idx], sampling_rate, *gains)
    else:
        result = eq_filter_channel(audio_np, sampling_rate, *gains)

    # Back to PyTorch, always as float32
    return torch.from_numpy(result.astype(np.float32))

In [None]:
def play_audio(audio_array, sampling_rate):
    """
    Builds an inline audio player for a waveform.

    Args:
        audio_array (np.ndarray or torch.Tensor): The audio waveform. Tensors may live
            on the GPU or track gradients; they are detached and copied to the CPU
            before being converted.
        sampling_rate (int): The sample rate of the audio.

    Returns:
        IPython.display.Audio: The player object; it is rendered when it is the last
            expression of a cell.
    """
    # Convert to NumPy if it's a PyTorch tensor. Tensor.numpy() rejects CUDA tensors
    # and tensors that require grad, so normalise those first.
    if isinstance(audio_array, torch.Tensor):
        audio_array = audio_array.detach().cpu().numpy()

    return ipd.Audio(audio_array, rate=sampling_rate)


In [None]:
def convert_to_audio_format(audio_data, sampling_rate):
    """
    Convert audio data to the {"array", "sampling_rate"} dictionary expected by
    the Hugging Face Datasets Audio feature.

    Parameters:
        audio_data: numpy.ndarray, torch.Tensor, or any array-like (e.g. a list) -
            The audio data. NumPy arrays are passed through unchanged; tensors are
            detached and moved to the CPU first; anything else goes through np.asarray.
        sampling_rate: int - The sampling rate of the audio

    Returns:
        dict: {"array": np.ndarray, "sampling_rate": sampling_rate}
    """
    if isinstance(audio_data, np.ndarray):
        audio_np = audio_data
    elif isinstance(audio_data, torch.Tensor):
        # Tensor.numpy() fails for CUDA tensors and for tensors that require grad
        audio_np = audio_data.detach().cpu().numpy()
    else:
        # Plain sequences (lists, tuples, ...) have no tensor attributes such as is_cuda
        audio_np = np.asarray(audio_data)

    return {
        "array": audio_np,
        "sampling_rate": sampling_rate
    }

In [None]:
def get_query(example,idx):
    """
    Builds the retrieval queries for one dataset row (used with Dataset.map).

    A random 5 second clip is cut from the row's audio and three variants are
    derived from it: pitch-shifted, equalized, and mixed with a random background
    clip. Reads the module-level ``audio_path`` and ``selected_sounds``.

    Parameters:
        example (dict): Dataset row; its "audio" entry must provide 'array' and
            'sampling_rate'
        idx (int): Row index, used to build the identifiers

    Returns:
        dict: "pid" and "qid" identifiers, the four query clips ("q_audio",
            "q_audio_eq", "q_audio_pitch", "q_audio_back") as
            {"array", "sampling_rate"} dictionaries, and "q_audio_back_info"
            describing the background clip that was mixed in
    """
    source_audio = example["audio"]
    sr = source_audio["sampling_rate"]

    # Control query: an unmodified random 5 second snippet
    clean_clip = extract_random_segment(source_audio, segment_duration=5)

    # Variants derived from the control query
    pitched_clip = apply_pitch_shift_optimized(clean_clip, sr)
    eq_clip = apply_eq_masking(clean_clip, sr)

    # Random background clip, mixed in at a fixed ratio
    bg_waveform, bg_sr, bg_description = load_random_background(
        audio_path=audio_path,
        selected_sounds=selected_sounds
    )
    noisy_clip = mix_with_background(
        query=clean_clip,
        query_sample_rate=sr,
        background=bg_waveform,
        background_sample_rate=bg_sr,
        mix_ratio=0.3
    )

    record = {
        "pid": f"song_{idx}",
        "qid": f"qid_{idx}",
    }
    # Every clip is stored at the source sample rate, in the layout the Audio feature expects
    clips_by_column = (
        ("q_audio", clean_clip),
        ("q_audio_eq", eq_clip),
        ("q_audio_pitch", pitched_clip),
        ("q_audio_back", noisy_clip),
    )
    for column, clip in clips_by_column:
        record[column] = convert_to_audio_format(clip, sr)
    record["q_audio_back_info"] = bg_description

    return record

## Unit Test of Augmentations

In [None]:
# Smoke test: run the augmentation pipeline on the first 10 tracks only
ds_sample = fma_dataset.select(range(10))
ds_sample = ds_sample.map(get_query, with_indices=True)

# Cast each generated query column to the Audio feature type
for query_column in ("q_audio", "q_audio_eq", "q_audio_pitch", "q_audio_back"):
    ds_sample = ds_sample.cast_column(query_column, Audio())

In [None]:
# List every column of the sample dataset with the class name of its feature type,
# to check the result of the Audio casts above
for column_name, feature_type in ds_sample.features.items():
    print(f"Column: {column_name}, Type: {type(feature_type).__name__}")

In [None]:
# Pick one row of the sample dataset and pull out the sample array of each query variant.
# NOTE: the variables are named "example_0" but hold row `idx` (currently 3), not row 0.
idx=3
ds_example_0 = ds_sample[idx]
ds_example_0_q_audio_array = ds_example_0["q_audio"]["array"]
ds_example_0_q_audio_eq_array = ds_example_0["q_audio_eq"]["array"]
ds_example_0_q_audio_pitch_array = ds_example_0["q_audio_pitch"]["array"]
ds_example_0_q_audio_back_array = ds_example_0["q_audio_back"]["array"]

# Print the Python type of each extracted array
print(type(ds_example_0_q_audio_array))
print(type(ds_example_0_q_audio_eq_array))
print(type(ds_example_0_q_audio_pitch_array))
print(type(ds_example_0_q_audio_back_array))

In [None]:
# Play the source track of row `idx`: the "document" that the queries should retrieve
example_audio = ds_sample[idx]["audio"]
play_audio(example_audio["array"], example_audio["sampling_rate"])

In [None]:
# Play the control query: the unmodified random 5 second clip cut from the source track
play_audio(ds_example_0_q_audio_array,ds_example_0['q_audio']['sampling_rate'])

In [None]:
# Play the equalized (EQ-masked) version of the control query
play_audio(ds_example_0_q_audio_eq_array,ds_example_0['q_audio_eq']['sampling_rate'])

In [None]:
# Play the pitch-shifted version of the control query
play_audio(ds_example_0_q_audio_pitch_array,ds_example_0['q_audio_pitch']['sampling_rate'])

In [None]:
# Play the version of the control query that has a background clip mixed in
play_audio(ds_example_0_q_audio_back_array,ds_example_0['q_audio_back']['sampling_rate'])

## Apply to whole dataset

In [None]:
# Never hardcode the access token in the notebook. Read it from the HF_TOKEN
# environment variable; when that is unset, token=None makes login() ask for the
# token interactively instead.
# NOTE(review): a literal token was previously committed on this line; it should be
# treated as leaked and revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Generate the query columns for every track in the dataset
fma_dataset = fma_dataset.map(
    get_query,
    batch_size=100,
    writer_batch_size=100,
    with_indices=True,
)

# Cast each generated query column to the Audio feature type
for query_column in ("q_audio", "q_audio_eq", "q_audio_pitch", "q_audio_back"):
    fma_dataset = fma_dataset.cast_column(query_column, Audio())

# Keep a local copy on disk
fma_dataset.save_to_disk("data/fma_dataset")

# Publish to the Hugging Face Hub as a public dataset
fma_dataset.push_to_hub("ryanleeme17/free-music-archive-retrieval", private=False)

In [None]:
# Cast the four query columns to the Audio feature type
for query_column in ("q_audio", "q_audio_eq", "q_audio_pitch", "q_audio_back"):
    fma_dataset = fma_dataset.cast_column(query_column, Audio())

In [None]:
# Save the dataset to "data/fma_dataset", relative to the current working directory
fma_dataset.save_to_disk("data/fma_dataset")

In [None]:
# List every column of the full dataset with the class name of its feature type,
# to check that the query columns carry the Audio feature
for column_name, feature_type in fma_dataset.features.items():
    print(f"Column: {column_name}, Type: {type(feature_type).__name__}")

In [None]:
# Make sure `Audio` refers to the datasets feature class (IPython.display also
# exports a name `Audio`, which the imports cell brings in as well)
from datasets.features import Audio  # Use this import

# Cast the four query columns to the Audio feature type
for query_column in ("q_audio", "q_audio_eq", "q_audio_pitch", "q_audio_back"):
    fma_dataset = fma_dataset.cast_column(query_column, Audio())

In [None]:
# Save the dataset locally.
# NOTE(review): this cell writes to "../data/fma_dataset", whereas the other save cells
# in this notebook write to "data/fma_dataset" - confirm which location is intended.
fma_dataset.save_to_disk("../data/fma_dataset")
# NOTE(review): no upload call follows here; the only push_to_hub call visible in this
# notebook is in the cell that maps get_query over the whole dataset.
