In [1]:
import torch
import torchaudio
import torchaudio.sox_effects as sox
import numpy as np
from IPython.display import Audio

In [2]:
# Call torchaudio's sox-effects initialiser before any effect is listed or applied.
# NOTE(review): the source line echoed in this cell's output looks like the tail of a
# warning — check whether the installed torchaudio still needs this call.
sox.init_sox_effects()

  sox.init_sox_effects()


In [3]:
# Display the names of the sox effects available through torchaudio.sox_effects
# (these are the strings accepted as the first element of each effect entry below).
sox.effect_names()

['allpass',
 'band',
 'bandpass',
 'bandreject',
 'bass',
 'bend',
 'biquad',
 'chorus',
 'channels',
 'compand',
 'contrast',
 'dcshift',
 'deemph',
 'delay',
 'dither',
 'divide',
 'downsample',
 'earwax',
 'echo',
 'echos',
 'equalizer',
 'fade',
 'fir',
 'firfit',
 'flanger',
 'gain',
 'highpass',
 'hilbert',
 'loudness',
 'lowpass',
 'mcompand',
 'norm',
 'oops',
 'overdrive',
 'pad',
 'phaser',
 'pitch',
 'rate',
 'remix',
 'repeat',
 'reverb',
 'reverse',
 'riaa',
 'silence',
 'sinc',
 'speed',
 'stat',
 'stats',
 'stretch',
 'swap',
 'synth',
 'tempo',
 'treble',
 'tremolo',
 'trim',
 'upsample',
 'vad',
 'vol']

In [4]:
# Effect params: display a dict mapping each sox effect name to its usage string,
# i.e. the positional arguments and flags that effect accepts.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from torchaudio.utils.sox_utils import list_effects
list_effects()

{'allpass': 'frequency width[h|k|q|o]',
 'band': '[-n] center [width[h|k|q|o]]',
 'bandpass': '[-c] frequency width[h|k|q|o]',
 'bandreject': 'frequency width[h|k|q|o]',
 'bass': 'gain [frequency(100) [width[s|h|k|q|o]](0.5s)]',
 'bend': '[-f frame-rate(25)] [-o over-sample(16)] {start,cents,end}',
 'biquad': 'b0 b1 b2 a0 a1 a2',
 'chorus': 'gain-in gain-out delay decay speed depth [ -s | -t ]',
 'channels': 'number',
 'compand': "attack1,decay1{,attack2,decay2} [soft-knee-dB:]in-dB1[,out-dB1]{,in-dB2,out-dB2} [gain [initial-volume-dB [delay]]]\n\twhere {} means optional and repeatable and [] means optional.\n\tdB values are floating point or -inf'; times are in seconds.",
 'contrast': '[enhancement (75)]',
 'dcshift': 'shift [ limitergain ]\n\tThe peak limiter has a gain much less than 1.0 (ie 0.05 or 0.02) which\n\tis only used on peaks to prevent clipping. (default is no limiter)',
 'deemph': '',
 'delay': '{position}',
 'dither': '[-S|-s|-f filter] [-a] [-p precision]\n  (none)   U

In [5]:
# Load one GTZAN classical excerpt; torchaudio.load returns the waveform tensor and
# the file's sample rate, which every later cell reuses as `sample_rate`.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
waveform, sample_rate = torchaudio.load('/home/oriol_colome_font_epidemicsound_/Master-Thesis/datasets/GTZAN/gtzan_genre/genres/classical/classical.00018.wav')
# Average over dim 0; keepdim=True keeps that dimension with size 1.
waveform = waveform.mean(dim=0, keepdim=True)  # convert stereo to mono
# Last expression of the cell: renders an audio player for the loaded clip.
Audio(waveform, rate=sample_rate)

In [6]:
# Draw one random setting per effect using NumPy's global RNG, so every run of this
# cell yields a different augmentation unless np.random.seed(...) was called earlier.
gain = np.random.randint(-12, 0)  # integer in [-12, -1], passed to `gain -n`
pitch = np.random.randint(-1200, 1200)  # integer in [-1200, 1199], passed to `pitch`
# A single random value in [0, 99] is reused for all three reverb arguments.
reverb_params = [np.random.randint(0, 100)] * 3
# Order follows the sox usage string: gain-in gain-out delay decay speed depth [-s|-t]
chorus_params = [
    round(np.random.uniform(0.1, 1.0), 1),  # gain-in
    round(np.random.uniform(0.1, 1.0), 1),  # gain-out
    np.random.randint(20, 55),              # delay
    round(np.random.uniform(0.1, 0.9), 1),  # decay
    round(np.random.uniform(0.1, 2.0), 2),  # speed
    np.random.randint(2, 5),                # depth
    np.random.choice(["-s", "-t"]),         # modulation flag
]
drive = np.random.randint(0, 30)
stretch = round(np.random.uniform(0.8, 1.2), 1)
speed = np.random.uniform(0.7, 1.3)
tremolo_speed = np.random.uniform(0.1, 100)
tremolo_depth = np.random.randint(1, 101)

# Build the effect chain; every argument must be a string.
# `pitch` and `speed` can leave the chain running at a different sample rate (for
# `speed` that is all the effect does), so each is followed by a `rate` effect that
# resamples back to `sample_rate`. Without it, treating the result as `sample_rate`
# audio (as every later cell does) silently undoes the speed change and makes the
# subsequent effects operate at the wrong rate.
effects = [
    ["gain", "-n", f"{gain}"],
    ["chorus", *map(str, chorus_params)],
    ["overdrive", f"{drive}"],
    ["pitch", f"{pitch}"],
    ["rate", f"{sample_rate}"],
    ["reverb", *[str(param) for param in reverb_params]],
    ["speed", f"{speed}"],
    ["rate", f"{sample_rate}"],
    ["stretch", f"{stretch}"],
    ["tremolo", f"{tremolo_speed}", f"{tremolo_depth}"],
]

positive, positive_sample_rate = sox.apply_effects_tensor(waveform, sample_rate, effects)
# Later cells play and chunk `positive` using `sample_rate`; fail loudly if that no longer holds.
assert positive_sample_rate == sample_rate, (
    f"Effect chain changed the sample rate: {sample_rate} -> {positive_sample_rate}"
)
positive = positive.mean(dim=0, keepdim=True)  # convert stereo to mono

In [11]:
# Render an audio player for the augmented ("positive") clip, using the sample rate
# obtained when the original file was loaded.
# NOTE(review): this cell's execution count (11) is out of sequence with its
# neighbours — re-run the notebook top to bottom before sharing.
Audio(positive, rate=sample_rate)

In [8]:
def generate_negative(positive: torch.Tensor,
                      default_sample_rate: int = 44100,
                      min_chunk_duration_sec: float = 0.05,
                      max_chunk_duration_sec: float = 1.0) -> torch.Tensor:
    """Create a "negative" example by cutting a clip into random chunks and shuffling them.

    The clip is partitioned along its last dimension into consecutive chunks whose
    lengths are drawn uniformly between the minimum and maximum chunk duration; the
    chunks are then concatenated in a random, non-original order. The result contains
    exactly the same samples as the input, only re-ordered in time.

    Args:
        positive: Tensor whose last dimension is time (samples).
        default_sample_rate: Sample rate used to convert the durations to samples.
        min_chunk_duration_sec: Shortest chunk duration, in seconds.
        max_chunk_duration_sec: Longest chunk duration, in seconds.

    Returns:
        A new tensor with the same shape as ``positive``. A clip too short to be
        split into more than one chunk is returned as an unshuffled copy.

    Raises:
        ValueError: If the maximum chunk length is smaller than the minimum one
            (this includes a non-positive sample rate), or if the output shape
            unexpectedly differs from the input shape.
    """
    positive_length = positive.shape[-1]

    # Convert durations to sample counts; at least one sample per chunk so the
    # partition loop below always makes progress.
    min_chunk_length = max(1, int(min_chunk_duration_sec * default_sample_rate))
    max_chunk_length = int(max_chunk_duration_sec * default_sample_rate)
    if max_chunk_length < min_chunk_length:
        raise ValueError(
            f"Maximum chunk length ({max_chunk_length} samples) is smaller than "
            f"the minimum chunk length ({min_chunk_length} samples)"
        )

    # Draw chunk lengths one at a time until the clip is fully covered. Only the
    # final chunk is clamped to the end of the clip, so it may be shorter than the
    # minimum; no chunk is empty and none exceeds the maximum.
    boundaries = [0]
    while boundaries[-1] < positive_length:
        length = int(np.random.randint(min_chunk_length, max_chunk_length + 1))
        boundaries.append(min(boundaries[-1] + length, positive_length))

    chunks = [
        positive[..., start:end].detach()
        for start, end in zip(boundaries[:-1], boundaries[1:])
    ]

    # Empty clip: nothing to shuffle.
    if not chunks:
        return positive.detach().clone()

    # Shuffle the chunk order; if the draw happens to be the original order, rotate
    # it by one so the negative is not simply a copy of the positive.
    order = np.random.permutation(len(chunks))
    if len(chunks) > 1 and np.array_equal(order, np.arange(len(chunks))):
        order = np.roll(order, 1)

    # Concatenate the shuffled chunks to create the negative example
    #TODO crossfade?
    negative = torch.cat([chunks[i] for i in order], dim=-1)

    # Check if the positive and negative examples have the same length
    if positive.shape != negative.shape:
        raise ValueError(f"Input positive and output negative have different shapes: {positive.shape} vs {negative.shape}")

    return negative

In [9]:
# Scramble the augmented clip. The chunk durations are given in seconds, so pass the
# clip's actual sample rate instead of relying on the function's 44100 Hz default;
# otherwise the chunk lengths in samples are wrong for audio at any other rate.
negative_from_positive = generate_negative(positive, default_sample_rate=sample_rate)
#TODO generate negative from anchor as well?

In [10]:
 # Play the scrambled audio: the chunk-shuffled ("negative") version of the augmented
 # clip, rendered with the same sample rate used for the other players.
Audio(negative_from_positive, rate=sample_rate)