In [None]:
import torch
import torchaudio
import torchaudio.functional as F

print(torch.__version__)
print(torchaudio.__version__)

import os
import random

from IPython.display import Audio
import matplotlib.pyplot as plt

In [None]:
# Sanity check: show where the kernel is running and confirm that the relative
# dataset path resolves by listing one genre folder.
blues_dir = "../../../datasets/GTZAN/gtzan_genre/genres/blues"
working_dir = os.getcwd()
print(working_dir)
print(os.listdir(blues_dir))

In [None]:
# Build `test_list`: one randomly chosen clip per genre, stored as
# [name, waveform, sample_rate].
root = '../../../datasets/GTZAN/gtzan_genre/genres/'
genres = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]

test_list = []
for genre in genres:
    genre_dir = os.path.join(root, genre)
    # Only regular, non-hidden files are candidates: stray entries such as
    # ".DS_Store" or nested folders would make torchaudio.load fail.
    # Sorting makes the candidate order independent of the filesystem.
    candidates = [
        entry
        for entry in sorted(os.listdir(genre_dir))
        if not entry.startswith('.') and os.path.isfile(os.path.join(genre_dir, entry))
    ]
    if not candidates:
        raise FileNotFoundError(f"No audio files found in {genre_dir!r}")
    song = random.choice(candidates)
    # NOTE: `audio` and `sr` deliberately stay notebook-level names. After the
    # loop they hold the clip of the last genre, and the augmentation cells
    # further down reuse them.
    audio, sr = torchaudio.load(os.path.join(genre_dir, song))
    test_list.append(['test_audio_' + str(genre), audio, sr])

# Print a compact summary (name, shape, sample rate) instead of the raw tensors.
print([(name, tuple(clip.shape), rate) for name, clip, rate in test_list])

In [None]:
# Print the name of every sampled clip and render a player for it.
# - Only the last expression of a cell is auto-displayed, so a bare Audio(...)
#   inside a loop body shows nothing; each player must go through display().
# - The loop variables are deliberately NOT called `audio` / `sr`: those names
#   hold the clip that the augmentation cells below operate on, and must not be
#   overwritten with a [name, waveform, sample_rate] list.
for clip_name, clip_waveform, clip_sr in test_list:
    print(clip_name)
    display(Audio(clip_waveform, rate=clip_sr))

In [None]:
def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None):
    """Draw one stacked subplot per channel of a 2-D waveform.

    Args:
        waveform: object with a ``.numpy()`` method whose result has exactly
            two dimensions, unpacked as (channels, frames).
        sample_rate: divisor that converts frame indices into the x-axis
            values (seconds when the rate is given in Hz).
        title: figure-level title.
        xlim: optional x-axis limits passed to ``set_xlim``; ignored when falsy.

    Raises:
        ValueError: if the converted waveform is not 2-D.
    """
    samples = waveform.numpy()

    n_channels, n_frames = samples.shape
    seconds = torch.arange(0, n_frames) / sample_rate

    figure, axes = plt.subplots(n_channels, 1)
    # plt.subplots returns a bare Axes for a single row; wrap it so that the
    # loop below can treat mono and multi-channel input alike.
    axes_seq = [axes] if n_channels == 1 else axes
    for number, (ax, channel) in enumerate(zip(axes_seq, samples), start=1):
        ax.plot(seconds, channel, linewidth=1)
        ax.grid(True)
        if n_channels > 1:
            ax.set_ylabel(f"Channel {number}")
        if xlim:
            ax.set_xlim(xlim)
    figure.suptitle(title)
    plt.show(block=False)

In [None]:
def play_audio(waveform, sample_rate):
    """Display an inline audio player for a mono or stereo waveform.

    Args:
        waveform: object with a ``.numpy()`` method whose result has exactly
            two dimensions, unpacked as (channels, frames).
        sample_rate: playback rate handed to ``Audio``.

    Raises:
        ValueError: if the channel count is anything other than 1 or 2, or if
            the converted waveform is not 2-D.
    """
    samples = waveform.numpy()

    n_channels, _ = samples.shape
    if n_channels not in (1, 2):
        raise ValueError("Waveform with more than 2 channels are not supported.")

    # Mono is passed as a single array, stereo as a (left, right) pair.
    if n_channels == 1:
        payload = samples[0]
    else:
        payload = (samples[0], samples[1])
    display(Audio(payload, rate=sample_rate))

In [None]:
# For every sampled clip: draw its waveform, then show a player right below it.
for clip_label, clip_tensor, clip_rate in test_list:
    plot_waveform(clip_tensor, clip_rate, title=str(clip_label), xlim=None)
    play_audio(clip_tensor, clip_rate)

In [None]:
import audio_augmentations
# The cells below call the augmentation classes by bare name, which a plain
# module import does not bind. Import them explicitly.
# NOTE(review): assumes the package exports all of these at top level - confirm.
from audio_augmentations import (
    Compose,
    ComposeMany,
    Delay,
    Gain,
    HighLowPass,
    Noise,
    PitchShift,
    PolarityInversion,
    RandomApply,
    RandomResizedCrop,
    Reverb,
)

In [None]:

# First augmentation chain. Every stage except the band filter is wrapped in
# RandomApply and therefore only fires with probability `p`.
# `sr` is the sample rate left over from the clip-loading loop.
polarity_flip = RandomApply([PolarityInversion()], p=0.8)
additive_noise = RandomApply([Noise(min_snr=0.001, max_snr=0.005)], p=0.3)
random_gain = RandomApply([Gain()], p=0.2)
band_filter = HighLowPass(sample_rate=sr)  # not wrapped: always applied in this augmentation chain
echo = RandomApply([Delay(sample_rate=sr)], p=0.5)
pitch_shift = RandomApply([PitchShift()], p=0.4)
reverb = RandomApply([Reverb(sample_rate=sr)], p=0.3)

transforms = [
    polarity_flip,
    additive_noise,
    random_gain,
    band_filter,
    echo,
    pitch_shift,
    reverb,
]

In [None]:
# Chain the stages into one callable, run it on the clip held in `audio`,
# and let the player be the cell's displayed result.
transform = Compose(transforms=transforms)
transformed_audio = transform(audio)
Audio(data=transformed_audio, rate=sr)

In [None]:
# Second augmentation chain: random crop first, then grouped augmentations.
# `num_samples` was used here without ever being defined in the notebook, so
# the crop length is set explicitly. Five seconds is a tunable choice; it must
# not exceed the clip length.
num_samples = sr * 5
transforms = [
    RandomResizedCrop(n_samples=num_samples),
    RandomApply([PolarityInversion(), Noise(min_snr=0.001, max_snr=0.005)], p=0.8),
    RandomApply([Gain()], p=0.2),
    RandomApply([Delay(sample_rate=sr), Reverb(sample_rate=sr)], p=0.5)
]

In [None]:
# Apply the crop-based chain to the clip held in `audio` and play the result.
transform = Compose(transforms=transforms)
transformed_audio = transform(audio)
# The notebook imports `Audio` directly from IPython.display; there is no `ipd`
# alias in scope, so `ipd.Audio` would raise NameError.
Audio(transformed_audio, rate=sr)

In [None]:
# Produce several independently augmented versions of the same clip at once.
transform = ComposeMany(transforms=transforms, num_augmented_samples=4)
transformed_audio_many = transform(audio)
# An element-wise `==` against `transformed_audio` compares independently
# randomised results of different rank and only dumps a large boolean tensor.
# Show the batch shape instead.
# Expected (per the original note): [4, num_channels, num_samples]
transformed_audio_many.shape