In [None]:
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
import torch
import os

In [None]:
class UrbanSoundDataset(Dataset):
    """Dataset yielding ``(signal, label)`` pairs from an annotated audio folder.

    Each row of the annotations CSV describes one clip. Column 0 is read as the
    file name, column 5 as the fold number (clips are looked up under
    ``<audio_dir>/fold<N>/``) and column 6 as the label.

    Indexing the dataset loads the clip and then runs, in order: resampling to
    ``target_sample_rate``, mix-down to a single channel, truncation and
    right zero-padding to ``num_samples``, and finally ``transformation``.

    Args:
        annotations_file: Path of the CSV file holding the annotations.
        audio_dir: Root directory that contains the ``fold<N>`` sub-directories.
        transformation: Callable applied to the prepared waveform (for example
            a ``torchaudio.transforms.MelSpectrogram`` instance). ``None``
            returns the waveform itself.
        target_sample_rate: Sample rate every clip is resampled to. ``None``
            keeps the sample rate of the file.
        num_samples: Number of samples every clip is cut / padded to. ``None``
            keeps the length of the clip.
    """

    def __init__(self, annotations_file, audio_dir, transformation=None,
                 target_sample_rate=None, num_samples=None):
        # pandas reads the annotation table once; rows are accessed by position later on
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.transformation = transformation
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
        """Return the number of annotated clips."""
        return len(self.annotations)

    # Indexing protocol: a_list[1] -> a_list.__getitem__(1)
    def __getitem__(self, index):
        """Load clip ``index`` and return ``(signal, label)``.

        ``signal`` is the waveform after resampling, mix-down, cut / pad and
        the optional transformation; ``label`` is the value stored in column 6
        of the annotations row.
        """
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        # load the audio file: signal is (num_channels, samples), sr its sample rate
        signal, sr = torchaudio.load(audio_sample_path)
        # files may come with different sample rates, so bring them all to the same one
        signal = self._resample_if_necessary(signal, sr)
        # the signal may be stereo (or more channels); we want it to be mono
        signal = self._mix_down_if_necessary(signal)
        # make the number of samples match the target one
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        if self.transformation is not None:
            signal = self.transformation(signal)  # e.g. mel_spectrogram(signal)
        return signal, label

    def _get_audio_sample_path(self, index):
        """Build ``<audio_dir>/fold<N>/<file name>`` for row ``index``."""
        # the fold number is stored in the column with index 5 of the csv
        fold = f"fold{self.annotations.iloc[index, 5]}"
        # the file name is stored in the column with index 0
        path = os.path.join(self.audio_dir, fold, self.annotations.iloc[index, 0])
        return path

    def _get_audio_sample_label(self, index):
        """Return the label stored in the column with index 6 of row ``index``."""
        return self.annotations.iloc[index, 6]

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from ``sr`` to the target rate when they differ."""
        if self.target_sample_rate is not None and sr != self.target_sample_rate:
            # Resample is a callable object: build it, then call it on the signal
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        """Average all channels into one when the signal has more than one."""
        if signal.shape[0] > 1:
            # signal -> (num_channels, samples), e.g. (2, 16000) -> (1, 16000):
            # aggregate across dim 0 and keep that dimension
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _cut_if_necessary(self, signal):
        """Keep only the first ``num_samples`` samples of a longer signal."""
        if self.num_samples is not None and signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """Append zeros so that a shorter signal reaches ``num_samples`` samples."""
        if self.num_samples is not None and signal.shape[1] < self.num_samples:
            diff = self.num_samples - signal.shape[1]
            last_dim_padding = (0, diff)  # (left padding, right padding) of the last dim
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal


In [None]:
# Placeholder locations: point these at the annotations CSV and at the
# directory that holds the fold sub-directories before running the cell.
ANNOTATIONS_FILE = "..."
AUDIO_DIR = "..."

usd = UrbanSoundDataset(annotations_file=ANNOTATIONS_FILE, audio_dir=AUDIO_DIR)

print(f"There are {len(usd)} samples in the dataset.")

# Indexing the dataset returns one (signal, label) pair.
signal, label = usd[0]

In [None]:
# Sample rate handed to both the mel transform and the dataset.
SAMPLE_RATE = 16000

# MelSpectrogram is a callable object: mel_spectrogram(signal) returns the
# transformed signal, so the instance itself can be passed around.
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE, n_fft=1024, hop_length=512, n_mels=64
)

# New compared to the previous cell: the dataset now receives the transform
# and the target sample rate.
usd = UrbanSoundDataset(
    ANNOTATIONS_FILE,
    AUDIO_DIR,
    transformation=mel_spectrogram,
    target_sample_rate=SAMPLE_RATE,
)
signal, label = usd[0]


## Pre-processing audio with different durations

Training data must have a fixed shape, so every clip is cut or right-padded with zeros to the same number of samples.

In [None]:
# Every clip is cut / zero-padded to NUM_SAMPLES samples after being resampled
# to SAMPLE_RATE; with both set to 22050 that is one second of audio per clip.
NUM_SAMPLES = 22050
SAMPLE_RATE = 22050

# SAMPLE_RATE changed in this cell, so the mel transform has to be rebuilt:
# reusing one created for a different sample rate would make its filterbank
# disagree with the rate the dataset resamples the audio to.
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64,
)

usd = UrbanSoundDataset(ANNOTATIONS_FILE, AUDIO_DIR, mel_spectrogram, SAMPLE_RATE, NUM_SAMPLES)

