In [23]:
import torch
import os
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
from IPython.display import Audio

In [24]:
# Locations of the UrbanSound8K metadata table and of the audio folds,
# both relative to the notebook's working directory.
ANNOTATIONS_PATH = 'data/urbansound8k/metadata/UrbanSound8K.csv'
AUDIO_PATH = 'data/urbansound8k/audio/'

# Sample rate used to configure the mel-spectrogram transform below.
SAMPLE_RATE = 16000

# Mel-spectrogram settings, named so they can be tuned in one place.
N_FFT = 1024
HOP_LENGTH = 512
N_MELS = 64

mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    n_mels=N_MELS,
)

In [25]:
class UrbanSoundDataset(Dataset):
    """UrbanSound8K clips served as ``(transformed_signal, label)`` pairs.

    Each item is read from disk with ``torchaudio.load``, resampled to the
    target sample rate when its native rate differs, averaged down to a single
    channel when it has several, and finally passed through ``transformation``.

    No padding or truncation is applied, so the number of samples handed to
    ``transformation`` depends on the duration of the underlying clip.

    Args:
        annotations_path: Path of the metadata CSV. It must provide the
            ``slice_file_name``, ``fold`` and ``classID`` columns.
        audio_path: Directory that contains the ``fold<N>`` sub-directories.
        sample_rate: Sample rate every clip is resampled to.
        transformation: Callable applied to the mono waveform; its return
            value is the first element of each item.
    """

    # Metadata columns are looked up by name rather than by position so that an
    # extra or reordered column in the CSV cannot silently shift the lookup.
    FILE_NAME_COLUMN = 'slice_file_name'
    FOLD_COLUMN = 'fold'
    LABEL_COLUMN = 'classID'

    def __init__(self, annotations_path, audio_path, sample_rate, transformation):
        self.annotations = pd.read_csv(annotations_path)
        self.audio_path = audio_path
        self.target_sample_rate = sample_rate
        self.transformation = transformation
        # Resample transforms keyed by (source_rate, target_rate). Building one
        # is comparatively expensive, so each distinct pair is built only once.
        self._resamplers = {}

    def __len__(self):
        """Return the number of rows in the annotations table."""
        return len(self.annotations)

    def __getitem__(self, idx):  # dataset[0] -> dataset.__getitem__(0)
        """Load, normalise and transform the clip described by row ``idx``.

        Args:
            idx: Positional row index into the annotations table.

        Returns:
            A ``(signal, label)`` tuple: the output of ``transformation`` and
            the value of the ``classID`` column for that row.
        """
        audio_sample_path = self._get_audio_sample_path(idx)
        label = self._get_audio_sample_label(idx)
        signal, sample_rate = torchaudio.load(audio_sample_path)
        signal = self._resample_if_necessary(signal, sample_rate)
        signal = self._mix_down_if_necessary(signal)
        signal = self.transformation(signal)
        return signal, label

    def _resample_if_necessary(self, signal, sample_rate):
        """Return ``signal`` at the target sample rate.

        The input is returned untouched when ``sample_rate`` already equals
        the target rate; otherwise a cached ``Resample`` transform is applied.
        """
        if sample_rate == self.target_sample_rate:
            return signal
        # Include the target rate in the key so the cache stays valid even if
        # ``target_sample_rate`` is reassigned after construction.
        key = (sample_rate, self.target_sample_rate)
        resampler = self._resamplers.get(key)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate)
            self._resamplers[key] = resampler
        return resampler(signal)

    def _mix_down_if_necessary(self, signal):
        """Average a multi-channel signal down to a single channel.

        ``signal`` is treated as ``(num_channels, samples)``; for example
        ``(2, 16000)`` becomes ``(1, 16000)``. Single-channel input is
        returned unchanged.
        """
        if signal.shape[0] > 1:
            # keepdim=True preserves the leading channel dimension.
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, idx):
        """Return ``<audio_path>/fold<fold>/<slice_file_name>`` for row ``idx``."""
        slice_file_name = self.annotations[self.FILE_NAME_COLUMN].iloc[idx]
        fold = 'fold{0}'.format(self.annotations[self.FOLD_COLUMN].iloc[idx])
        return os.path.join(self.audio_path, fold, slice_file_name)

    def _get_audio_sample_label(self, idx):
        """Return the ``classID`` value of row ``idx``."""
        return self.annotations[self.LABEL_COLUMN].iloc[idx]

In [26]:
# Build the dataset from the configuration constants; keyword arguments make
# the role of each value explicit.
dataset = UrbanSoundDataset(
    annotations_path=ANNOTATIONS_PATH,
    audio_path=AUDIO_PATH,
    sample_rate=SAMPLE_RATE,
    transformation=mel_spectrogram,
)

In [27]:
# Preview the first three rows of the metadata table.
dataset.annotations.head(3)

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing


In [28]:
# First sample: the (transformed signal, label) tuple produced by
# UrbanSoundDataset.__getitem__ for row 0 of the annotations.
dataset[0]

(tensor([[[5.9252e-04, 8.3836e-04, 9.5463e-05, 1.0172e-03, 4.7910e-04,
           1.6123e-04, 9.2117e-05, 1.5897e-04, 5.1361e-05, 1.3561e-04],
          [1.1820e-03, 1.0683e-03, 1.1457e-03, 6.4863e-03, 4.1269e-03,
           1.4093e-03, 7.9711e-04, 5.6833e-04, 3.6605e-04, 6.6183e-04],
          [3.0207e-03, 3.7172e-03, 4.7084e-03, 5.5482e-03, 2.2801e-03,
           1.8242e-03, 2.9318e-03, 2.5542e-03, 2.2006e-03, 2.9955e-03],
          [1.5446e-03, 8.2648e-03, 5.8799e-03, 2.0113e-02, 1.0480e-03,
           7.9804e-04, 1.8259e-03, 1.8869e-03, 2.7578e-03, 3.2022e-03],
          [7.4168e-04, 3.5674e-03, 3.6353e-02, 5.9763e-02, 3.1323e-03,
           3.9150e-03, 5.1391e-04, 4.6460e-04, 6.8626e-04, 1.5702e-03],
          [1.9557e-03, 3.2061e-02, 1.3300e-01, 1.6586e-01, 3.4333e-02,
           9.3193e-03, 1.6575e-03, 1.2765e-03, 2.2374e-03, 1.1519e-03],
          [1.4192e-03, 7.4623e-02, 3.3315e-02, 9.7097e-02, 5.7333e-02,
           1.1877e-02, 1.0860e-02, 2.8191e-03, 6.4316e-03, 2.3993e-03],