In [23]:
import torch
import os
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
from IPython.display import Audio

In [39]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [29]:
# Dataset locations, relative to the notebook's working directory.
ANNOTATIONS_PATH = 'data/urbansound8k/metadata/UrbanSound8K.csv'
AUDIO_PATH = 'data/urbansound8k/audio/'

# Target sample rate and fixed clip length in samples
# (NUM_SAMPLES == SAMPLE_RATE, i.e. one second of audio per item).
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050

# Transform applied to every prepared signal by the dataset.
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64,
)

In [49]:
class UrbanSoundDataset(Dataset):
    """Audio clips listed in an annotations CSV, served as transformed signals.

    Each item is loaded from disk, moved to ``device``, resampled to
    ``target_sample_rate``, mixed down to a single channel, cut or
    right-padded with zeros to exactly ``num_samples`` samples, and finally
    passed through ``transformation``.

    Args:
        annotations_path: Path (or file-like object) handed to ``pd.read_csv``.
        audio_path: Root directory that contains the ``fold<N>`` sub-folders.
        target_sample_rate: Sample rate every signal is resampled to.
        num_samples: Exact number of samples per signal after cut/pad.
        transformation: Module applied to the prepared signal; it is moved
            to ``device``.
        device: Device on which signals, the transformation and the
            resamplers live.
    """

    # Positional columns of the annotations CSV read by this class
    # (slice_file_name, fold and classID in the UrbanSound8K metadata).
    _FILE_NAME_COLUMN = 0
    _FOLD_COLUMN = 5
    _LABEL_COLUMN = 6

    def __init__(self, annotations_path, audio_path, target_sample_rate, num_samples, transformation, device):
        self.annotations = pd.read_csv(annotations_path)
        self.audio_path = audio_path
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        self.device = device
        self.transformation = transformation.to(self.device)
        # One resampler per source sample rate, created lazily, so the
        # resampling kernel is not rebuilt for every item.
        self._resamplers = {}

    def __len__(self):
        """Return the number of annotated clips."""
        return len(self.annotations)

    def __getitem__(self, idx):  # dataset[0] -> dataset.__getitem__(0)
        """Return ``(transformed_signal, label)`` for the clip at row ``idx``."""
        audio_sample_path = self._get_audio_sample_path(idx)
        label = self._get_audio_sample_label(idx)
        signal, sample_rate = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        signal = self._resample_if_necessary(signal, sample_rate)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)
        return signal, label

    def _right_pad_if_necessary(self, signal):
        """Zero-pad ``signal`` on the right of dim 1 up to ``num_samples``."""
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            # (left, right) padding for the last dimension only.
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _cut_if_necessary(self, signal):
        """Keep only the first ``num_samples`` samples along dim 1."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _resample_if_necessary(self, signal, sample_rate):
        """Resample ``signal`` to ``target_sample_rate`` when the rates differ.

        The resampler is moved to ``self.device`` because ``signal`` has
        already been moved there; a resampler left on the CPU would fail with
        a device mismatch when ``device`` is a GPU.
        """
        if sample_rate == self.target_sample_rate:
            return signal
        resampler = self._resamplers.get(sample_rate)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate).to(self.device)
            self._resamplers[sample_rate] = resampler
        return resampler(signal)

    def _mix_down_if_necessary(self, signal):
        """Average all channels into one, keeping the channel dimension.

        signal is (num_channels, samples), e.g. (2, 16000) -> (1, 16000).
        """
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, idx):
        """Build ``<audio_path>/fold<fold>/<file name>`` for row ``idx``."""
        slice_file_name = self.annotations.iloc[idx, self._FILE_NAME_COLUMN]
        fold = f'fold{self.annotations.iloc[idx, self._FOLD_COLUMN]}'
        return os.path.join(self.audio_path, fold, slice_file_name)

    def _get_audio_sample_label(self, idx):
        """Return the label stored in the label column for row ``idx``."""
        return self.annotations.iloc[idx, self._LABEL_COLUMN]

In [50]:
# Build the dataset from the configured paths, audio settings and device.
dataset = UrbanSoundDataset(
    annotations_path=ANNOTATIONS_PATH,
    audio_path=AUDIO_PATH,
    target_sample_rate=SAMPLE_RATE,
    num_samples=NUM_SAMPLES,
    transformation=mel_spectrogram,
    device=device,
)

In [51]:
# Peek at the first three annotation rows.
dataset.annotations.head(3)

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing


In [48]:
# Fetch the first item: a (transformed signal, label) tuple from __getitem__.
# NOTE(review): this cell's execution count (48) is lower than that of the
# class-definition cell (49), so the stored output may come from an earlier
# version of the class - confirm with Restart Kernel -> Run All.
dataset[0]

(tensor([[[8.1443e-04, 2.1588e-04, 9.1436e-04,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [2.4870e-03, 1.5724e-03, 4.2855e-04,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [2.9685e-03, 6.0449e-03, 4.1659e-03,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          ...,
          [1.7061e-04, 5.7996e-02, 7.9969e-01,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [2.1515e-04, 1.5781e-02, 3.7936e-01,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [3.0015e-04, 1.4416e-02, 3.1233e-01,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00]]]),
 3)