In [1]:
# import base libraries
import os
import warnings

import pandas as pd

# pytorch
import torch
import torchaudio
from torch.utils.data import Dataset

# sklearn
from sklearn.model_selection import train_test_split

# jupyter notebook configuration
from IPython.display import Audio
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Google Drive location of the UrbanSound8K dataset (Colab mount point)
data_path = '/content/drive/MyDrive/UrbanSound8K/'
# derived from data_path so the dataset root is defined in exactly one place;
# the trailing slash is kept so the value is the same as before
audio_path = data_path + 'audio/'

# metadata file -- data_path already ends with '/', so no leading slash here
# (the previous form produced '.../UrbanSound8K//metadata/...')
metadata_file = data_path + 'metadata/UrbanSound8K.csv'
df = pd.read_csv(metadata_file)
df.head()

In [None]:
# report the (rows, columns) size of the metadata table
metadata_shape = df.shape
print(f'metadata shape: {metadata_shape}')

In [None]:
# extract the metadata columns used downstream:
#   labels -> list of class ids, folds -> array of fold numbers,
#   files  -> list of slice file names
labels = df["classID"].tolist()
folds = df["fold"].to_numpy()
files = df["slice_file_name"].tolist()

In [None]:
class SundDS(Dataset):
    """Map-style dataset producing mel-spectrogram + MFCC features per audio clip.

    Every item is loaded from disk, mixed down to mono, resampled to ``sr``,
    zero-padded or truncated to exactly ``sr`` samples, and converted into a
    single feature matrix so that all items share the same shape and can be
    batched.

    Args:
        file_path: indexable collection of audio file paths.
        class_id: indexable collection of integer class labels, aligned
            one-to-one with ``file_path``.
        sr: target sample rate in Hz; also used as the fixed clip length in
            samples (i.e. one second of audio). Defaults to 44100.

    Raises:
        ValueError: if ``file_path`` and ``class_id`` have different lengths.
    """

    def __init__(self, file_path, class_id, sr=44100):
        if len(file_path) != len(class_id):
            raise ValueError(
                f"file_path and class_id must have the same length, "
                f"got {len(file_path)} and {len(class_id)}"
            )
        self.file_path = file_path
        self.class_id = class_id
        self.sr = sr

        # Build the transforms once instead of on every __getitem__ call;
        # this is valid because every clip is resampled to self.sr first.
        self.mel_spec = torchaudio.transforms.MelSpectrogram(sample_rate=sr)
        self.mfcc = torchaudio.transforms.MFCC(sample_rate=sr)

    def __len__(self):
        """Return the number of clips (required for samplers / DataLoader)."""
        return len(self.file_path)

    def _fix_length(self, mono):
        """Zero-pad or truncate a ``(1, n)`` waveform to exactly ``self.sr`` samples."""
        n_samples = mono.shape[-1]
        if n_samples < self.sr:
            # pad only on the right of the last (time) dimension
            return torch.nn.functional.pad(mono, (0, self.sr - n_samples))
        return mono[:, :self.sr]

    def __getitem__(self, idx):
        """Load clip ``idx`` and return its features and label.

        Returns:
            dict with
              * ``"spec"``: float tensor of shape ``(frames, features)`` where
                the feature axis is the mel bins followed by the MFCCs;
              * ``"label"``: scalar long tensor holding the class id.
        """
        path = self.file_path[idx]

        # load audio
        waveform, sr = torchaudio.load(path, normalize=True)

        # convert stereo to mono
        mono = torch.mean(waveform, dim=0, keepdim=True)

        # bring every file to the common rate so self.sr samples always
        # represent the same duration and the transforms see a consistent rate
        if sr != self.sr:
            mono = torchaudio.functional.resample(mono, sr, self.sr)

        # fixed-length clip; the features below must be computed from this
        # tensor (not from `mono`) so that all items have the same frame count
        data = self._fix_length(mono)

        mel_spec = self.mel_spec(data)

        # mel-frequency cepstrum coefficients
        mfcc = self.mfcc(data)

        # create features: stack along the feature axis -> (1, features, frames)
        features = torch.cat([mel_spec, mfcc], dim=1)

        # drop the channel axis and put time first: (frames, features).
        # .to()/.contiguous() instead of torch.tensor(tensor), which would
        # copy-construct from a tensor and emit a UserWarning.
        spec = features[0].permute(1, 0).contiguous().to(torch.float)

        return {"spec": spec,
                "label": torch.tensor(self.class_id[idx], dtype=torch.long)}

In [None]:
# build the full path of every clip: <audio_path>/fold<k>/<slice_file_name>
# (components are passed separately so os.path.join actually does the joining
# and the result does not depend on audio_path ending with a slash)
path = [os.path.join(audio_path, f"fold{folder}", file) for folder, file in zip(folds, files)]

# NOTE(review): this is a random 80/20 split that ignores the `fold` column.
# The UrbanSound8K authors recommend evaluating on the predefined folds, since
# slices cut from the same recording can otherwise end up in both train and
# test -- confirm that a random split is really intended here.
X_train, X_test, y_train, y_test = train_test_split(path, labels, random_state=42, test_size=0.2)