In [1]:
import pandas as pd
import csv
import json
import wave
import torchaudio
import numpy as np
import torch
import torch.nn.functional
from torch.utils.data import Dataset
import random
import argparse

In [2]:
# Read the label table (columns shown in the output: track file name and algorithm id).
labels_csv_path = '../../data/labels.csv'
test_df = pd.read_csv(labels_csv_path)
test_df.head()

Unnamed: 0,track,algorithm
0,00050dd7458cf08e594c797930696bce.wav,4
1,00070e7c531000d3dddc735d107275a9.wav,2
2,000f0711027a69b7f3886c2dbcb7d41f.wav,3
3,001e28e66dee24408aaf3480dfb95fbe.wav,1
4,001eee950f60613869544b72cd48fe97.wav,2


In [3]:
# Build a new frame whose 'track' column holds the relative path to each wav file
# instead of the bare file name; test_df itself is left untouched.
track_dir = '../../data/spcup_2022_training_part1/'
_df = test_df.assign(track=test_df['track'].map(lambda file_name: track_dir + file_name))
_df.head()

Unnamed: 0,track,algorithm
0,../../data/spcup_2022_training_part1/00050dd74...,4
1,../../data/spcup_2022_training_part1/00070e7c5...,2
2,../../data/spcup_2022_training_part1/000f07110...,3
3,../../data/spcup_2022_training_part1/001e28e66...,1
4,../../data/spcup_2022_training_part1/001eee950...,2


In [4]:
# Run configuration, gathered in one Namespace so it can be read as `args.<name>`.
args = argparse.Namespace(
    # --- data ---
    data_train='./final_train_data.json',
    data_val='./final_test_data.json',
    data_test=_df,  # a DataFrame (not a JSON path like the train/val entries)
    data_eval='.json',
    # --- task / model ---
    n_class=6,
    model='ast',
    dataset='speechcommands',  # key into the per-dataset lookup tables
    exp_dir='.',
    # --- optimisation ---
    lr=0.001,
    optim='adam',
    batch_size=4,
    num_workers=4,
    n_epochs=3,
    lr_patience=2,
    n_print_steps=100,
    save_model=None,
    # --- augmentation ---
    freqm=0,
    timem=0,
    mixup=0,
    bal=False,
    # --- patch strides / pretraining ---
    fstride=10,
    tstride=10,
    imagenet_pretrain=True,
    audioset_pretrain=True,
    noise_level=0.1,
)

In [5]:
# Per-dataset spectrogram [mean, std], used to normalize the input.
norm_stats = {
    'audioset': [-4.2677393, 4.5689974],
    'esc50': [-6.6268077, 5.358466],
    'speechcommands': [-6.845978, 5.5654526],
}
# Per-dataset number of spectrogram frames each sample is padded / cut to.
target_length = {
    'audioset': 1024,
    'esc50': 512,
    'speechcommands': 128,
}
# Whether to add noise for data augmentation; only used for speech commands.
noise = {
    'audioset': False,
    'esc50': False,
    'speechcommands': True,
}

# Evaluation-mode audio settings for the dataset selected in args:
# no masking, no mix-up and no noise.
dataset_mean, dataset_std = norm_stats[args.dataset]
val_audio_conf = {
    'num_mel_bins': 128,
    'target_length': target_length[args.dataset],
    'freqm': 0,
    'timem': 0,
    'mixup': 0,
    'dataset': args.dataset,
    'mode': 'evaluation',
    'mean': dataset_mean,
    'std': dataset_std,
    'noise': False,
}


In [6]:
class AudioTestDataset(Dataset):
    """Unlabelled audio dataset that yields normalized log-mel filterbank features.

    Each item is a single tensor of shape [target_length, num_mel_bins]; no label
    is returned, so this class is meant for inference on tracks without targets.
    """

    def __init__(self, dataset_df, audio_conf):
        """
        Dataset that manages audio recordings
        :param dataset_df: DataFrame with a 'track' column holding one audio file path per row
        :param audio_conf: Dictionary containing the audio loading and preprocessing settings
            (keys read here: mode, num_mel_bins, freqm, timem, mixup, dataset, mean, std,
            skip_norm, noise, noise_level; target_length is read when loading a file)
        """
        # Keep plain path strings. Going through `.loc[:, ['track']].values` would
        # produce one 1-element numpy array per row, which is not a valid path
        # argument for torchaudio.load.
        self.data = dataset_df['track'].tolist()
        self.audio_conf = audio_conf
        print(
            '---------------the {:s} dataloader---------------'.format(self.audio_conf.get('mode')))
        self.melbins = self.audio_conf.get('num_mel_bins')
        self.freqm = self.audio_conf.get('freqm')
        self.timem = self.audio_conf.get('timem')
        print('now using following mask: {:d} freq, {:d} time'.format(
            self.freqm, self.timem))
        self.mixup = self.audio_conf.get('mixup')
        print('now using mix-up with rate {:f}'.format(self.mixup))
        self.dataset = self.audio_conf.get('dataset')
        print('now process ' + self.dataset)
        # dataset spectrogram mean and std, used to normalize the input
        self.norm_mean = self.audio_conf.get('mean')
        self.norm_std = self.audio_conf.get('std')
        # skip_norm disables input normalization; set it to True ONLY when you are
        # computing the normalization stats themselves, so they are measured on
        # un-normalized features.
        self.skip_norm = self.audio_conf.get('skip_norm') or False
        if self.skip_norm:
            print(
                'now skip normalization (use it ONLY when you are computing the normalization stats).')
        else:
            print('use dataset mean {:.3f} and std {:.3f} to normalize the input.'.format(
                self.norm_mean, self.norm_std))
        # Noise settings are stored for reference; __getitem__ in this class does
        # not add any noise to the features.
        self.noise = self.audio_conf.get('noise')
        self.noise_lvl = self.audio_conf.get('noise_level')
        if self.noise == True:
            print('now use noise augmentation')

        self.label_num = 6
        print('number of classes is {:d}'.format(self.label_num))

    def _wav2fbank(self, filename, filename2=None):
        """
        Load one recording (or a mix of two) and convert it to a filterbank matrix.

        :param filename: path of the audio file to load
        :param filename2: optional path of a second file to mix with the first (mix-up)
        :return: (fbank, mix_lambda) where fbank has exactly `target_length` rows
            (zero-padded at the end or truncated) and mix_lambda is the weight given
            to the first file, or 0 when no mixing was done
        """
        if filename2 is None:
            waveform, sr = torchaudio.load(filename)
            waveform = waveform - waveform.mean()
            mix_lambda = 0
        else:
            waveform1, sr = torchaudio.load(filename)
            waveform2, _ = torchaudio.load(filename2)

            waveform1 = waveform1 - waveform1.mean()
            waveform2 = waveform2 - waveform2.mean()

            # Bring the second waveform to the length of the first, keeping its
            # [channel, time] layout in both directions.
            len1, len2 = waveform1.shape[1], waveform2.shape[1]
            if len1 > len2:
                # padding (zeros appended along the time axis)
                waveform2 = torch.nn.functional.pad(waveform2, (0, len1 - len2))
            elif len1 < len2:
                # cutting
                waveform2 = waveform2[:, :len1]

            # sample lambda from beta distribution
            mix_lambda = np.random.beta(10, 10)

            mix_waveform = mix_lambda * waveform1 + (1 - mix_lambda) * waveform2
            waveform = mix_waveform - mix_waveform.mean()

        fbank = torchaudio.compliance.kaldi.fbank(waveform, htk_compat=True, sample_frequency=sr, use_energy=False,
                                                  window_type='hanning', num_mel_bins=self.melbins, dither=0.0, frame_shift=10)

        target_length = self.audio_conf.get('target_length')
        n_frames = fbank.shape[0]
        p = target_length - n_frames

        # cut and pad along the frame axis so every item has the same length
        if p > 0:
            fbank = torch.nn.functional.pad(fbank, (0, 0, 0, p))
        elif p < 0:
            fbank = fbank[0:target_length, :]

        return fbank, mix_lambda

    def __getitem__(self, index):
        """
        Return the filterbank features of the recording at `index`.

        returns: fbank, a tensor of shape [time_frame_num, frequency_bins]
        (e.g. [1024, 128]), normalized with the configured mean/std unless
        skip_norm is set. No label is returned.
        """
        datum = self.data[index]
        # do mix-up for this sample (controlled by the given mixup rate)
        if random.random() < self.mixup:
            # sample the other recording from the uniform distribution
            mix_sample_idx = random.randint(0, len(self.data) - 1)
            mix_datum = self.data[mix_sample_idx]
            # get the mixed fbank; the mixing weight is not needed without labels
            fbank, _ = self._wav2fbank(datum, mix_datum)
        else:
            fbank, _ = self._wav2fbank(datum)

        # normalize the input; skipped only when collecting the normalization stats
        if not self.skip_norm:
            fbank = (fbank - self.norm_mean) / (self.norm_std * 2)

        return fbank

    def __len__(self):
        """Number of recordings in the dataset."""
        return len(self.data)


In [7]:
# Wrap the unlabelled test tracks in a dataset, then batch them in file order
# (no shuffling) so outputs line up with the rows of the input frame.
test_dataset = AudioTestDataset(args.data_test, audio_conf=val_audio_conf)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=2,
    shuffle=False,
    num_workers=args.num_workers,
    pin_memory=False,
)

---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process speechcommands
use dataset mean -6.846 and std 5.565 to normalize the input.
number of classes is 6


In [8]:
# Sanity check: pull only the first batch from the loader and show it.
data = next(iter(test_loader), None)
if data is not None:
    print(data)