In [204]:
"""
    Author: Moustafa Alzantot (malzantot@ucla.edu)
    All rights reserved.
"""
import torch
import collections
import os
import soundfile as sf
from torch.utils.data import DataLoader, Dataset
import numpy as np
from joblib import Parallel, delayed
import h5py

LOGICAL_DATA_ROOT = 'LA'
PHISYCAL_DATA_ROOT = 'data_physical'

ASVFile = collections.namedtuple('ASVFile',
    ['speaker_id', 'file_name', 'path', 'sys_id', 'key'])

class ASVDataset(Dataset):
    """ Utility class to load  train/dev datatsets """
    def __init__(self,
        LOGICAL_DATA_ROOT='LA',
        LOGICAL_PROTOCOL_DIR='LA',
        transform=None, 
        is_train=True,
        is_eval=False,
        sample_size=None,
        feature_name=None,
        eval_part=0):
        data_root = LOGICAL_DATA_ROOT
        track = 'LA'

        assert feature_name is not None, 'must provide feature name'
        self.track = track
        self.prefix = 'ASVspoof2019_{}'.format(track)
        v1_suffix = ''

        self.sysid_dict = {
            'human': 0,  # bonafide speech
            'spoof': 1, # Spoofed signal
        }
        
        self.is_eval = is_eval
        self.sysid_dict_inv = {v:k for k,v in self.sysid_dict.items()}
        self.data_root = data_root
        self.dset_name = 'eval' if is_eval else 'train' if is_train else 'dev'
        self.protocols_fname = 'eval_short.trn' if is_eval else 'train_short.trn' if is_train else 'dev_short.trn'
        self.protocols_dir = os.path.join(self.data_root,
            '{}_protocols/'.format(self.prefix))
        
        self.protocols_dir = LOGICAL_PROTOCOL_DIR
        
        self.files_dir = os.path.join(self.data_root, '{}_{}'.format(
            self.prefix, 'train')+v1_suffix, 'flac') # we just take from the train folder, sorry
        self.protocols_fname = os.path.join(self.protocols_dir,
            'ASVspoof2019.{}.cm.{}.txt'.format(track, self.protocols_fname))
        self.cache_fname = 'cache_{}_{}_{}.npy'.format(self.dset_name, track, feature_name)
        
        self.transform = transform
        if os.path.exists(self.cache_fname):
            self.data_x, self.data_y, self.data_sysid, self.files_meta = torch.load(self.cache_fname)
            print('Dataset loaded from cache ', self.cache_fname)
        else:
            self.files_meta = self.parse_protocols_file(self.protocols_fname)
            data = list(map(self.read_file, self.files_meta))
            self.data_x, self.data_y, self.data_sysid = map(list, zip(*data))
            if self.transform:
                # self.data_x = list(map(self.transform, self.data_x)) 
                self.data_x = Parallel(n_jobs=4, prefer='threads')(delayed(self.transform)(x) for x in self.data_x)
            torch.save((self.data_x, self.data_y, self.data_sysid, self.files_meta), self.cache_fname)
            print('Dataset saved to cache ', self.cache_fname)
        if sample_size:
            select_idx = np.random.choice(len(self.files_meta), size=(sample_size,), replace=True).astype(np.int32)
            self.files_meta= [self.files_meta[x] for x in select_idx]
            self.data_x = [self.data_x[x] for x in select_idx]
            self.data_y = [self.data_y[x] for x in select_idx]
            self.data_sysid = [self.data_sysid[x] for x in select_idx]
        self.length = len(self.data_x)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        x = self.data_x[idx]
        y = self.data_y[idx]
        return x, y, self.files_meta[idx]

    def read_file(self, meta):
        #data_x, sample_rate = sf.read(meta.path)
        data_x = read_and_preprocess_file(meta.path)
        data_y = meta.key
        # MODIFYING THIS TO INT 
        return data_x, int(data_y), meta.sys_id

    def _parse_line(self, line):
        tokens = line.strip().split(' ')
        print("In parse")
        print(line)
        return ASVFile(speaker_id=tokens[0],
            file_name=tokens[1],
            path=os.path.join(self.files_dir, tokens[1] + '.flac'),
            sys_id=self.sysid_dict[tokens[3]],
            key=int(tokens[3] == 'human'))

    def parse_protocols_file(self, protocols_fname):
        lines = open(protocols_fname).readlines()
        files_meta = map(self._parse_line, lines)
        return list(files_meta)


# if __name__ == '__main__':
#    train_loader = ASVDataset(LOGICAL_DATA_ROOT, is_train=True)
#    assert len(train_loader) == 25380, 'Incorrect size of training set.'
#    dev_loader = ASVDataset(LOGICAL_DATA_ROOT, is_train=False)
#    assert len(dev_loader) == 24844, 'Incorrect size of dev set.'

In [148]:
import librosa

int16_max = (2 ** 15) - 1

def read_and_preprocess_file(path: str, sampling_rate=16000) -> np.array:
    """
    Read audio file to spectrogram.
    By reading it to some other intermediate form first.
    ...I don't know, i just hope this works, i basically have no idea what i'm doing. :D
    By default, uses the same sampling rate that was used in AutoSpeech.
    
    Params:
        path: file location
        sampling_rate: desired sampling rate (will resample the supplied wav if needed)
        
    Retruns:
        spectrogram of supplied file
    """
    wav, sr = librosa.load(path, sr=sampling_rate)
    wav = normalize_volume(wav, increase_only=True)
    spectrogram = wav_to_spectrogram(wav) # obtain (freq, time) spectrogram
    # Resample the spectrogram to bound its duration.
    # It must be transposed to (time, freq) first
    spectrogram = adjust_duration(spectrogram.T, partial_n_frames=300)
    # Re-transpose to get it back to (freq, time) shape
    return spectrogram.T 
    
    

def normalize_volume(wav, target_dBFS=-30, increase_only=False, decrease_only=False):
    """
    Normalizes volume of supplied wav.
    I literally copied this from autospeech and changed target_dBFS to their default value.
    """
    if increase_only and decrease_only:
        raise ValueError("Both increase only and decrease only are set")
    rms = np.sqrt(np.mean((wav * int16_max) ** 2))
    wave_dBFS = 20 * np.log10(rms / int16_max)
    dBFS_change = target_dBFS - wave_dBFS
    if dBFS_change < 0 and increase_only or dBFS_change > 0 and decrease_only:
        return wav
    return wav * (10 ** (dBFS_change / 20))


def wav_to_spectrogram(wav, sampling_rate=16000, window_step=10, window_length=25, n_fft=512):
    """
    Returns: (freq, time)
    """
    frames = np.abs(librosa.core.stft(
        wav,
        n_fft=n_fft,
        hop_length=int(sampling_rate * window_step / 1000),
        win_length=int(sampling_rate * window_length / 1000),
    ))
    
    return frames.astype(np.float32)

def adjust_duration(feature, partial_n_frames=10000):
    """
    Re-adapted from AutoSpeech again...
    """
    if feature.shape[0] <= partial_n_frames:
        start = 0
        while feature.shape[0] < partial_n_frames:
            feature = np.repeat(feature, 2, axis=0)
    else:
        start = np.random.randint(0, feature.shape[0] - partial_n_frames)
    end = start + partial_n_frames
    return feature[start:end]

In [195]:
ds_train = ASVDataset(feature_name='questo_train', is_train=True)

Dataset loaded from cache  cache_train_LA_questo_train.npy


In [199]:
ds_dev = ASVDataset(feature_name='questo_devvv', is_train=False)

In parse
flac LA_T_5163922 A03 spoof

['flac', 'LA_T_5163922', 'A03', 'spoof']
In parse
flac LA_T_4269497 A02 spoof

['flac', 'LA_T_4269497', 'A02', 'spoof']
In parse
flac LA_T_1036998 A02 spoof

['flac', 'LA_T_1036998', 'A02', 'spoof']
In parse
flac LA_T_3781714 A01 spoof

['flac', 'LA_T_3781714', 'A01', 'spoof']
In parse
flac LA_T_3051735 A06 spoof

['flac', 'LA_T_3051735', 'A06', 'spoof']
In parse
flac LA_T_5148309 A01 spoof

['flac', 'LA_T_5148309', 'A01', 'spoof']
In parse
flac LA_T_6143831 A01 spoof

['flac', 'LA_T_6143831', 'A01', 'spoof']
In parse
flac LA_T_8570091 A05 spoof

['flac', 'LA_T_8570091', 'A05', 'spoof']
In parse
flac LA_T_5261598 A05 spoof

['flac', 'LA_T_5261598', 'A05', 'spoof']
In parse
flac LA_T_8555100 A01 spoof

['flac', 'LA_T_8555100', 'A01', 'spoof']
In parse
flac LA_T_3141223 human human

['flac', 'LA_T_3141223', 'human', 'human']
In parse
flac LA_T_9039580 A02 spoof

['flac', 'LA_T_9039580', 'A02', 'spoof']
In parse
flac LA_T_1724630 A01 spoof

['flac', '

Dataset saved to cache  cache_dev_LA_questo_devvv.npy


In [201]:
ds_eval = ASVDataset(feature_name='questo_eval', is_train=False, is_eval=True)

In parse
flac LA_T_8211829 A06 spoof

In parse
flac LA_T_7155774 A01 spoof

In parse
flac LA_T_9073495 A04 spoof

In parse
flac LA_T_4209095 A04 spoof

In parse
flac LA_T_2946398 A01 spoof

In parse
flac LA_T_6246134 A03 spoof

In parse
flac LA_T_3780870 A04 spoof

In parse
flac LA_T_3965629 A06 spoof

In parse
flac LA_T_2088865 A02 spoof

In parse
flac LA_T_9580070 A05 spoof

In parse
flac LA_T_7698914 human human

In parse
flac LA_T_6655535 A02 spoof

In parse
flac LA_T_3164524 A01 spoof

In parse
flac LA_T_8185768 A06 spoof

In parse
flac LA_T_1054176 A03 spoof

In parse
flac LA_T_9162513 human human

In parse
flac LA_T_2834141 A04 spoof

In parse
flac LA_T_9493769 A06 spoof

In parse
flac LA_T_1659801 A03 spoof

In parse
flac LA_T_4294152 A02 spoof

In parse
flac LA_T_2217523 human human

In parse
flac LA_T_8318685 A03 spoof

In parse
flac LA_T_9366267 A04 spoof

In parse
flac LA_T_6927360 A01 spoof

In parse
flac LA_T_6639919 A03 spoof

In parse
flac LA_T_4408598 A01 spoof

In par

Dataset saved to cache  cache_eval_LA_questo_eval.npy


In [203]:
cv2.imshow('final', ds_eval[0][0]) 
cv2.waitKey(0)   
cv2.destroyAllWindows()  