We will analyze https://research.google.com/audioset/dataset/emergency_vehicle.html

# Preamble

In [7]:
import json
import pandas as pd
import numpy as np
import os
import librosa
import IPython.display as ipd
import random
from sklearn.model_selection import train_test_split
import multiprocessing as mp
from tqdm import tqdm
from scipy.io import wavfile

import torch

In [5]:
# Reproducibility: seed NumPy's and Python's global random number generators.
# Note: the balancing and shuffling cells further down call random.seed(...)
# again right before they draw, so the value set here does not determine
# those two steps.
np.random.seed(1337)
random.seed(42)

# Train/Valid/Test split

In [3]:
# Download folders of the three source splits (unbalanced training, balanced
# training, balanced eval). Each split has a negative/ and a positive/ folder.
# All paths carry a trailing slash.
path_train_unbalanced_negatives = "/nfs/students/summer-term-2020/project-4/data/dataset2/download/training_unbalanced/negative/"
path_train_unbalanced_positives = "/nfs/students/summer-term-2020/project-4/data/dataset2/download/training_unbalanced/positive/"

path_train_balanced_negatives = "/nfs/students/summer-term-2020/project-4/data/dataset2/download/training_balanced/negative/"
path_train_balanced_positives = "/nfs/students/summer-term-2020/project-4/data/dataset2/download/training_balanced/positive/"

path_eval_balanced_negatives = "/nfs/students/summer-term-2020/project-4/data/dataset2/download/eval_balanced/negative/"
path_eval_balanced_positives = "/nfs/students/summer-term-2020/project-4/data/dataset2/download/eval_balanced/positive/"

In [4]:
# Group the folders by class, regardless of which split they come from.
# The order matters for de-duplication in mergeSources: the first folder that
# contains a given file name wins.
positive_class_paths = [path_train_unbalanced_positives,
                        path_train_balanced_positives,
                        path_eval_balanced_positives]

negative_class_paths = [path_train_unbalanced_negatives,
                        path_train_balanced_negatives,
                        path_eval_balanced_negatives]

In [5]:
def mergeSources(paths):
    """Collect the files of several source directories into one list.

    Only the top level of each directory is scanned (no recursion). The
    ``meta.csv`` file is skipped. When the same file name shows up in more
    than one directory, only its first occurrence (in the order of ``paths``)
    is kept.

    Args:
        paths: Iterable of directory paths, with or without a trailing slash.

    Returns:
        List of file paths, one per unique file name.

    Raises:
        StopIteration: If ``os.walk`` yields nothing for a directory
            (e.g. the directory does not exist).
    """
    seen_filenames = set()  # set membership keeps the duplicate check O(1)
    class_paths = []

    for path in paths:
        dirpath, _, filenames = next(os.walk(path))

        for filename in filenames:
            if filename == 'meta.csv' or filename in seen_filenames:
                continue

            seen_filenames.add(filename)
            # os.path.join also works when the directory has no trailing slash
            class_paths.append(os.path.join(dirpath, filename))

    return class_paths

In [6]:
positive_class = mergeSources(positive_class_paths)
negative_class = mergeSources(negative_class_paths)

In [7]:
print(len(positive_class))
print(len(negative_class))

12866
7803


In [8]:
# Remove bad samples (we observed some files with zero length or only zeros as content).
def badSample(path):
    """Tell whether the WAV file at ``path`` is unusable.

    A file counts as bad when it decodes to no samples at all or to samples
    that are all zero.

    Files that ``scipy.io.wavfile`` fails to read are reported as *not* bad
    (best effort, same as before): a read error here does not remove the file
    from the dataset.

    Args:
        path: Path of the WAV file to inspect.

    Returns:
        True if the file is empty or silent, False otherwise.
    """
    try:
        _, data = wavfile.read(path)
    except Exception:
        # Catch ordinary errors only. A bare `except:` would also swallow
        # KeyboardInterrupt, so the long filtering loop could not be stopped.
        return False

    data = np.asarray(data)
    zero_length = data.size == 0
    all_elements_zero = not np.any(data)
    return zero_length or all_elements_zero

In [17]:
def getGoodSamples(samples):
    """Return the entries of ``samples`` that ``badSample`` does not flag.

    The input order is preserved and progress is reported through ``tqdm``.
    """
    return [candidate for candidate in tqdm(samples) if not badSample(candidate)]

In [None]:
quality_checked_negative_class = getGoodSamples(negative_class)

 72%|███████▏  | 5626/7803 [05:16<02:14, 16.15it/s]

In [16]:
print(len(quality_checked_negative_class))

7801


In [14]:
quality_checked_positive_class = getGoodSamples(positive_class)

100%|██████████| 12866/12866 [10:57<00:00, 19.57it/s]


In [15]:
print(len(quality_checked_positive_class))

12864


In [18]:
# balancing positive class
random.seed(24)
balanced_class_size = len(quality_checked_negative_class)
balanced_positive_class = random.sample(quality_checked_positive_class, balanced_class_size)

In [19]:
# shuffle for similar distribution properties
random.seed(1337)
full_dataset = balanced_positive_class+quality_checked_negative_class
final_dataset = random.sample(full_dataset, len(full_dataset))

In [20]:
len(final_dataset)

15602

In [21]:
# final split
# Cut the shuffled list at 60 % and 80 %: the first 60 % is train, the next
# 20 % is validation, the rest is test. np.split returns NumPy arrays, so the
# three results are arrays of path strings rather than Python lists.
train_paths, valid_paths, test_paths = np.split(final_dataset, [int(.6*len(final_dataset)), int(.8*len(final_dataset))])

In [23]:
print(len(train_paths))
print(len(valid_paths))
print(len(test_paths))

9361
3120
3121


# Obtain metadata

In [25]:
def loadJsonFile(path):
    """Parse the JSON file at ``path``.

    Args:
        path: File path as a string.

    Returns:
        The decoded JSON content, or None when ``path`` does not end with
        '.json' (the file is not opened in that case).
    """
    if not path.endswith('.json'):
        return None

    # The context manager closes the handle even when parsing fails.
    with open(path) as json_file:
        return json.load(json_file)

path_ontology = "/nfs/students/summer-term-2020/project-4/data/audioset/ontology/ontology.json"
ontology = loadJsonFile(path_ontology)

In [26]:
# Lookup table from ontology id (e.g. '/m/026t6') to its human-readable name.
ontology_id_name_mapping = {node['id']: node['name'] for node in ontology}

In [27]:
def loadMetaFiles(paths):
    """Load the ``meta.csv`` of every directory in ``paths`` into one DataFrame.

    Only the top level of each directory is inspected. Directories without a
    ``meta.csv`` are skipped. The first CSV column becomes the index.

    Args:
        paths: Iterable of directory paths, with or without a trailing slash.

    Returns:
        The row-wise concatenation of all loaded metadata tables.

    Raises:
        ValueError: If none of the directories contains a ``meta.csv``.
        StopIteration: If ``os.walk`` yields nothing for a directory
            (e.g. the directory does not exist).
    """
    paths = list(paths)
    meta_files = []

    for path in paths:
        dirpath, _, filenames = next(os.walk(path))
        if 'meta.csv' in filenames:
            # os.path.join also works when the directory has no trailing slash
            meta_files.append(pd.read_csv(os.path.join(dirpath, 'meta.csv'), index_col=0))

    if not meta_files:
        # pd.concat would raise ValueError as well, but without naming the cause
        raise ValueError("no meta.csv found in any of: " + ", ".join(map(str, paths)))

    return pd.concat(meta_files, axis=0)

In [28]:
df_meta = loadMetaFiles(positive_class_paths + negative_class_paths)

In [29]:
df_meta.head(1)

Unnamed: 0,start_seconds,end_seconds,positive_labels
j2SKCBQGPXo,30.0,40.0,"""/m/026t6,/m/04rlf,/m/04szw,/m/0cfdd"""


In [30]:
ontology_id_name_mapping['/m/026t6']

'Drum'

In [31]:
def createFinalDataset(datasetFiles):
    """Build one metadata record per audio file.

    Relies on two module-level objects: ``df_meta`` (metadata table indexed by
    YouTube id) and ``ontology_id_name_mapping`` (ontology id -> name).

    Args:
        datasetFiles: Iterable of file paths of the form
            ``.../<source>/<binary_class>/<youtube_id>.wav``.

    Returns:
        List of dicts with the keys ``youtube_id``, ``start_seconds``,
        ``end_seconds``, ``binary_class``, ``labels``, ``label_names``,
        ``source`` and ``path``. All values are plain Python types, so the
        result can be passed to ``json.dumps`` directly.

    Raises:
        KeyError: If a file has no row in ``df_meta`` or carries a label id
            that is missing from ``ontology_id_name_mapping``.
    """
    final_set = []

    for filePath in datasetFiles:
        # Paths may arrive as NumPy strings (e.g. from np.split); store plain str.
        filePath = str(filePath)

        # (...)/dataset1/download/training_unbalanced/negative/9kHMnPosPzw.wav'
        parts = filePath.split('/')
        name = os.path.splitext(parts[-1])[0]
        classification = parts[-2]
        source = parts[-3]

        # Look the row up once. If the id occurs several times in df_meta,
        # .loc returns a DataFrame; use its first row in that case.
        meta = df_meta.loc[name]
        if isinstance(meta, pd.DataFrame):
            meta = meta.iloc[0]

        # see samples of df_meta and ontology_id_name_mapping above:
        # the label list is stored with surrounding double quotes
        labels = meta['positive_labels'].strip().strip('"').split(',')
        label_names = [ontology_id_name_mapping[label] for label in labels]

        sample = {'youtube_id': name,
                  'start_seconds': float(meta['start_seconds']),
                  'end_seconds': float(meta['end_seconds']),
                  'binary_class': classification,
                  'labels': labels,
                  'label_names': label_names,
                  'source': source,
                  'path': filePath}

        final_set.append(sample)

    return final_set

In [32]:
train = createFinalDataset(train_paths)
valid = createFinalDataset(valid_paths)
test = createFinalDataset(test_paths)

In [33]:
# negative sample
train[0]

{'youtube_id': 'NnDaIk0xIMU',
 'start_seconds': 10.0,
 'end_seconds': 20.0,
 'binary_class': 'negative',
 'labels': ['/m/034srq',
  '/m/03m9d0z',
  '/m/05kq4',
  '/m/06mb1',
  '/m/0jb2l',
  '/m/0ngt1',
  '/t/dd00038',
  '/t/dd00092'],
 'label_names': ['Waves, surf',
  'Wind',
  'Ocean',
  'Rain',
  'Thunderstorm',
  'Thunder',
  'Rain on surface',
  'Wind noise (microphone)'],
 'source': 'training_unbalanced',
 'path': '/nfs/students/summer-term-2020/project-4/data/dataset2/download/training_unbalanced/negative/NnDaIk0xIMU.wav'}

In [37]:
# positive sample
train[4]

{'youtube_id': 'X1ifV2yQLOg',
 'start_seconds': 270.0,
 'end_seconds': 280.0,
 'binary_class': 'positive',
 'labels': ['/m/02qldy', '/m/04rlf', '/m/06j64v', '/m/09x0r'],
 'label_names': ['Narration, monologue',
  'Music',
  'Middle Eastern music',
  'Speech'],
 'source': 'eval_balanced',
 'path': '/nfs/students/summer-term-2020/project-4/data/dataset2/download/eval_balanced/positive/X1ifV2yQLOg.wav'}

# Save dataset

In [38]:
path_training_paths = "/nfs/students/summer-term-2020/project-4/data/dataset2/finalDataset/training.json"
path_validation_paths = "/nfs/students/summer-term-2020/project-4/data/dataset2/finalDataset/validation.json"
path_testing_paths = "/nfs/students/summer-term-2020/project-4/data/dataset2/finalDataset/testing.json"

In [39]:
def saveDataset(dataset, path):
    """Write ``dataset`` to ``path`` as JSON, replacing any existing file.

    Args:
        dataset: JSON-serialisable object (here: a list of sample dicts).
        path: Destination file path.

    Raises:
        TypeError: If ``dataset`` contains values ``json.dumps`` cannot encode.
            The destination file is left untouched in that case.
    """
    # Serialise first: a failure then happens before the target is opened
    # (opening with "w" truncates it).
    payload = json.dumps(dataset)

    # The context manager closes the handle even if writing fails.
    with open(path, "w") as f:
        f.write(payload)

In [40]:
saveDataset(train, path_training_paths)
saveDataset(valid, path_validation_paths)
saveDataset(test,  path_testing_paths)

# Create resampled version with fixed sample rate

Note: this section involves manual work. Before running it, adjust the input files/paths, the target sample rate, and the output directory to match.

In [1]:
FIXED_SAMPLE_RATE = 8000

In [2]:
DATA_PATH = "/nfs/students/summer-term-2020/project-4/data/dataset2/finalDataset/"
DATA_FILES = ["training.json", "validation.json", "testing.json"]

In [5]:
def getJSON(path):
    """Read the file at ``path`` and return its decoded JSON content."""
    handle = open(path )
    try:
        return json.load(handle)
    finally:
        # always release the file, whether or not decoding succeeded
        handle.close()
    
training, validation, testing = (getJSON(DATA_PATH + DATA_FILES[i]) for i in range(3))

In [None]:
# Load every clip of each split resampled to FIXED_SAMPLE_RATE and attach it to
# its sample under 'data' as a (waveform tensor, sample rate) tuple.
# One tqdm progress bar is shown per split.
for split in (training, validation, testing):
    for sample in tqdm(split):
        waveform = librosa.load(sample['path'], sr=FIXED_SAMPLE_RATE)[0]
        sample['data'] = (torch.tensor(waveform), FIXED_SAMPLE_RATE)

100%|██████████| 9361/9361 [1:00:46<00:00,  2.57it/s]
100%|██████████| 3120/3120 [20:03<00:00,  2.59it/s]
  3%|▎         | 100/3121 [00:43<18:28,  2.73it/s]

In [None]:
# Persist the three splits (sample metadata plus the loaded waveforms) with torch.save.
pickle_path = "/nfs/students/summer-term-2020/project-4/data/dataset2/dataset_8k/"
for file_name, split in (("training.pt", training),
                         ("validation.pt", validation),
                         ("testing.pt", testing)):
    # `with` closes each output file as soon as torch.save returns, instead of
    # leaving the handle open until it is garbage-collected.
    with open(pickle_path + file_name, "wb") as pt_file:
        torch.save(split, pt_file)