We will analyze https://research.google.com/audioset/dataset/emergency_vehicle.html

# Preamble

In [1]:
import json
import pandas as pd
import numpy as np
import os
import librosa
import IPython.display as ipd
import random
from sklearn.model_selection import train_test_split
import multiprocessing as mp
from tqdm import tqdm
from scipy.io import wavfile

In [2]:
# reproducibility
np.random.seed(1337)
random.seed(42)

# Create interpolation dataset

In [12]:
path_data = "/nfs/students/summer-term-2020/project-4/data/interpolation_data/download/"

In [14]:
def mergeSources(paths):
    """Collect the files found directly inside the given directories.

    Only the top level of each directory is scanned. Files named
    ``meta.csv`` are skipped. When the same file name occurs in more than
    one directory, only the first occurrence (in ``paths`` order) is kept.

    Args:
        paths: Iterable of directory paths, with or without a trailing
            path separator.

    Returns:
        List of full file paths, one per unique file name.
    """
    # A set makes the duplicate check O(1) instead of scanning a list.
    seen_filenames = set()
    class_paths = []

    for path in paths:
        (dirpath, dirnames, filenames) = next(os.walk(path))

        for filename in filenames:
            if filename == 'meta.csv' or filename in seen_filenames:
                continue

            seen_filenames.add(filename)
            # os.path.join inserts the separator when `dirpath` lacks a
            # trailing one; plain concatenation produced a wrong path then.
            class_paths.append(os.path.join(dirpath, filename))

    return class_paths

In [15]:
sound_files = mergeSources([path_data])

In [16]:
print(len(sound_files))

1025


In [19]:
# remove bad samples (we observed some files with zero length or only zeros as content)
def badSample(path):
    """Return True if the WAV file at ``path`` is empty or contains only zeros.

    Args:
        path: Path of the WAV file to inspect.

    Returns:
        True when the decoded data has length zero or every element is
        zero; False otherwise, including when the file cannot be read.
    """
    try:
        _, data = wavfile.read(path)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long-running scan.
        # NOTE(review): a file that cannot be read is reported as "not bad"
        # and therefore kept by callers - confirm this is intended.
        return False
    return len(data) == 0 or not np.any(data)

In [20]:
def getGoodNegatives(negatives):
    """Filter ``negatives`` down to the paths that ``badSample`` does not reject.

    Progress is reported through ``tqdm`` while the paths are checked.

    Args:
        negatives: Iterable of audio file paths.

    Returns:
        List of the paths for which ``badSample`` returned a falsy value,
        in their original order.
    """
    return [candidate for candidate in tqdm(negatives) if not badSample(candidate)]

In [21]:
quality_checked_sounds = getGoodNegatives(sound_files)

100%|██████████| 1025/1025 [00:09<00:00, 111.41it/s]


In [22]:
print(len(quality_checked_sounds))

1025


# Obtain metadata

In [23]:
def loadJsonFile(path):
    """Parse the JSON file at ``path``.

    Args:
        path: File path as a string.

    Returns:
        The decoded JSON content, or None when ``path`` does not end in
        ``.json`` (the file is not opened in that case).
    """
    if not path.endswith('.json'):
        return None
    # The context manager closes the handle even if parsing fails.
    with open(path) as json_file:
        return json.load(json_file)

path_ontology = "/nfs/students/summer-term-2020/project-4/data/dataset1/ontology/ontology.json"
ontology = loadJsonFile(path_ontology)

In [24]:
# Lookup table from an ontology entry's 'id' to its 'name'.
ontology_id_name_mapping = {node['id']: node['name'] for node in ontology}

In [25]:
def loadMetaFiles(paths):
    """Load and concatenate the ``meta.csv`` file of every given directory.

    Only the top level of each directory is inspected; directories without
    a ``meta.csv`` contribute nothing.

    Args:
        paths: Iterable of directory paths, with or without a trailing
            path separator.

    Returns:
        A DataFrame with the rows of all metadata files stacked, indexed by
        the first CSV column.

    Raises:
        ValueError: Raised by ``pd.concat`` when no ``meta.csv`` was found.
    """
    meta_files = []
    for path in paths:
        (dirpath, dirnames, filenames) = next(os.walk(path))
        if 'meta.csv' in filenames:
            # os.path.join keeps the path valid when `dirpath` has no
            # trailing separator.
            meta_files.append(pd.read_csv(os.path.join(dirpath, 'meta.csv'), index_col=0))

    return pd.concat(meta_files, axis=0)

In [26]:
df_meta = loadMetaFiles(['/nfs/students/summer-term-2020/project-4/data/interpolation_data/download/'])

In [27]:
df_meta.head(1)

Unnamed: 0,start_seconds,end_seconds,positive_labels,super_category
1_iz3BvTJug,30.0,40.0,"""/m/04rlf,/m/07s8j8t,/m/07sbbz2,/m/09x0r""",/m/04rlf


In [11]:
ontology_id_name_mapping['/m/012ndj']

'Fire engine, fire truck (siren)'

# Sort out invalid files (those that failed to download)

In [178]:
# ensure all files are actually downloaded (sometimes download may fail due to unavailable/blocked videos etc)
def getName(path):
    """Return the last '/'-separated component of ``path`` with '.wav' removed."""
    return path.split("/")[-1].replace('.wav', '')
names = [getName(path) for path in quality_checked_sounds]

# A set makes each membership test O(1); checking against the `names` list
# rescanned it for every metadata row.
downloaded_names = set(names)
not_downloaded = [index for index in df_meta.index if index not in downloaded_names]
print("Failed to download: " + str(len(not_downloaded)))

# Keep only the metadata rows that have a usable audio file.
df_meta_valid = df_meta.drop(not_downloaded)

Failed to download: 122


# Remove those that are in valid/test

In [214]:
path_validation_paths = "/nfs/students/summer-term-2020/project-4/data/dataset1/finalDataset/validation.json"
path_testing_paths = "/nfs/students/summer-term-2020/project-4/data/dataset1/finalDataset/testing.json"

In [227]:
# Load the validation and testing splits; `with` closes each file once it
# has been parsed instead of leaving the handle to the garbage collector.
validTestData = []
for split_path in (path_validation_paths, path_testing_paths):
    with open(split_path) as split_file:
        validTestData = validTestData + json.load(split_file)
# Clip name = file name of the stored 'path' with '.wav' removed.
validTestNames = [x['path'].split("/")[-1].replace(".wav", "") for x in validTestData]

In [230]:
# Clips that already appear in the validation/testing splits must not be
# reused here. A set makes each membership test O(1) instead of scanning
# the `validTestNames` list for every metadata row.
valid_test_name_set = set(validTestNames)
notValid = [index for index in df_meta_valid.index if index in valid_test_name_set]
print("Not valid: " + str(len(notValid)))

df_meta_valid = df_meta_valid.drop(notValid)

Not valid: 28


# Create dataset

In [235]:
# Seed for the per-category sampling below.
# NOTE(review): `random_num` was not defined anywhere in the saved notebook, so
# this cell raised NameError on a fresh kernel. 42 is a placeholder - replace it
# with the original seed if it is known and the recorded sample must be reproduced.
random_num = 42
# Draw 100 rows from every `super_category` group; `sample` raises if a group
# holds fewer than 100 rows.
final_dataset_meta = df_meta_valid.groupby('super_category', as_index=False).apply(lambda x: x.sample(100, random_state=random_num))

In [236]:
final_dataset_meta.head()

Unnamed: 0,Unnamed: 1,start_seconds,end_seconds,positive_labels,super_category
0,6cX2K0UTgjU,540.0,550.0,"""/m/05x_td,/m/07qv_d5,/m/0912c9,/m/09x0r""",/m/012f08
0,cERFUmgIQS0,30.0,40.0,"""/m/02rhddq,/m/07r04,/m/07yv9""",/m/012f08
0,J6rKfrIq9Ds,100.0,110.0,"""/m/07q2z82,/m/07yv9,/m/0k4j,/t/dd00066,/t/dd0...",/m/012f08
0,SE1ik9fxks4,90.0,100.0,"""/m/07q2z82,/m/07yv9,/m/0btp2,/m/0k4j,/m/0ltv""",/m/012f08
0,bJ424-lPwV4,120.0,130.0,"""/m/07yv9,/m/0h9mv,/m/0k4j""",/m/012f08


In [238]:
final_dataset_meta.shape

(600, 4)

In [239]:
final_dataset_meta.loc[(0, 'gH9or2aQqOg')]

start_seconds                                         300
end_seconds                                           310
positive_labels    "/m/012f08,/m/07r04,/m/07yv9,/m/0btp2"
super_category                                  /m/012f08
Name: (0, gH9or2aQqOg), dtype: object

In [240]:
def findPath(name):
    """Return the path in ``quality_checked_sounds`` whose file stem is ``name``.

    The comparison is made against the file name without its extension, so
    a name that merely occurs inside a directory component or inside a
    longer file name no longer matches.

    Args:
        name: File name without extension (the clip id).

    Returns:
        The first matching path, or None when no file has that stem.
    """
    for path in quality_checked_sounds:
        stem = os.path.splitext(os.path.basename(path))[0]
        if stem == name:
            return path
    return None

In [241]:
final_dataset = []

# Build one plain-dict record per sampled clip. The index of
# `final_dataset_meta` is a (group number, clip id) pair, so the clip id is
# the second element.
for index, row in final_dataset_meta.iterrows():
    name = index[1]
    filePath = findPath(name)

    # Fetch the row once instead of repeating the same `.loc` lookup for
    # every field.
    meta_row = final_dataset_meta.loc[index]

    # `positive_labels` is stored wrapped in literal double quotes; strip
    # them before splitting the comma-separated ontology ids.
    labels = meta_row['positive_labels'][1:-1].split(',')
    label_names = [ontology_id_name_mapping[label] for label in labels]

    sample = {'youtube_id': name,
              'start_seconds': meta_row['start_seconds'],
              'end_seconds': meta_row['end_seconds'],
              'category': ontology_id_name_mapping[meta_row['super_category']],
              'labels': labels,
              'label_names': label_names,
              'path': filePath}

    final_dataset.append(sample)

In [242]:
# sample
final_dataset[-1]

{'youtube_id': '_3RHrZDarB4',
 'start_seconds': 30.0,
 'end_seconds': 40.0,
 'category': 'Animal',
 'labels': ['/m/068hy', '/m/07qf0zm', '/m/0bt9lr'],
 'label_names': ['Domestic animals, pets', 'Howl', 'Dog'],
 'path': '/nfs/students/summer-term-2020/project-4/data/interpolation_data/download/_3RHrZDarB4.wav'}

In [243]:
len(final_dataset)

600

# Save dataset

In [244]:
def saveDataset(dataset, path):
    """Write ``dataset`` to ``path`` as JSON.

    Args:
        dataset: JSON-serializable object.
        path: Destination file path; an existing file is overwritten.

    Raises:
        TypeError: If ``dataset`` contains values ``json.dumps`` cannot
            encode. The destination file is left untouched in that case.
    """
    # Serialize before opening so a failure cannot truncate an existing file.
    serialized = json.dumps(dataset)
    # The context manager flushes and closes the file even if writing fails.
    with open(path, "w") as f:
        f.write(serialized)

In [245]:
interpolation_data_path= "/nfs/students/summer-term-2020/project-4/data/interpolation_data/interpolation_data.json"

In [246]:
saveDataset(final_dataset, interpolation_data_path)

# Create resampled versions with fixed sample rates

In [252]:
import copy
import pickle

In [248]:
dataset_48k = copy.deepcopy(final_dataset)
dataset_8k = copy.deepcopy(final_dataset)

In [249]:
# Load every clip with a 48000 Hz target sample rate and store the result
# under 'data'.
# NOTE(review): per the librosa docs, `librosa.load` returns an
# (audio, sample_rate) tuple, so 'data' holds that tuple rather than the bare
# waveform - confirm downstream consumers expect this.
for sample in dataset_48k:
    sample['data'] = librosa.load(sample['path'], sr=48000)

In [250]:
# Load every clip with an 8000 Hz target sample rate and store the result
# under 'data'.
# NOTE(review): per the librosa docs, `librosa.load` returns an
# (audio, sample_rate) tuple, so 'data' holds that tuple rather than the bare
# waveform - confirm downstream consumers expect this.
for sample in dataset_8k:
    sample['data'] = librosa.load(sample['path'], sr=8000)

In [253]:
pickle_path = "/nfs/students/summer-term-2020/project-4/data/interpolation_data/data_48k/"
# `with` guarantees the pickle is flushed and the handle closed, even if
# serialization fails part-way.
with open(pickle_path + "interpolation_data_downsampled.p", "wb") as pickle_file:
    pickle.dump(dataset_48k, pickle_file)

In [254]:
pickle_path = "/nfs/students/summer-term-2020/project-4/data/interpolation_data/data_8k/"
# `with` guarantees the pickle is flushed and the handle closed, even if
# serialization fails part-way.
with open(pickle_path + "interpolation_data_downsampled.p", "wb") as pickle_file:
    pickle.dump(dataset_8k, pickle_file)