We will analyze https://research.google.com/audioset///dataset/emergency_vehicle.html

# Preample

In [1]:
import json
import pandas as pd
import numpy as np
import youtube_dl
import os
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from multiprocessing import Process
import multiprocessing

# Load data

In [2]:
# paths

path_ontology = "/nfs/students/summer-term-2020/project-4/data/dataset1/ontology/ontology.json"
path_train_unbalanced = "/nfs/students/summer-term-2020/project-4/data/dataset1/audioSetCsv/unbalanced_train_segments.csv"
path_train_balanced = "/nfs/students/summer-term-2020/project-4/data/dataset1/audioSetCsv/balanced_train_segments.csv"
path_eval_balanced = "/nfs/students/summer-term-2020/project-4/data/dataset1/audioSetCsv/eval_segments.csv"

In [3]:
def loadJsonFile(path):
    if path[-5:] == '.json':
        return json.load(open(path))
    
def loadCsvFile(path):
    if path[-4:] == '.csv':
        names = ['YTID', 'start_seconds', 'end_seconds', 'positive_labels']
        return pd.read_csv(open(path), sep=', ', header=None, index_col=0, 
                           skiprows=3, names=names, engine='python')

In [4]:
ontoloy = loadJsonFile(path_ontology)
train_unbalanced = loadCsvFile(path_train_unbalanced)
train_balanced = loadCsvFile(path_train_balanced)
eval_balanced = loadCsvFile(path_eval_balanced)

# Analyzing ontology

## Find entry

In [5]:
for entry in ontoloy:
    if 'Motor vehicle' in entry['name']:
        print("\t " + entry['name'] + " (" + entry['id'] + ")")

	 Motor vehicle (road) (/m/012f08)


## Analyze ontology towards Emergency Vehicle dataset

In [6]:
def findChildren(ontology_id):
    result = []
    for entry in ontoloy:
        if entry['id'] == ontology_id:
            result.append(ontology_id)
            for child_id in entry['child_ids']:
                result.extend(findChildren(child_id))
    return result

In [7]:
positive_ids = []
negative1_ids = []
negative2_ids = []

# https://research.google.com/audioset///ontology/emergency_vehicle_1.html
positive_ids.extend(findChildren('/m/03j1ly')) 

# https://research.google.com/audioset///ontology/alarm_1.html
negative1_ids.extend([x for x in findChildren('/m/07pp_mv') if x not in positive_ids])

# https://research.google.com/audioset////ontology/motor_vehicle_road_1.html
negative2_ids.extend([x for x in findChildren('/m/012f08') if x not in positive_ids]) 

In [8]:
def printChildren(ontology_id, depth, list_of_ids):
    for entry in ontoloy:
        if entry['id'] == ontology_id and ontology_id in list_of_ids: 
            print(''.join(["\t" for _ in range(depth)]) + entry['name'] + " (" + entry['id'] + ")")
            for child_id in entry['child_ids']:
                printChildren(child_id, depth+1, list_of_ids)

In [9]:
print("Positive classes:")
# https://research.google.com/audioset///ontology/emergency_vehicle_1.html
printChildren('/m/03j1ly', 1, positive_ids)
    
print()
print("Negative classes 1:")
# https://research.google.com/audioset///ontology/alarm_1.html
printChildren('/m/07pp_mv', 1, negative1_ids)

print()
print("Negative classes 2:")
# https://research.google.com/audioset///ontology/alarm_1.html
printChildren('/m/012f08', 1, negative2_ids)

Positive classes:
	Emergency vehicle (/m/03j1ly)
		Police car (siren) (/m/04qvtq)
		Ambulance (siren) (/m/012n7d)
		Fire engine, fire truck (siren) (/m/012ndj)

Negative classes 1:
	Alarm (/m/07pp_mv)
		Telephone (/m/07cx4)
			Telephone bell ringing (/m/07pp8cl)
			Ringtone (/m/01hnzm)
			Cellphone buzz, vibrating alert (/m/01sb50)
			Telephone dialing, DTMF (/m/02c8p)
			Dial tone (/m/015jpf)
			Busy signal (/m/01z47d)
		Alarm clock (/m/046dlr)
		Siren (/m/03kmc9)
			Civil defense siren (/m/0dgbq)
		Doorbell (/m/03wwcy)
			Ding-dong (/m/07r67yg)
		Buzzer (/m/030rvx)
		Smoke detector, smoke alarm (/m/01y3hg)
		Fire alarm (/m/0c3f7m)
		Car alarm (/m/02mfyn)
		Vehicle horn, car horn, honking (/m/0912c9)
			Toot (/m/07qv_d5)
		Bicycle bell (/m/0gy1t2s)
		Air horn, truck horn (/m/05x_td)
		Foghorn (/m/04fq5q)
		Whistle (/m/0l156k)
			Kettle whistle (/g/11b630rrvh)
			Steam whistle (/m/06hck5)

Negative classes 2:
	Motor vehicle (road) (/m/012f08)
		Car (/m/0k4j)
			Vehicle horn, car horn, 

# Creating dataset

In [10]:
negative_ids = negative1_ids + negative2_ids

In [11]:
def binaryClassification(labels):
    containsPositiveLabel = False
    containsNegativeLabel = False
    
    for label in labels:
        if label in positive_ids:
            containsPositiveLabel = True
    
        if label in negative_ids:
            containsNegativeLabel = True
            
    return containsPositiveLabel, containsNegativeLabel

In [12]:
def isPositiveSample(labels):
    containsPositiveLabel, containsNegativeLabel = binaryClassification(labels)
    # don't exclude samples with negative labels, 
    # as EM sounds sometimes labeled as siren as well (we deal with multi-labeling here)
    return containsPositiveLabel

In [13]:
def isNegativeSample(labels):
    containsPositiveLabel, containsNegativeLabel = binaryClassification(labels)
    return not containsPositiveLabel and containsNegativeLabel

In [14]:
def getBinaryDatasetSamples(dataset):
    positives = pd.DataFrame(data=None, columns=dataset.columns)
    negatives = pd.DataFrame(data=None, columns=dataset.columns)

    for index, row in dataset.iterrows():
        labels = row['positive_labels'].split(',')

        isPositive = isPositiveSample(labels)
        isNegative = isNegativeSample(labels)

        if isPositive and not isNegative:
            positives.loc[index]=row
        elif not isPositive and isNegative:
            negatives.loc[index]=row
            
    return positives, negatives

In [15]:
train_unbalanced_positives, train_unbalanced_negatives = getBinaryDatasetSamples(train_unbalanced)
train_balanced_positives, train_balanced_negatives = getBinaryDatasetSamples(train_balanced)
eval_balanced_positives, eval_balanced_negatives = getBinaryDatasetSamples(eval_balanced)

In [16]:
print(len(train_unbalanced_negatives))
print(len(train_unbalanced_positives))
print()
print(len(train_balanced_negatives))
print(len(train_balanced_positives))
print()
print(len(eval_balanced_negatives))
print(len(eval_balanced_positives))

23852
4257

476
121

512
128


# Downloading

In [17]:
ydl_opts = {
    'quiet':'True',
    'format': 'bestaudio/best',
    'ignoreerrors':'True',
    'no_warnings':'True',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
        'preferredquality': '192',
    }],
    'outtmpl':'tmp/%(id)s.%(ext)s',
    }

In [18]:
def downloadDataset(dataset, save_path):
    for index, row in dataset.iterrows():
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download(['http://www.youtube.com/watch?v=' + index])

        name = index + '.wav'
        path_downloaded = './tmp/' + name
        path_final = save_path + name
        start = row[0]
        end = row[1]
        
        if os.path.exists(path_downloaded):
            ffmpeg_extract_subclip(path_downloaded, start, end,targetname=path_final)
            os.remove(path_downloaded)

In [19]:
def downloadDatasetParallelized(dataset, save_path):
    num_cpus = multiprocessing.cpu_count()
    listOfDfs = [dataset.loc[idx] for idx in np.array_split(dataset.index,num_cpus)]
    print("Mean chunk size: " + str(np.mean([len(x) for x in listOfDfs])))
    
    processes = []
    for chunk in listOfDfs:
        p = Process(target=downloadDataset, args=(chunk,save_path))
        processes.append(p)
        p.start()
        
    for p in processes:
        p.join()
        
    dataset.to_csv(save_path + "meta.csv")

In [20]:
path_train_unbalanced_negatives = "/nfs/students/summer-term-2020/project-4/data/dataset1/download/training_unbalanced/negative/"
path_train_unbalanced_positives = "/nfs/students/summer-term-2020/project-4/data/dataset1/download/training_unbalanced/positive/"

path_train_balanced_negatives = "/nfs/students/summer-term-2020/project-4/data/dataset1/download/training_balanced/negative/"
path_train_balanced_positives = "/nfs/students/summer-term-2020/project-4/data/dataset1/download/training_balanced/positive/"

path_eval_balanced_negatives = "/nfs/students/summer-term-2020/project-4/data/dataset1/download/eval_balanced/negative/"
path_eval_balanced_positives = "/nfs/students/summer-term-2020/project-4/data/dataset1/download/eval_balanced/positive/"

In [None]:
downloadDatasetParallelized(train_unbalanced_negatives, path_train_unbalanced_negatives)
downloadDatasetParallelized(train_unbalanced_positives, path_train_unbalanced_positives)

downloadDatasetParallelized(train_balanced_negatives, path_train_balanced_negatives)
downloadDatasetParallelized(train_balanced_positives, path_train_balanced_positives)

downloadDatasetParallelized(eval_balanced_negatives, path_eval_balanced_negatives)
downloadDatasetParallelized(eval_balanced_positives, path_eval_balanced_positives)