# Preamble

In [1]:
import json
import pandas as pd
import numpy as np
import youtube_dl
import os
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
from multiprocessing import Process
import multiprocessing

# Load data

In [2]:
# Root directory of the local AudioSet copy; every path below is derived from it.
audioset_path = "/nfs/students/summer-term-2020/project-4/data/audioset/"

# Ontology description (JSON).
path_ontology = os.path.join(audioset_path, "ontology/ontology.json")

# Segment lists (CSV): unbalanced training, balanced training and evaluation.
path_train_unbalanced = os.path.join(audioset_path, "audioSetCsv/unbalanced_train_segments.csv")
path_train_balanced = os.path.join(audioset_path, "audioSetCsv/balanced_train_segments.csv")
path_eval_balanced = os.path.join(audioset_path, "audioSetCsv/eval_segments.csv")

In [3]:
def loadJsonFile(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    Parameters
    ----------
    path : str
        Location of the file. Only names ending in ``.json`` are read.

    Returns
    -------
    object or None
        The decoded JSON content, or ``None`` when ``path`` does not end
        in ``.json`` (the file is not opened in that case).
    """
    if not path.endswith('.json'):
        return None

    # The context manager closes the handle even if parsing raises.
    with open(path) as handle:
        return json.load(handle)
    
def loadCsvFile(path):
    """Read a segment CSV file into a DataFrame indexed by ``YTID``.

    The first three lines of the file are skipped and the remaining lines are
    split on the two-character separator ``', '`` into the columns
    ``YTID`` (used as index), ``start_seconds``, ``end_seconds`` and
    ``positive_labels``.

    Parameters
    ----------
    path : str
        Location of the file. Only names ending in ``.csv`` are read.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` when ``path`` does not end in ``.csv``.
    """
    if not path.endswith('.csv'):
        return None

    names = ['YTID', 'start_seconds', 'end_seconds', 'positive_labels']
    # NOTE(review): a multi-character separator is handled as a regular
    # expression by the python engine, and pandas documents that regex
    # delimiters tend to ignore quoting - quoted label fields may therefore
    # keep their quote characters. Verify against the actual files.
    # The context manager closes the handle once parsing is finished.
    with open(path) as handle:
        return pd.read_csv(handle, sep=', ', header=None, index_col=0,
                           skiprows=3, names=names, engine='python')

In [4]:
# Ontology entries. The (misspelt) name `ontoloy` is kept because the
# ontology helper functions and loops in this notebook read it as a global.
ontoloy = loadJsonFile(path_ontology)

# Segment tables for the three splits, loaded in the order
# unbalanced train, balanced train, eval.
train_unbalanced, train_balanced, eval_balanced = [
    loadCsvFile(csv_path)
    for csv_path in (path_train_unbalanced, path_train_balanced, path_eval_balanced)
]

# Analyzing ontology

## Find entry

In [5]:
# List every ontology entry whose name contains "Music", together with its id.
for entry in ontoloy:
    if 'Music' not in entry['name']:
        continue
    print("\t " + entry['name'] + " (" + entry['id'] + ")")

	 Music (/m/04rlf)
	 Musical instrument (/m/04szw)
	 Musical ensemble (/m/05229)
	 Music genre (/m/0kpv1t)
	 Music of Latin America (/m/0g293)
	 Music for children (/m/05fw6t)
	 Music of Africa (/m/0164x2)
	 Music of Asia (/m/028sqc)
	 Music of Bollywood (/m/0dq0md)
	 Musical concepts (/t/dd00027)
	 Musical note (/m/05jcn)
	 Music role (/t/dd00028)
	 Music mood (/t/dd00030)


## Analyze ontology for the Music vs. Natural sounds dataset

In [6]:
def findChildren(ontology_id):
    """Return ``ontology_id`` followed by the ids of all its descendants.

    The ids are listed in depth-first pre-order, following each entry's
    ``child_ids`` in the global ``ontoloy`` list. An id reachable through
    several parents appears once per path. Ids that have no entry in the
    ontology are left out.

    Parameters
    ----------
    ontology_id : str
        Id of the entry at which the traversal starts.

    Returns
    -------
    list of str
        The collected ids; empty when ``ontology_id`` is not in the ontology.
    """
    # Index the ontology once instead of rescanning the whole list for
    # every visited node.
    entries_by_id = {entry['id']: entry for entry in ontoloy}
    result = []

    def _collect(node_id):
        entry = entries_by_id.get(node_id)
        if entry is None:
            return
        result.append(node_id)
        for child_id in entry['child_ids']:
            _collect(child_id)

    _collect(ontology_id)
    return result

In [7]:
# Positive classes: the '/m/04rlf' (Music) entry and everything below it.
# https://research.google.com/audioset/ontology/music_1.html
positive_ids = findChildren('/m/04rlf')

# Negative candidates: the '/m/059j3w' subtree, without any id that is
# already part of the positive classes.
# https://research.google.com/audioset/ontology/natural_sounds_1.html
negative1_ids = [
    ontology_id
    for ontology_id in findChildren('/m/059j3w')
    if ontology_id not in positive_ids
]

In [8]:
def printChildren(ontology_id, depth, list_of_ids):
    """Print the subtree rooted at ``ontology_id`` as a tab-indented outline.

    An entry is printed as ``<name> (<id>)`` preceded by ``depth`` tab
    characters, and only when its id is contained in ``list_of_ids``; the
    children of an entry that is not printed are not visited. Children are
    printed one level deeper. Entries are looked up in the global ``ontoloy``.

    Parameters
    ----------
    ontology_id : str
        Id of the entry to print.
    depth : int
        Number of leading tab characters for this entry.
    list_of_ids : collection of str
        Ids that are allowed to be printed.
    """
    if ontology_id not in list_of_ids:
        return

    indent = "\t" * depth
    for entry in ontoloy:
        if entry['id'] != ontology_id:
            continue
        print(indent + entry['name'] + " (" + entry['id'] + ")")
        for child_id in entry['child_ids']:
            printChildren(child_id, depth + 1, list_of_ids)

In [9]:
# Outline of the positive classes, starting at '/m/04rlf'.
# https://research.google.com/audioset/ontology/music_1.html
print("Positive classes:")
printChildren('/m/04rlf', 1, positive_ids)

# Outline of the negative candidates, starting at '/m/059j3w'. The empty
# first argument produces the blank separator line before the heading.
# https://research.google.com/audioset/ontology/natural_sounds_1.html
print("", "Negative classes 1:", sep="\n")
printChildren('/m/059j3w', 1, negative1_ids)

Positive classes:
	Music (/m/04rlf)
		Musical instrument (/m/04szw)
			Plucked string instrument (/m/0fx80y)
				Guitar (/m/0342h)
					Electric guitar (/m/02sgy)
					Bass guitar (/m/018vs)
					Acoustic guitar (/m/042v_gx)
					Steel guitar, slide guitar (/m/06w87)
					Tapping (guitar technique) (/m/01glhc)
					Strum (/m/07s0s5r)
				Banjo (/m/018j2)
				Sitar (/m/0jtg0)
				Mandolin (/m/04rzd)
				Zither (/m/01bns_)
				Ukulele (/m/07xzm)
			Keyboard (musical) (/m/05148p4)
				Piano (/m/05r5c)
					Electric piano (/m/01s0ps)
						Clavinet (/m/025cbm)
						Rhodes piano (/m/0bxl5)
				Organ (/m/013y1f)
					Electronic organ (/m/03xq_f)
					Hammond organ (/m/03gvt)
				Synthesizer (/m/0l14qv)
					Sampler (/m/01v1d8)
					Mellotron (/m/0gkd1)
				Harpsichord (/m/03q5t)
			Percussion (/m/0l14md)
				Drum kit (/m/02hnl)
					Drum machine (/m/0cfdd)
				Drum (/m/026t6)
					Snare drum (/m/06rvn)
						Rimshot (/m/03t3fj)
						Drum roll (/m/02k_mr)
					Bass drum (/m/0bm02)
					Timp

# Creating dataset

In [10]:
# Ids treated as negative classes by binaryClassification. This is an alias of
# negative1_ids (the same list object, not a copy).
negative_ids = negative1_ids

In [11]:
def binaryClassification(labels):
    """Report whether ``labels`` hit the positive and/or the negative classes.

    Parameters
    ----------
    labels : iterable of str
        Ontology ids attached to one sample.

    Returns
    -------
    tuple of (bool, bool)
        ``(has_positive, has_negative)``: whether at least one label is in
        the global ``positive_ids`` and whether at least one label is in the
        global ``negative_ids``.
    """
    has_positive = False
    has_negative = False

    # Single pass over the labels; once a flag is set it stays set.
    for label in labels:
        has_positive = has_positive or label in positive_ids
        has_negative = has_negative or label in negative_ids

    return has_positive, has_negative

In [12]:
def isPositiveSample(labels):
    """Return True when at least one of ``labels`` is a positive class.

    Samples that additionally carry negative labels still count as positive:
    the data is multi-labelled, so the negative flag is deliberately ignored.

    Parameters
    ----------
    labels : iterable of str
        Ontology ids attached to one sample.

    Returns
    -------
    bool
        The positive flag reported by ``binaryClassification``.
    """
    has_positive, _ = binaryClassification(labels)
    return has_positive

In [13]:
def isNegativeSample(labels):
    """Return True when ``labels`` hit a negative class and no positive class.

    Parameters
    ----------
    labels : iterable of str
        Ontology ids attached to one sample.

    Returns
    -------
    bool
        True only if ``binaryClassification`` reports a negative label and
        no positive label.
    """
    has_positive, has_negative = binaryClassification(labels)
    if has_positive:
        return False
    return has_negative

In [14]:
def getBinaryDatasetSamples(dataset):
    """Split ``dataset`` into positive and negative samples.

    Each row's ``positive_labels`` string is split on ``','`` and classified
    with ``isPositiveSample`` / ``isNegativeSample``. Rows that are neither
    positive nor negative are dropped.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with a ``positive_labels`` column holding comma-separated
        ontology ids.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(positives, negatives)``, each holding the selected rows of
        ``dataset`` in their original order with the same columns.
    """
    positive_mask = []
    negative_mask = []

    for raw_labels in dataset['positive_labels']:
        # The label field may still carry its CSV quote characters and
        # padding (the loader splits on a regex separator, which does not
        # honour quoting); strip them so that the first and last label of a
        # row can match the ontology ids.
        labels = [label.strip().strip('"') for label in raw_labels.split(',')]

        is_positive = isPositiveSample(labels)
        positive_mask.append(is_positive)
        # A row can never be in both groups: positives take precedence.
        negative_mask.append(not is_positive and isNegativeSample(labels))

    # Select with boolean masks in one go instead of growing two frames row
    # by row, which is quadratic and collapses rows sharing an index label.
    positives = dataset.loc[np.array(positive_mask, dtype=bool)].copy()
    negatives = dataset.loc[np.array(negative_mask, dtype=bool)].copy()

    return positives, negatives

In [None]:
# Split every segment table into its positive and negative samples.
(train_unbalanced_positives,
 train_unbalanced_negatives) = getBinaryDatasetSamples(train_unbalanced)

(train_balanced_positives,
 train_balanced_negatives) = getBinaryDatasetSamples(train_balanced)

(eval_balanced_positives,
 eval_balanced_negatives) = getBinaryDatasetSamples(eval_balanced)

In [None]:
# Row counts per split: negatives first, then positives, with a blank line
# between the splits (unbalanced train, balanced train, eval).
print("\n\n".join(
    str(len(negatives)) + "\n" + str(len(positives))
    for negatives, positives in (
        (train_unbalanced_negatives, train_unbalanced_positives),
        (train_balanced_negatives, train_balanced_positives),
        (eval_balanced_negatives, eval_balanced_positives),
    )
))

In [None]:
# There are too many unbalanced positives to download them all (time and disk
# space), so keep a random subset of at most 8000 rows. The fixed random_state
# makes the selection reproducible. Capping the size at the number of
# available rows avoids the ValueError that `sample` raises when asked for
# more rows than exist.
train_unbalanced_positives = train_unbalanced_positives.sample(
    min(8000, len(train_unbalanced_positives)), random_state=42
)

In [None]:
# Row counts per split after restricting the unbalanced positives: negatives
# first, then positives, with a blank line between the splits.
print("\n\n".join(
    str(len(negatives)) + "\n" + str(len(positives))
    for negatives, positives in (
        (train_unbalanced_negatives, train_unbalanced_positives),
        (train_balanced_negatives, train_balanced_positives),
        (eval_balanced_negatives, eval_balanced_positives),
    )
))

# Downloading

In [None]:
# Options handed to youtube_dl.YoutubeDL for every download.
# The on/off switches are real booleans rather than the string 'True'
# (which only worked because any non-empty string is truthy).
ydl_opts = {
    'quiet': True,
    'format': 'bestaudio/best',
    'ignoreerrors': True,
    'no_warnings': True,
    # Convert the downloaded audio to wav.
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
        'preferredquality': '192',
    }],
    # downloadDataset looks for the result at ./tmp/<id>.wav, so the output
    # template has to stay in sync with that location.
    'outtmpl': 'tmp/%(id)s.%(ext)s',
    }

In [None]:
def downloadDataset(dataset, save_path):
    """Download the audio of every row in ``dataset`` and cut out its segment.

    For each row the YouTube video named by the row's index label is fetched
    with the global ``ydl_opts``. If the expected file ``./tmp/<id>.wav``
    exists afterwards, the span between the row's first and second column is
    extracted to ``save_path + '<id>.wav'`` and the temporary file is removed.
    Rows whose download produced no file are skipped.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table indexed by YouTube id; the first two columns hold the start
        and end of the segment to keep.
    save_path : str
        Prefix (normally a directory ending in a path separator) that the
        clip file name is appended to.
    """
    for index, row in dataset.iterrows():
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download(['http://www.youtube.com/watch?v=' + index])

        name = index + '.wav'
        path_downloaded = './tmp/' + name
        path_final = save_path + name
        # Explicit positional access: plain integer keys on a label-indexed
        # Series are deprecated in recent pandas versions.
        start = row.iloc[0]
        end = row.iloc[1]

        if not os.path.exists(path_downloaded):
            continue

        # Make sure the target directory exists before the clip is written.
        target_dir = os.path.dirname(path_final)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        try:
            ffmpeg_extract_subclip(path_downloaded, start, end, targetname=path_final)
        finally:
            # Never leave the full-length download behind, even when the
            # extraction fails.
            os.remove(path_downloaded)

In [None]:
def downloadDatasetParallelized(dataset, save_path):
    """Download ``dataset`` with one worker process per CPU, then save it.

    The rows are divided into ``multiprocessing.cpu_count()`` chunks, each
    non-empty chunk is handed to ``downloadDataset`` in its own process, and
    after all workers have finished ``dataset`` is written to
    ``save_path + "meta.csv"``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table of segments to download (see ``downloadDataset``).
    save_path : str
        Prefix (normally a directory ending in a path separator) used for
        the clips and for ``meta.csv``.
    """
    num_cpus = multiprocessing.cpu_count()
    # Split by row position rather than by index label, so that repeated
    # index labels cannot select the same rows more than once.
    row_positions = np.array_split(np.arange(len(dataset)), num_cpus)
    listOfDfs = [dataset.iloc[positions] for positions in row_positions]
    print("Mean chunk size: " + str(np.mean([len(x) for x in listOfDfs])))

    processes = []
    for chunk in listOfDfs:
        # With fewer rows than CPUs some chunks are empty; no worker needed.
        if len(chunk) == 0:
            continue
        p = Process(target=downloadDataset, args=(chunk, save_path))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()

    # The directory may not exist yet if no clip was written.
    meta_path = save_path + "meta.csv"
    meta_dir = os.path.dirname(meta_path)
    if meta_dir:
        os.makedirs(meta_dir, exist_ok=True)
    dataset.to_csv(meta_path)

In [None]:
# Common root of all download targets; each split gets a negative/ and a
# positive/ sub-directory below it.
_download_root = "/nfs/students/summer-term-2020/project-4/data/dataset2/download/"

# Unbalanced training split.
path_train_unbalanced_negatives = _download_root + "training_unbalanced/negative/"
path_train_unbalanced_positives = _download_root + "training_unbalanced/positive/"

# Balanced training split.
path_train_balanced_negatives = _download_root + "training_balanced/negative/"
path_train_balanced_positives = _download_root + "training_balanced/positive/"

# Evaluation split.
path_eval_balanced_negatives = _download_root + "eval_balanced/negative/"
path_eval_balanced_positives = _download_root + "eval_balanced/positive/"

In [None]:
# Progress marker: the results file is (re)created with "START" before the
# downloads begin and gets "DONE" appended once all of them have returned.
with open("/nfs/homedirs/scholten/results.txt", "w") as f:
    f.write("START\n")

# Download every split, negatives before positives, one after the other.
for _samples, _target in (
    (train_unbalanced_negatives, path_train_unbalanced_negatives),
    (train_unbalanced_positives, path_train_unbalanced_positives),
    (train_balanced_negatives, path_train_balanced_negatives),
    (train_balanced_positives, path_train_balanced_positives),
    (eval_balanced_negatives, path_eval_balanced_negatives),
    (eval_balanced_positives, path_eval_balanced_positives),
):
    downloadDatasetParallelized(_samples, _target)

with open("/nfs/homedirs/scholten/results.txt", "a") as f:
    f.write("DONE\n")