In [2]:
import concurrent.futures
import json
import pathlib
import subprocess

import pandas as pd
import requests

# requires the ffmpeg and youtube-dl command-line tools to be installed and on PATH (both are invoked by name via subprocess)

In [3]:
# Root folder (relative to the working directory) for every file this notebook
# downloads or produces; created here if it is not there yet.
DATA_DIR = pathlib.Path("data")
DATA_DIR.mkdir(parents=False, exist_ok=True)

In [4]:
# Local cache locations: the ontology JSON and the three segment CSV files.
ONTOLOGY_PATH = DATA_DIR.joinpath("ontology.json")
GOOGLE_AUDIO_DATASETS = [
    DATA_DIR / f"{split}_segments.csv"
    for split in ("eval", "balanced_train", "unbalanced_train")
]

In [5]:
# Folder that receives the extracted chainsaw audio clips; created if missing.
CHAINSAW_AUDIO_DIR = DATA_DIR.joinpath("chainsaw")
CHAINSAW_AUDIO_DIR.mkdir(parents=False, exist_ok=True)

In [6]:
# download ontology (only when there is no cached copy on disk)
if not ONTOLOGY_PATH.exists():
    response = requests.get(
        "https://github.com/audioset/ontology/raw/master/ontology.json",
        timeout=60,  # do not hang the notebook forever on a stalled connection
    )
    # Fail loudly on 4xx/5xx: otherwise the error body would be cached as
    # ontology.json and the exists() guard above would never fetch it again.
    response.raise_for_status()
    ONTOLOGY_PATH.write_text(response.text)

In [7]:
# Parse the cached ontology file into Python objects.
with ONTOLOGY_PATH.open() as ontology_file:
    ontology = json.load(ontology_file)

In [8]:
# download dataset from https://research.google.com/audioset/download.html
# (each csv is fetched only when there is no cached copy on disk)
for dataset in GOOGLE_AUDIO_DATASETS:
    if not dataset.exists():
        response = requests.get(
            f"http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/{dataset.name}",
            timeout=60,  # do not hang the notebook forever on a stalled connection
        )
        # Fail loudly on 4xx/5xx: otherwise the error body would be cached under
        # the csv name and silently parsed as data by the following cell.
        response.raise_for_status()
        dataset.write_text(response.text)

In [9]:
# read in all the data
# header=2 takes the column names from the third line of each file; that header
# line yields a "# YTID" column, which is renamed to "ytid".
df = pd.concat(
    pd.read_csv(csv_path, skipinitialspace=True, header=2)
    for csv_path in GOOGLE_AUDIO_DATASETS
).rename({"# YTID": "ytid"}, axis="columns")
df

Unnamed: 0,ytid,start_seconds,end_seconds,positive_labels
0,--4gqARaEJE,0.0,10.0,"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"
1,--BfvyPmVMo,20.0,30.0,/m/03l9g
2,--U7joUcTCo,0.0,10.0,/m/01b_21
3,--i-y1v8Hy8,0.0,9.0,"/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005"
4,-0BIyqJj9ZU,30.0,40.0,"/m/07rgt08,/m/07sq110,/t/dd00001"
...,...,...,...,...
2041784,zzyyleHsxfk,20.0,30.0,"/m/05tny_,/m/068hy,/m/0bt9lr,/m/0jbk"
2041785,zzz-JsGPtxQ,120.0,130.0,"/m/015lz1,/m/0l14jd"
2041786,zzz3PZXRQ_8,30.0,40.0,"/m/030rvx,/m/09x0r"
2041787,zzznDcamMpw,0.0,10.0,"/m/09ddx,/m/09x0r"


In [10]:
# split and explode the labels: one output row per (segment, label) pair
# .str.split is the vectorised form of the per-row split and leaves missing
# values as NaN instead of raising; assign() builds a new frame rather than
# mutating the one displayed by the previous cell.
df = (
    df.assign(positive_labels=df["positive_labels"].str.split(","))
    .explode("positive_labels", ignore_index=True)
)
df

Unnamed: 0,ytid,start_seconds,end_seconds,positive_labels
0,--4gqARaEJE,0.0,10.0,/m/068hy
1,--4gqARaEJE,0.0,10.0,/m/07q6cd_
2,--4gqARaEJE,0.0,10.0,/m/0bt9lr
3,--4gqARaEJE,0.0,10.0,/m/0jbk
4,--BfvyPmVMo,20.0,30.0,/m/03l9g
...,...,...,...,...
4124893,zzznDcamMpw,0.0,10.0,/m/09ddx
4124894,zzznDcamMpw,0.0,10.0,/m/09x0r
4124895,zzztDPePp4g,410.0,420.0,/m/04rlf
4124896,zzztDPePp4g,410.0,420.0,/m/09x0r


In [11]:
# Length of each labelled segment: end_seconds minus start_seconds.
df = df.assign(duration=lambda frame: frame["end_seconds"] - frame["start_seconds"])
df

Unnamed: 0,ytid,start_seconds,end_seconds,positive_labels,duration
0,--4gqARaEJE,0.0,10.0,/m/068hy,10.0
1,--4gqARaEJE,0.0,10.0,/m/07q6cd_,10.0
2,--4gqARaEJE,0.0,10.0,/m/0bt9lr,10.0
3,--4gqARaEJE,0.0,10.0,/m/0jbk,10.0
4,--BfvyPmVMo,20.0,30.0,/m/03l9g,10.0
...,...,...,...,...,...
4124893,zzznDcamMpw,0.0,10.0,/m/09ddx,10.0
4124894,zzznDcamMpw,0.0,10.0,/m/09x0r,10.0
4124895,zzztDPePp4g,410.0,420.0,/m/04rlf,10.0
4124896,zzztDPePp4g,410.0,420.0,/m/09x0r,10.0


In [12]:
# Id of the first ontology entry whose name is exactly "Chainsaw".
chainsaw_id = next(entry["id"] for entry in ontology if entry["name"] == "Chainsaw")
chainsaw_id

'/m/01j4z9'

In [28]:
# Keep only the rows labelled with the chainsaw id.
# (the variable name keeps its original spelling because the download cell refers to it)
is_chainsaw = df["positive_labels"].eq(chainsaw_id)
chainsaw_vidoes = df.loc[is_chainsaw]
chainsaw_vidoes

Unnamed: 0,ytid,start_seconds,end_seconds,positive_labels,duration
75,-23CeprtibU,30.0,40.0,/m/01j4z9,10.0
1207,0DzsPL-xElE,20.0,30.0,/m/01j4z9,10.0
2246,1MgcrYdYas0,30.0,40.0,/m/01j4z9,10.0
2592,1fDGejVDAHU,0.0,10.0,/m/01j4z9,10.0
4033,3IwOig7sw6c,30.0,40.0,/m/01j4z9,10.0
...,...,...,...,...,...
4111176,zhFcXNK076k,0.0,10.0,/m/01j4z9,10.0
4113417,zkH4h1WjD6Q,130.0,140.0,/m/01j4z9,10.0
4122108,zwIArHIZbxU,30.0,40.0,/m/01j4z9,10.0
4124264,zzApe4GLGfY,10.0,20.0,/m/01j4z9,10.0


In [13]:
def download_yt_strip(d):
    """
    download youtube video's audio as mp3 from start_seconds for given duration

    d is a mapping with the keys "ytid", "start_seconds" and "duration".
    The clip is written to CHAINSAW_AUDIO_DIR / "<ytid>.mp3"; when that file
    already exists nothing is downloaded. Failures of youtube-dl or ffmpeg are
    printed instead of raised, and the function always returns None.
    """
    ytid, start_seconds, duration = d["ytid"], d["start_seconds"], d["duration"]
    outfile = CHAINSAW_AUDIO_DIR / f"{ytid}.mp3"
    if outfile.exists():
        print(f"skipping {ytid}")
        return
    # youtube-dl only resolves the direct media url here; ffmpeg fetches the audio
    res = subprocess.run(["youtube-dl", "--get-url", "--extract-audio", f"http://youtu.be/{ytid}"],
                         capture_output=True,
                         text=True)
    if res.returncode != 0:
        print(f"error with ytid {ytid}: {res.stderr}")
        return
    url = res.stdout.strip()
    print(f"grabbing {ytid}")
    ffmpeg_res = subprocess.run(["ffmpeg", "-ss", f"{start_seconds}", "-i", url, "-t", f"{duration}", "-c", "mp3", str(outfile)])
    if ffmpeg_res.returncode != 0:
        print(f"error with ytid {ytid}: ffmpeg exited with code {ffmpeg_res.returncode}")
        # drop any partial output so that the exists() check above does not
        # treat a truncated file as an already finished download on the next run
        if outfile.exists():
            outfile.unlink()

In [15]:
# Download all chainsaw clips in parallel, one task per row.
# executor.map() only re-raises a worker's exception when its result iterator is
# consumed, so unexpected errors used to vanish silently; submitting the tasks
# and inspecting every future reports them without stopping the other downloads.
with concurrent.futures.ThreadPoolExecutor() as executor:
    pending = {
        executor.submit(download_yt_strip, record): record["ytid"]
        for record in chainsaw_vidoes.to_dict(orient="records")
    }
    for finished in concurrent.futures.as_completed(pending):
        failure = finished.exception()
        if failure is not None:
            print(f"error with ytid {pending[finished]}: {failure!r}")

skipping 0DzsPL-xElE
skipping 1MgcrYdYas0
skipping 1fDGejVDAHU
skipping 3IwOig7sw6c
skipping 4egukLBpAcQskipping 7TmKzUgWiRUskipping 88N23-hjddY

skipping AhcY8QVSLtM
skipping BBukw6JpCeg
skipping 9HcahqYUVoc
skipping BsYFCAuPjwE
skipping C46X66FU_Dw
skipping CKkbZCb9Y18

skipping AahgXCrU1ZQ
skipping EcywputDZmE
skipping F8OkghIn9L8
skipping GFkB9Par4M8
skipping LH08k5Kf4AI
skipping CZQ1bedI5Wo
skipping Lx4poQw1mZo
skipping MnH4tzgKXVc
skipping NJUl3gPX07o
skipping OWuMapsvJ3w
skipping TIL8jEw8pSc
skipping C0j69NCIKfw
skipping UqyvFyQthHo
skipping Vsz6dpRCwSo
skipping W1VYWwYdJRQ
skipping WLPj0LIokkM
skipping UF0HDsK0fwo
skipping ZoameGbMVt8
skipping _6uZ1HyHSQY
skipping _dvVfWzJlDM
skipping an2mZpaPTpA
skipping brfKKehFtmw
skipping cReLETAjLHI
skipping ckzLHmauwI8
skipping ereEhTuBlwA
error with ytid 3lkI_scF2eU: ERROR: Video unavailable

error with ytid ApNcj8ijJlo: ERROR: Private video
Sign in if you've been granted access to this video

skipping i3HsITnarf4
skipping jUjEZr4O6fU
sk