In [1]:
import concurrent.futures
import json
import pathlib
import subprocess

import pandas as pd
import requests

# Requires the ffmpeg and youtube-dl command-line tools to be installed and on PATH.

from common import *

In [2]:
# Create the data directory; parents=True also creates missing parent
# directories, and exist_ok=True makes the cell safe to re-run.
DATA_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# Create the directory the audio clips are written to; parents=True also
# creates missing parent directories, and exist_ok=True makes the cell safe to re-run.
CHAINSAW_AUDIO_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
# download mid to name mapping (skipped when the file is already on disk)
if not MID_TO_NAME_PATH.exists():
    response = requests.get(
        "http://storage.googleapis.com/us_audioset/youtube_corpus/strong/mid_to_display_name.tsv",
        timeout=60,  # do not hang the notebook forever on a stalled connection
    )
    # Fail loudly on an HTTP error: otherwise the error page would be saved as
    # the mapping and, because of the exists() guard, never downloaded again.
    response.raise_for_status()
    # Store the raw bytes so no decode/re-encode round trip can alter the file.
    MID_TO_NAME_PATH.write_bytes(response.content)

In [5]:
# download dataset from https://research.google.com/audioset/download_strong.html
# (files already on disk are not downloaded again)
for dataset in GOOGLE_AUDIO_DATASETS:
    if not dataset.exists():
        response = requests.get(
            f"http://storage.googleapis.com/us_audioset/youtube_corpus/strong/{dataset.name}",
            timeout=60,  # do not hang the notebook forever on a stalled connection
        )
        # Fail loudly on an HTTP error: otherwise the error page would be saved
        # as the dataset and, because of the exists() guard, never re-downloaded.
        response.raise_for_status()
        # Store the raw bytes so no decode/re-encode round trip can alter the file.
        dataset.write_bytes(response.content)

In [6]:
# load every tab-separated dataset file and stack them into a single frame
# (each file's own row index is kept, so index labels may repeat across files)
df = pd.concat([pd.read_csv(tsv_path, sep="\t") for tsv_path in GOOGLE_AUDIO_DATASETS])
df

Unnamed: 0,segment_id,start_time_seconds,end_time_seconds,label
0,s9d-2nhuJCQ_30000,0.000,10.000,/m/04rlf
1,s9d-2nhuJCQ_30000,2.627,7.237,/m/053hz1
2,s9d-2nhuJCQ_30000,2.627,9.239,/m/03qtwd
3,s9d-2nhuJCQ_30000,5.634,6.649,/m/01w250
4,s9d-2nhuJCQ_30000,7.201,8.560,/m/0l15bq
...,...,...,...,...
934816,cq-vfngNXMc_70000,7.836,8.015,/m/07qjznt
934817,cq-vfngNXMc_70000,8.226,8.511,/t/dd00099
934818,cq-vfngNXMc_70000,8.503,8.868,/m/05zppz
934819,cq-vfngNXMc_70000,9.217,9.624,/t/dd00099


In [7]:
# segment_id looks like "<ytid>_<number>"; split on the LAST underscore only,
# because a ytid can itself contain underscores
segment_parts = df["segment_id"].str.rsplit("_", n=1, expand=True)
df["ytid"] = segment_parts[0]
# the numeric suffix divided by 1000 gives the clip start in seconds
df["start_seconds"] = segment_parts[1].astype(int) / 1000
df["duration"] = 10.0   # all clips of same length
df

Unnamed: 0,segment_id,start_time_seconds,end_time_seconds,label,ytid,start_seconds,duration
0,s9d-2nhuJCQ_30000,0.000,10.000,/m/04rlf,s9d-2nhuJCQ,30.0,10.0
1,s9d-2nhuJCQ_30000,2.627,7.237,/m/053hz1,s9d-2nhuJCQ,30.0,10.0
2,s9d-2nhuJCQ_30000,2.627,9.239,/m/03qtwd,s9d-2nhuJCQ,30.0,10.0
3,s9d-2nhuJCQ_30000,5.634,6.649,/m/01w250,s9d-2nhuJCQ,30.0,10.0
4,s9d-2nhuJCQ_30000,7.201,8.560,/m/0l15bq,s9d-2nhuJCQ,30.0,10.0
...,...,...,...,...,...,...,...
934816,cq-vfngNXMc_70000,7.836,8.015,/m/07qjznt,cq-vfngNXMc,70.0,10.0
934817,cq-vfngNXMc_70000,8.226,8.511,/t/dd00099,cq-vfngNXMc,70.0,10.0
934818,cq-vfngNXMc_70000,8.503,8.868,/m/05zppz,cq-vfngNXMc,70.0,10.0
934819,cq-vfngNXMc_70000,9.217,9.624,/t/dd00099,cq-vfngNXMc,70.0,10.0


In [8]:
# look up the mid whose display name is exactly "Chainsaw"
# (.iloc[0] raises IndexError if the mapping has no such row)
chainsaw_id = (
    pd.read_csv(MID_TO_NAME_PATH, sep="\t", names=["mid", "name"])
    .loc[lambda mapping: mapping["name"] == "Chainsaw", "mid"]
    .iloc[0]
)
chainsaw_id

'/m/01j4z9'

In [9]:
# keep rows labelled with the chainsaw mid, drop the per-event timing columns,
# then collapse the now-identical rows so each clip appears once
chainsaw_videos = (
    df.loc[df["label"].eq(chainsaw_id)]
    .drop(columns=["start_time_seconds", "end_time_seconds"])
    .drop_duplicates()
)
chainsaw_videos

Unnamed: 0,segment_id,label,ytid,start_seconds,duration
4096,BBukw6JpCeg_70000,/m/01j4z9,BBukw6JpCeg,70.0,10.0
8904,KwwcpaxWT5E_30000,/m/01j4z9,KwwcpaxWT5E,30.0,10.0
9128,W1VYWwYdJRQ_370000,/m/01j4z9,W1VYWwYdJRQ,370.0,10.0
10865,CZQ1bedI5Wo_80000,/m/01j4z9,CZQ1bedI5Wo,80.0,10.0
12825,Vsz6dpRCwSo_380000,/m/01j4z9,Vsz6dpRCwSo,380.0,10.0
...,...,...,...,...,...
906217,nWbsJOxO-34_80000,/m/01j4z9,nWbsJOxO-34,80.0,10.0
921729,9qmCbAoykkQ_40000,/m/01j4z9,9qmCbAoykkQ,40.0,10.0
926011,93C94T6TfhA_0,/m/01j4z9,93C94T6TfhA,0.0,10.0
928667,VbpHhTTiEfk_30000,/m/01j4z9,VbpHhTTiEfk,30.0,10.0


In [10]:
def download_yt_strip(d):
    """
    download youtube video's audio as mp3 from start_seconds for given duration

    d is a record (dict) with the keys "segment_id", "ytid", "start_seconds"
    and "duration". The clip is written to CHAINSAW_AUDIO_DIR/<segment_id>.mp3;
    a clip that is already on disk is skipped.

    Failures of youtube-dl or ffmpeg are reported with print and are not
    raised. Always returns None.
    """
    name, ytid, start_seconds, duration = d["segment_id"], d["ytid"], d["start_seconds"], d["duration"]
    outfile = CHAINSAW_AUDIO_DIR / f"{name}.mp3"
    # Every message carries its own newline and is printed with end="" so that
    # it is emitted in a single write; otherwise lines from concurrent worker
    # threads run into each other.
    if outfile.exists():
        print(f"skipping {ytid}\n", end="")
        return
    # Ask youtube-dl only for the direct media url; ffmpeg does the download.
    res = subprocess.run(["youtube-dl", "--get-url", "--extract-audio", f"http://youtu.be/{ytid}"],
                         capture_output=True,
                         text=True)
    if res.returncode != 0:
        print(f"error with ytid {ytid}: {res.stderr}\n", end="")
        return
    url = res.stdout.strip()
    if not url:
        print(f"error with ytid {ytid}: youtube-dl returned no url\n", end="")
        return
    print(f"grabbing {ytid}\n", end="")
    # Encode into a temporary file and rename it only on success, so a failed
    # or interrupted run never leaves a partial mp3 that later runs would skip.
    # "-f mp3" is needed because the temporary name does not end in ".mp3";
    # "-y" overwrites a stale temporary file; "-nostdin" keeps ffmpeg from
    # waiting on standard input.
    partfile = outfile.with_name(outfile.name + ".part")
    enc = subprocess.run(["ffmpeg", "-nostdin", "-y", "-loglevel", "error",
                          "-ss", f"{start_seconds}", "-i", url, "-t", f"{duration}",
                          "-c", "mp3", "-f", "mp3", str(partfile)],
                         capture_output=True,
                         text=True)
    if enc.returncode != 0:
        if partfile.exists():
            partfile.unlink()
        print(f"error with ytid {ytid}: {enc.stderr}\n", end="")
        return
    partfile.replace(outfile)

In [12]:
# Download all clips concurrently, one record (dict) per chainsaw clip.
with concurrent.futures.ThreadPoolExecutor() as executor:
    # executor.map only re-raises a worker's exception when its result is
    # retrieved, so consume the results; otherwise unexpected failures (e.g.
    # youtube-dl not installed) would be silently discarded. Leaving the
    # with-block still waits for the tasks that were already submitted.
    list(executor.map(download_yt_strip, chainsaw_videos.to_dict(orient="records")))

skipping BBukw6JpCeg
skipping KwwcpaxWT5E
skipping CZQ1bedI5Wo
skipping W1VYWwYdJRQ
skipping KwLtyPOBSq0
skipping 9HcahqYUVoc
skipping rTMBmCR57l8
skipping Vsz6dpRCwSo
skipping CKkbZCb9Y18
skipping AnGW1f1p_4wskipping E32Is8KRTnk

skipping _dvVfWzJlDMskipping 3IwOig7sw6c

skipping whTmAkw51GU
skipping sCXkuo5_s4Mskipping tPJ9t3jIBjA

skipping e4qVs0kpC_wskipping i3HsITnarf4
skipping ereEhTuBlwA
skipping ZoameGbMVt8
skipping lXUcalwkWCo

skipping AhcY8QVSLtM
skipping j7Qw7cyp2z8
skipping EcywputDZmEskipping AahgXCrU1ZQ
skipping y35fVu--IFI
skipping rc39k_b8uhMskipping kHNWRR0hJ08
skipping TIL8jEw8pSc
skipping j5UuU306R4kskipping Do5DfkHBhtg

skipping v_qX6HAhxnE
skipping 6iDiC8UJeCI
skipping C0j69NCIKfw
skipping WLPj0LIokkM
skipping 1MgcrYdYas0
skipping jUjEZr4O6fU

skipping 2Mb6w4q_Gb0
skipping GX-QhoihLeI
skipping 1fDGejVDAHU

skipping nbZVFniKeOUskipping rfYZ1V8oa7kskipping OWuMapsvJ3w

skipping Src7bBpWhuw
skipping 0DzsPL-xElE
skipping an2mZpaPTpA
skipping Lx4poQw1mZo
skipping KrYQ4