
Luigi cleanup #8

Merged · 31 commits · May 28, 2021

Commits:
- `373c730` Added coughvid.py pipeline (turian, May 24, 2021)
- `f2c19a3` Black (turian, May 24, 2021)
- `3f2be27` Convert to mono wav corpus (turian, May 24, 2021)
- `1223598` Added precommit info (turian, May 24, 2021)
- `f8b7f1d` Small bugfix (turian, May 25, 2021)
- `109789d` Another comment (turian, May 25, 2021)
- `8bb8886` Updated TODO (turian, May 25, 2021)
- `f579231` Fix resampling (turian, May 25, 2021)
- `aa2ef62` Starting to refactor (turian, May 25, 2021)
- `1aee316` Refactoring (turian, May 25, 2021)
- `b149d74` Refactor works (turian, May 25, 2021)
- `f0f915f` Tar file (turian, May 25, 2021)
- `a455bf3` S3 code works (turian, May 25, 2021)
- `594bc5f` Luigi pipeline requirements (jorshi, May 25, 2021)
- `3c68890` Starting config file. s3utils (jorshi, May 25, 2021)
- `fc5a24e` Creating subdirectories for config and utils (jorshi, May 25, 2021)
- `14fedbe` Moving all config over to config file (jorshi, May 26, 2021)
- `62dfb78` Move utils over to luigi util (jorshi, May 26, 2021)
- `a575f93` Creating audio utils (jorshi, May 26, 2021)
- `396def3` Moving out s3 code (jorshi, May 26, 2021)
- `2c28afc` Check output on unzip (jorshi, May 26, 2021)
- `b7d0720` Cleaning up extract method (jorshi, May 26, 2021)
- `e9d5472` Progress bar for downloads (jorshi, May 26, 2021)
- `2ffb18a` Wrong slugigy package (jorshi, May 26, 2021)
- `b866faa` gitignore for eval tasks (jorshi, May 26, 2021)
- `92e8a93` S3 caching config (jorshi, May 26, 2021)
- `a871e64` Change S3 config back to defaults (jorshi, May 26, 2021)
- `51625fc` Added a couple todos (jorshi, May 26, 2021)
- `bfb636a` Updating S3 config and pulling out from coughvid (jorshi, May 28, 2021)
- `9055cbc` Removing S3 from coughvid (jorshi, May 28, 2021)
- `f9a6eb1` Moving gitignore and requirements into top-level folder (jorshi, May 28, 2021)
6 changes: 6 additions & 0 deletions .gitignore
@@ -127,3 +127,9 @@ dmypy.json

# Pyre type checker
.pyre/

# Working directory for luigi pipelines
evaluation-tasks/_workdir/

# Completed evaluation tasks
evaluation-tasks/coughvid-*/
17 changes: 17 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,17 @@
repos:
- repo: https://github.com/kynan/nbstripout
rev: 0.3.9
hooks:
- id: nbstripout
- repo: https://github.com/mwouts/jupytext
rev: v1.11.2
hooks:
- id: jupytext
args: [--sync, --pipe, black]
additional_dependencies:
- black==21.5b0 # Matches hook
- repo: https://github.com/psf/black
rev: 21.5b0
hooks:
- id: black
language_version: python3
8 changes: 8 additions & 0 deletions README.md
@@ -1,2 +1,10 @@
# hear2021-eval-kit

Evaluation kit for HEAR 2021 NeurIPS competition


If you are pushing code to this repo, please make sure you have
pre-commit hooks installed:
```
pre-commit install
```
77 changes: 77 additions & 0 deletions evaluation-tasks/README.md
@@ -0,0 +1,77 @@
evaluation-tasks
================

This folder contains Luigi pipelines to download and preprocess
evaluation tasks into a common format. Luigi checkpoints are saved
into the directory .checkpoints so preprocessing can be resumed if
interrupted. After preprocessing, tarred outputs are saved to your
S3 bucket. This avoids hitting dataset providers repeatedly.

For each evaluation task, the directory structure is:
```
taskname/
    task.json
    README
    LICENSE
    train.csv
        [filename],...
    test.csv
        [filename],...
    audio/[sr]/train/[filename]
```

## More details

task.json also specifies the hop_size that we will use for the
evaluation.

If the task involves multiple classes or labels, the maximum number
of classes/labels will be provided. We might have two versions of
the label files: one with string labels and one with labels converted
to ints for convenience.
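Consumers of a preprocessed task will presumably start by reading task.json. A minimal sketch of that step (the `hop_size` field is mentioned above; the helper name and the demo directory are illustrative, not part of the PR):

```python
import json
from pathlib import Path

def load_task_metadata(task_dir):
    """Read a task's metadata (e.g. hop_size) from its task.json."""
    with open(Path(task_dir) / "task.json") as fp:
        return json.load(fp)

# Demo with a hypothetical on-disk task directory:
demo = Path("demo-task")
demo.mkdir(exist_ok=True)
(demo / "task.json").write_text(json.dumps({"hop_size": 4}))
print(load_task_metadata("demo-task")["hop_size"])  # 4
```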

## train.csv and test.csv

For classification/multi-classification of the entire sound:
```
filename, non-negative integer class
```

For tagging (multilabel sound event classification) of the entire sound:
```
filename, list of string labels
```

For frame-based temporal multilabel (e.g. transcription and sound event detection):
```
filename, float timestamp in seconds, list of string labels
```

For ranking tasks:
```
list of filenames in ranked order
```

For JND tasks:
```
filename1, filename2, 0/1 flag indicating whether the two files are perceptually different to human listeners
```

If the dataset provides a validation.csv, that will be included
too. Otherwise, participants may partition train into train/val
however they like.
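As an illustration, a train.csv in the classification format above could be parsed like this (the filenames and labels are made up for the demo):

```python
import csv
import io

# Hypothetical train.csv contents in the
# "filename, non-negative integer class" format described above:
train_csv = "cough_001.wav,0\ncough_002.wav,1\n"

rows = [
    (filename, int(label))
    for filename, label in csv.reader(io.StringIO(train_csv))
]
print(rows)  # [('cough_001.wav', 0), ('cough_002.wav', 1)]
```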

## Caching with S3

1. Download and configure the AWS CLI if you haven't done that already:
* [Installation](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html)
* [Configuration](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html)

2. Update the S3 config file: `config/s3.py`
* `S3_CACHE = True` enables S3 caching. Set this to False if you want to disable
caching for all tasks.
> **Contributor:** What does caching mean here?
>
> **Author:** This basically just turns the S3 caching on and off. We won't need this if we pull the S3 stuff out into a separate script.

* `HANDLE` is a string that is used to create an S3 bucket for all the evaluation
tasks. Every S3 bucket must have a unique name, so you should use this to create
one for yourself. The value of `HANDLE` is appended to `hear2021-`. For example,
if I set `HANDLE=jordie` then all my tasks will be cached in a bucket named
`hear2021-jordie`.
* `S3_REGION_NAME` sets the region for your S3 buckets. You can set this to `None`
to use the default value set during CLI configuration.
Empty file.
33 changes: 33 additions & 0 deletions evaluation-tasks/config/coughvid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""
Configuration for the coughvid task
"""

> **Author:** Some of these could probably be moved to a global / luigi config file (similar to the s3 config stuff that I pulled out).
>
> **Contributor:** #10

# TODO: move some of these to a global config and import that here
# See: https://github.com/neuralaudio/hear2021-eval-kit/issues/10

TASKNAME = "coughvid-v2.0.0"

# Number of CPU workers for Luigi jobs
NUM_WORKERS = 4
# NUM_WORKERS = 1
# If you only use one sample rate, you should have an array with
# one sample rate in it.
# However, if you are evaluating multiple embeddings, you might
# want them all.
SAMPLE_RATES = [48000, 44100, 22050, 16000]
# TODO: Pick the 75th percentile length?
SAMPLE_LENGTH_SECONDS = 8.0
# TODO: Do we want to call this FRAME_RATE or HOP_SIZE
FRAME_RATE = 4
# Set this to None if you want to use ALL the data.
# NOTE: With this cap, expect only 225 test files :\
# NOTE: You can make this smaller during development of this
# preprocessing script, to keep the pipeline fast.
# WARNING: If you change this value, you *must* delete the
# _workdir (working directory).
# Most of the tasks iterate over every audio file present,
# except for the one that downsamples the corpus.
# (This is why we should have one working directory per task)
MAX_FRAMES_PER_CORPUS = 20 * 3600

MAX_FILES_PER_CORPUS = int(MAX_FRAMES_PER_CORPUS / FRAME_RATE / SAMPLE_LENGTH_SECONDS)
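With the default constants above, the file cap works out to 2250; a quick check of the arithmetic:

```python
MAX_FRAMES_PER_CORPUS = 20 * 3600  # 72000 frames
FRAME_RATE = 4                     # frames per second
SAMPLE_LENGTH_SECONDS = 8.0        # seconds per audio file

# 72000 frames / 4 fps = 18000 seconds of audio;
# 18000 s / 8 s per file = 2250 files.
MAX_FILES_PER_CORPUS = int(
    MAX_FRAMES_PER_CORPUS / FRAME_RATE / SAMPLE_LENGTH_SECONDS
)
print(MAX_FILES_PER_CORPUS)  # 2250
```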
12 changes: 12 additions & 0 deletions evaluation-tasks/config/s3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""
Configuration specific to AWS S3
"""

# You should pick a unique handle, since this determines the S3 path
# (which must be globally unique across all S3 users).
HANDLE = "hear"
S3_BUCKET = f"hear2021-{HANDLE}"

# If this is None, boto will use whatever is in your
# ~/.aws/config or AWS_DEFAULT_REGION environment variable
S3_REGION_NAME = "eu-central-1"
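As a sketch of how these settings combine (the `hear2021-{HANDLE}` bucket-name rule comes from the README above; the URL helper is illustrative, not part of the PR):

```python
HANDLE = "hear"
S3_BUCKET = f"hear2021-{HANDLE}"
S3_REGION_NAME = "eu-central-1"

def cached_object_url(key):
    # Virtual-hosted-style S3 URL for a cached tar file.
    return f"https://{S3_BUCKET}.s3.{S3_REGION_NAME}.amazonaws.com/{key}"

print(cached_object_url("coughvid-v2.0.0.tar"))
# https://hear2021-hear.s3.eu-central-1.amazonaws.com/coughvid-v2.0.0.tar
```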