From 9e6ed419b04d3027d6411c4c11d4edfa5a4d9d7a Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Thu, 10 Aug 2023 17:24:55 +0200 Subject: [PATCH 01/52] Add draft for CfS --- CALL_FOR_SUBMISSIONS.md | 178 ++++++++++++++++++++++++++++++++++++++++ README.md | 50 ++++++++--- RULES.md | 78 ++++++------------ getting_started.md | 78 ++++++++++++------ 4 files changed, 293 insertions(+), 91 deletions(-) create mode 100644 CALL_FOR_SUBMISSIONS.md diff --git a/CALL_FOR_SUBMISSIONS.md b/CALL_FOR_SUBMISSIONS.md new file mode 100644 index 000000000..4f93c8996 --- /dev/null +++ b/CALL_FOR_SUBMISSIONS.md @@ -0,0 +1,178 @@ +# MLCommons™ AlgoPerf: Call for Submissions + +**Version:** 0.0.1 *(Last updated 10 August 2023)* + +- [MLCommons™ AlgoPerf: Call for Submissions](#mlcommons-algoperf-call-for-submissions) + - [Basics](#basics) + - [Schedule](#schedule) + - [Dates](#dates) + - [Code freeze](#code-freeze) + - [Submission deadline](#submission-deadline) + - [Submission](#submission) + - [Register a submission](#register-a-submission) + - [How to submit](#how-to-submit) + - [Submission repository](#submission-repository) + - [Licensing](#licensing) + - [Multiple Submission](#multiple-submission) + - [Requesting Additional Baselines](#requesting-additional-baselines) + - [Scoring](#scoring) + - [Self-reporting scores](#self-reporting-scores) + - [Verifying scores](#verifying-scores) + - [Sampling held-out workloads and hyperparameters](#sampling-held-out-workloads-and-hyperparameters) + - [Leaderboard](#leaderboard) + - [Sprit jury \& challenging submissions](#sprit-jury--challenging-submissions) + - [Awards and prize money](#awards-and-prize-money) + - [Awards committee](#awards-committee) + - [Ineligibility and conflict of interest](#ineligibility-and-conflict-of-interest) + +## Basics + +This is the call for submissions for the AlgoPerf: Training Algorithms Benchmark. The call describes the process of submitting a new training algorithm and details how it will be scored. This call applies to both the external tuning ruleset and the self-tuning ruleset although, for all intents and purposes, they are two separate competitions, with separate leaderboards. + +Three additional documents complement this call for submissions: + +- [**Benchmark rules**](RULES.md): While the call for submissions details the *logistical* aspects of submitting to the AlgoPerf: Training Algorithms Benchmark, the [rules document](RULES.md) describes the *scientific* rules of the competition. This includes, for example, how tuning is performed in each ruleset, what types of submissions are allowed, or how the benchmark score is computed. +- [**AlgoPerf paper**](https://arxiv.org/abs/2306.07179): The paper titled ["Benchmarking Neural Network Training Algorithms"](https://arxiv.org/abs/2306.07179) motivates the need for the benchmark, explains the rules, and justifies the specific design choices of the AlgoPerf: Training Algorithms Benchmark. Additionally, it evaluates baseline submissions, constructed using various optimizers like Adam, Shampoo, or SAM, on the benchmark, demonstrating the feasibility but also the difficulty of the benchmark. +- [**Benchmark codebase**](https://github.com/mlcommons/algorithmic-efficiency): The codebase implements the rules, provides exact specifications of the workloads, and it will ultimately be used to score submissions. + +## Schedule + +### Dates + +- **Publication of the call for submission: 01. September 2023 (08:00 AM UTC)** +- Registration deadline for submissions: 01. 
November 2023 (08:00 AM UTC) +- Code freeze for the benchmark codebase: 01. December 2023 (08:00 AM UTC) +- **Submission deadline: 01. January 2024 (08:00 AM UTC)** +- Sampling the held-out workloads and hyperparameters: 02. January 2024 (08:00 AM UTC) +- Deadline for challenging submissions: 01. February 2024 (08:00 AM UTC) +- Deadline for self-reporting results: 01. March 2024 (08:00 AM UTC) +- **Publication of all results: 01. April 2024 (08:00 AM UTC)** + +The presented dates are subject to change and adjustments may be made by the [MLCommmons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/). + +### Code freeze + +The benchmark code base is subject to change after the call for proposals is published. For example, while interacting with the codebase, if submitters encounter bugs or API limitations, they have the option to issue a bug report. This might lead to modifications of the benchmark codebase even after the publication of the call for submissions. + +To ensure that all submitters can develop their submissions based on the exact same code that will be utilized for scoring, we will freeze the benchmark codebase before the submission deadline. By doing so, we level the playing field for everyone involved, ensuring fairness and consistency in the assessment of submissions. The code freeze also involves fixing all package versions of the codebase dependencies, such as JAX, PyTorch, etc. + +### Submission deadline + +With the submission deadline, all submissions need to be available as a *public* repository with the appropriate license (see the [Licensing section](#licensing)). No changes to the submission code are allowed after the submission deadline (with the notable exception of specifying the batch size for the - at that point unknown - held-out workloads). Once the submission deadline has passed, the working group will publish a list of all submitted algorithms, along with their associated repositories. Until the deadline for challenging submissions, anyone has the right to challenge a submission, i.e. request a review by the spirit jury to determine whether a submission violates the rules of the competition, see the [Spirit jury section](#sprit-jury--challenging-submissions). + +Directly after the submission deadline, all randomized aspects of the competition are fixed. This includes sampling the held-out workloads from the set of randomized workloads, as well as, sampling the hyperparameters for each submission in the external tuning ruleset (for more details see the [Sampling held-out workloads and hyperparameters section](#sampling-held-out-workloads-and-hyperparameters)). After that, submitters can now ascertain the appropriate batch size of their submission on each held-out workload and self-report scores on either the qualification set or the full benchmarking set of workloads including both fixed and held-out workloads (see the [Self-reporting scores section](#self-reporting-scores)). + +## Submission + +For a guide on the technical steps and details on how to write a submission, please refer to the [**Getting started document**](GETTING_STARTED.md). Additionally, the folders [/reference_algorithms](/reference_algorithms/) and [/baselines](/baselines/) provide example submissions that can serve as a template for creating new submissions. + +In the following, we describe the logistical steps required to submit a training algorithm to the AlgoPerf: Training Algorithms Benchmark. 
+ +### Register a submission + +All submitters need to register an intent to submit before the submission registration deadline. This registration is mandatory, i.e. required for all submissions, but not binding, i.e. you don't have to submit a registered submission. This registration is necessary, to estimate the number of submissions and provide support for potential submitters. + +To register a submission, please write an email to with the subject "[Registration] *submission_name*" and the following information: + +- Name of the submission (e.g. name of the algorithm, or any other arbitrary identifier). +- Ruleset under which the submission will be scored. +- Name of all submitters associated with this submission. +- Email of all submitters associated with this submission. +- Affiliations of all submitters associated with this submission. + +In return, the submission will be issued a unique **submission ID** that will be used throughout the submission process. + +### How to submit + +Submitters have the flexibility to submit their training algorithm anytime between the registration of the submission and the submission deadline. To submit a submission, please write an email to with the subject "[Submission] *submission_ID*" and the following information: + +- Submission ID. +- URL of the associated *public* GitHub repository. +- If applicable, a list of all changes to the names, emails, or affiliations compared to the registration of the submission. +- A digital version of all relevant licensing documents (see the [Licensing section](#licensing)). + +#### Submission repository + +The *public* GitHub repository needs to be a clone of the frozen `main` branch of the [benchmark codebase](https://github.com/mlcommons/algorithmic-efficiency). All elements of the original codebase, except for the `/submission` directory need to be unaltered from the original benchmark code. In particular, the repository must use the same [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0) as the benchmark codebase. Once the submission deadline has passed, modifications of the submission repository's code are generally prohibited. The sole exception to this rule is the definition of the batch sizes for the held-out workloads. + +Any software dependencies required for the submission need to be defined in a `requirements.txt` file within the `/submission` directory. This file needs to be `pip` readable, i.e. installable via `pip install -r requirements.txt`. In order to comply with the rules, submissions are not allowed to modify the used package version of the software dependencies of the benchmarking codebase, e.g. by using a different version of PyTorch or JAX (see [](RULES.md#disallowed-submissions)). + +#### Licensing + +Submitting to the AlgoPerf: Training Algorithms Benchmark requires the following legal considerations: + +- A signed [Contributor License Agreement (CLA) "Corporate CLA"](https://mlcommons.org/en/policies/) of MLCommons. +- *Either* a membership in MLCommons *or* a signed [non-member test agreement](https://mlcommons.org/en/policies/). +- A signed trademark license agreement, either the member or the non-member version, as appropriate. These license agreements are available upon request to [support@mlcommons.org](mailto:support@mlcommons.org). + +We furthermore require all submissions to be made available open source after the submission deadline under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0). 
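As a concrete illustration of the dependency file described in the [Submission repository](#submission-repository) section above, a minimal `/submission/requirements.txt` might look like the sketch below. The package names and versions are hypothetical placeholders; a submission should only list *additional* dependencies here and must not override the pinned versions of the benchmark codebase's own dependencies such as JAX or PyTorch (see [RULES.md](RULES.md#disallowed-submissions)).

```
# Hypothetical /submission/requirements.txt -- placeholder packages for illustration only.
# List only additional dependencies; do not re-pin JAX, PyTorch, or other benchmark dependencies.
some-optimizer-library==1.2.3
some-logging-utility>=0.4
```

The file must remain installable via `pip install -r requirements.txt`, as required above.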
+ +### Multiple Submission + +Our benchmark allows multiple submissions by the same submitter(s). However, we would like to prevent submitters from circumventing the purpose of the benchmark by, for example, submitting dozens of copies of the same submission with slightly different hyperparameters. Such a bulk submission would result in an unfair advantage on the randomized workloads and is not in the spirit of the benchmark. + +We encourage multiple submissions if they differ substantially. The spirit jury will be responsible for judging whether the submissions are substantially different. This jury will apply stricter scrutiny to submitters with a larger number of submissions. In this context, a submitter refers to an individual (not the general institution or research group they belong to). The total number of submissions by a submitter is the sum of submissions they contributed to. + +### Requesting Additional Baselines + +Submitters can both contribute and request additional baseline algorithms. This includes existing algorithms with different search spaces or learning rate schedules. These baselines will not be eligible for winning the competition or prize money. + +## Scoring + +### Self-reporting scores + +Submitters are expected to self-report scores on the full benchmark set before the deadline for self-reporting results. Reporting the scores involves providing all unmodified logs that the benchmarking codebase automatically generates in a separate `/results` directory within the `/submission` folder. For submissions competing in the external tuning ruleset, this includes all the logs of the tuning trials using the [hyperparameter samples provided by the working group](#sampling-held-out-workloads-and-hyperparameters). Note, that while the tuning runs can be performed on non-competition hardware, they still need to show that the "winning hyperparameter" in each study was selected according to the [tuning rules](/RULES.md#external-tuning-ruleset), i.e. the fastest hyperparameter to reach the validation target. Additionally, the logs of the "winning hyperparameter" (or each trial, in the self-tuning ruleset) in each of the five studies need to be computed on the competition hardware, to allow wall-clock runtime comparisons. + +Submitters unable to self-fund scoring costs can instead self-report only on the [qualification set of workloads](/RULES.md#qualification-set) that excludes some of the most expensive workloads. Based on this performance on the qualification set, the working group will provide compute to evaluate and score the most promising submissions. Additionally, we encourage researchers to reach out to the [working group](mailto:algorithms@mlcommons.org) to find potential collaborators with the resources to run larger, more comprehensive experiments for both developing and scoring submissions. + +#### Verifying scores + +The working group will independently verify the scores of the highest-scoring submissions in each ruleset. Results that have been verified by the working group will be clearly marked on the leaderboard. + +### Sampling held-out workloads and hyperparameters + +After the submission deadline has passed and all submission code is frozen, the working group will sample a specific instance of held-out workloads from the set of randomized workloads. Additionally, every submission in the external tuning ruleset will receive its specific set of 5x20 hyperparameter values grouped by study. 
This set of hyperparameter values is sampled from the search space provided by the submitters. + +The sampling code for the held-out workloads and the hyperparameters is publicly available (**TODO link to both functions!**). Both sampling functions take as input a random seed, which will be provided by a trusted third party after the submission deadline. + +### Leaderboard + +The publication of the results will contain two separate leaderboards, one for the self-tuning and one for the external tuning ruleset. All valid submissions will be ranked by the benchmark score, taking into account all workloads, including the held-out ones. The leaderboard will clearly mark scores that were verified by the working group. + +## Sprit jury & challenging submissions + +The spirit jury will be responsible for deciding whether a submission violates the "spirit of the rules". Until the deadline for challenging submissions, anyone has the right to challenge a submission, i.e. request a review by the spirit jury to determine whether a submission violates the rules of the competition. To challenge a submission, please write an email to with the subject "[Challenge] *submission_name*". The email can be written anonymously but it is required to link to the challenged submission and a detailed description of why the submission should be reviewed needs to be attached. + +The spirit jury may then hear the justifications of the submitters, inspect the code, and also ask the submitters to explain how the submission was produced, for example, by disclosing their intermediate experiments. Example cases that might be reviewed by the spirit jury are cases of multiple similar submissions by the same submitter or extensive workload-specific tuning. + +In the event of a review, the spirit jury will hold a vote, which will be decided by a simple majority. + +**TODO Who is on the Jury?** + +## Awards and prize money + +An awards committee will award a prize for the "*Best Performance*" in each ruleset as well as a "*Jury Award*". The prize for the best-performing submission will take into account the [benchmark score](RULES.md#benchmark-score-using-performance-profiles) on the full benchmark. The "*Jury Award*" will favor more out-of-the-box ideas that show great potential, even though the method may not be of practical value with the current landscape of models, software, etc. + +The prize money for "*Best Performance*" in a ruleset is $20,000 each. The winner of the "*Jury Award*" will be awarded $10,000. We reserve the right to split the prize money and distribute it among multiple submissions. + +### Awards committee + +The awards committee will be responsible for awarding prize money to submissions. Members of the awards committee can suggest submissions to be considered for the awards. The committee will vote on the winning submissions, the submission with the most votes in each respective category wins the awards, and if eligible, the prize money. + +**TODO Who is on the Awards committee?** + +## Ineligibility and conflict of interest + +To ensure a fair process and avoid conflicts of interest, some individuals and institutions are ineligible to win prize money. This includes: + +- The chairs of the MLCommons Algorithms Working Group (presently *George Dahl* and *Frank Schneider*) and their institutions (currently *Google Inc.* and the *University of Tübingen*) +- All individuals serving on the awards committee and their institutions. 
+ +A submission with at least one ineligible submitter may still win an award, but the prize money will then be awarded to the top-ranked submission that is eligible for prize money. + +Additionally, we require members of the spirit jury to abstain from being involved in a review if: + +- They are part of the reviewed submission. +- The reviewed submission contains individuals from their institution. + +The spirit jury can still take a decision if at least one member of the jury is without a conflict of interest. diff --git a/README.md b/README.md index c60efae60..cbd28ed41 100644 --- a/README.md +++ b/README.md @@ -23,17 +23,23 @@ [MLCommons Algorithmic Efficiency](https://mlcommons.org/en/groups/research-algorithms/) is a benchmark and competition measuring neural network training speedups due to algorithmic improvements in both training algorithms and models. This repository holds the [competition rules](RULES.md) and the benchmark code to run it. For a detailed description of the benchmark design, see our [paper](https://arxiv.org/abs/2306.07179). # Table of Contents + +- [MLCommons™ Algorithmic Efficiency](#mlcommons-algorithmic-efficiency) - [Table of Contents](#table-of-contents) -- [AlgoPerf Benchmark Workloads](#algoperf-benchmark-workloads) -- [Installation](#installation) - - [Docker](#docker) + - [Installation](#installation) + - [Virtual environment](#virtual-environment) + - [Docker](#docker) + - [Building Docker Image](#building-docker-image) + - [Running Docker Container (Interactive)](#running-docker-container-interactive) + - [Running Docker Container (End-to-end)](#running-docker-container-end-to-end) - [Getting Started](#getting-started) + - [Running a workload](#running-a-workload) - [Rules](#rules) - [Contributing](#contributing) -- [Citing AlgoPerf Benchmark](#citing-algoperf-benchmark) - +- [Note on shared data pipelines between JAX and PyTorch](#note-on-shared-data-pipelines-between-jax-and-pytorch) ## Installation + You can install this package and dependences in a [python virtual environment](#virtual-environment) or use a [Docker container](#install-in-docker) (recommended). *TL;DR to install the Jax version for GPU run:* @@ -51,10 +57,13 @@ You can install this package and dependences in a [python virtual environment](# pip3 install -e '.[pytorch_gpu]' -f 'https://download.pytorch.org/whl/torch_stable.html' pip3 install -e '.[full]' ``` -## Virtual environment + +## Virtual environment + Note: Python minimum requirement >= 3.8 To set up a virtual enviornment and install this repository + 1. Create new environment, e.g. via `conda` or `virtualenv` ```bash @@ -87,16 +96,18 @@ or all workloads at once via ```bash pip3 install -e '.[full]' ``` + ## Docker -We recommend using a Docker container to ensure a similar environment to our scoring and testing environments. +We recommend using a Docker container to ensure a similar environment to our scoring and testing environments. -**Prerequisites for NVIDIA GPU set up**: You may have to install the NVIDIA Container Toolkit so that the containers can locate the NVIDIA drivers and GPUs. +**Prerequisites for NVIDIA GPU set up**: You may have to install the NVIDIA Container Toolkit so that the containers can locate the NVIDIA drivers and GPUs. See instructions [here](https://github.com/NVIDIA/nvidia-docker). ### Building Docker Image + 1. Clone this repository ```bash @@ -104,16 +115,19 @@ See instructions [here](https://github.com/NVIDIA/nvidia-docker). ``` 2. 
Build Docker Image + ```bash cd `algorithmic-efficiency/docker` docker build -t . --build-args framework= ``` - The `framework` flag can be either `pytorch`, `jax` or `both`. - The `docker_image_name` is arbitrary. + The `framework` flag can be either `pytorch`, `jax` or `both`. + The `docker_image_name` is arbitrary. ### Running Docker Container (Interactive) + 1. Run detached Docker Container + ```bash docker run -t -d \ -v $HOME/data/:/data/ \ @@ -124,18 +138,24 @@ See instructions [here](https://github.com/NVIDIA/nvidia-docker). --ipc=host \ ``` - This will print out a container id. + + This will print out a container id. 2. Open a bash terminal + ```bash docker exec -it /bin/bash ``` ### Running Docker Container (End-to-end) + To run a submission end-to-end in a container see [Getting Started Document](./getting_started.md#run-your-submission-in-a-docker-container). # Getting Started + For instructions on developing and scoring your own algorithm in the benchmark see [Getting Started Document](./getting_started.md). + ## Running a workload + To run a submission directly by running a Docker container, see [Getting Started Document](./getting_started.md#run-your-submission-in-a-docker-container). Alternatively from a your virtual environment or interactively running Docker container `submission_runner.py` run: @@ -163,6 +183,7 @@ python3 submission_runner.py \ --submission_path=reference_algorithms/development_algorithms/mnist/mnist_pytorch/submission.py \ --tuning_search_space=reference_algorithms/development_algorithms/mnist/tuning_search_space.json ``` +
Using Pytorch DDP (Recommended) @@ -176,11 +197,13 @@ torchrun --standalone --nnodes=1 --nproc_per_node=N_GPUS ``` where `N_GPUS` is the number of available GPUs on the node. To only see output from the first process, you can run the following to redirect the output from processes 1-7 to a log file: + ```bash torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 --standalone --nnodes=1 --nproc_per_node=8 ``` So the complete command is for example: + ``` torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 --standalone --nnodes=1 --nproc_per_node=8 \ submission_runner.py \ @@ -191,15 +214,16 @@ submission_runner.py \ --submission_path=reference_algorithms/development_algorithms/mnist/mnist_pytorch/submission.py \ --tuning_search_space=reference_algorithms/development_algorithms/mnist/tuning_search_space.json \ ``` -
+ # Rules + The rules for the MLCommons Algorithmic Efficency benchmark can be found in the seperate [rules document](RULES.md). Suggestions, clarifications and questions can be raised via pull requests. # Contributing -If you are interested in contributing to the work of the working group, feel free to [join the weekly meetings](https://mlcommons.org/en/groups/research-algorithms/), open issues. See our [CONTRIBUTING.md](CONTRIBUTING.md) for MLCommons contributing guidelines and setup and workflow instructions. +If you are interested in contributing to the work of the working group, feel free to [join the weekly meetings](https://mlcommons.org/en/groups/research-algorithms/), open issues. See our [CONTRIBUTING.md](CONTRIBUTING.md) for MLCommons contributing guidelines and setup and workflow instructions. # Note on shared data pipelines between JAX and PyTorch diff --git a/RULES.md b/RULES.md index 873cc1786..7691f4d5c 100644 --- a/RULES.md +++ b/RULES.md @@ -1,32 +1,30 @@ # MLCommons™ AlgoPerf: Benchmark Rules -**Version:** 0.0.16 *(Last updated 28 April 2023)* +**Version:** 0.0.17 *(Last updated 10 August 2023)* > **TL;DR** New training algorithms and models can make neural net training faster. > We need a rigorous training time benchmark that measures time to result given a fixed hardware configuration and stimulates algorithmic progress. We propose a [Training Algorithm Track](#training-algorithm-track) and a [Model Track](#model-track) in order to help disentangle optimizer improvements and model architecture improvements. This two-track structure lets us enforce a requirement that new optimizers work well on multiple models and that new models aren't highly specific to particular training hacks. -- [Introduction](#introduction) -- [Training Algorithm Track](#training-algorithm-track) - - [Submissions](#submissions) - - [Specification](#specification) - - [Evaluation during training](#evaluation-during-training) - - [Valid submissions](#valid-submissions) - - [Tuning](#tuning) - - [External tuning ruleset](#external-tuning-ruleset) - - [Self-tuning ruleset](#self-tuning-ruleset) - - [Workloads](#workloads) - - [Fixed workloads](#fixed-workloads) - - [Randomized workloads](#randomized-workloads) - - [Qualification set](#qualification-set) - - [Scoring](#scoring) - - [Benchmarking hardware](#benchmarking-hardware) - - [Defining target performance](#defining-target-performance) - - [Benchmark score using performance profiles](#benchmark-score-using-performance-profiles) - - [Benchmark Procedure](#benchmark-procedure) - - [Multiple Submission](#multiple-submission) - - [Licensing](#licensing) - - [Awards and prize money](#awards-and-prize-money) -- [Model Track](#model-track) +- [MLCommons™ AlgoPerf: Benchmark Rules](#mlcommons-algoperf-benchmark-rules) + - [Introduction](#introduction) + - [Training Algorithm Track](#training-algorithm-track) + - [Submissions](#submissions) + - [Specification](#specification) + - [Evaluation during training](#evaluation-during-training) + - [Valid submissions](#valid-submissions) + - [Tuning](#tuning) + - [External tuning ruleset](#external-tuning-ruleset) + - [Self-tuning ruleset](#self-tuning-ruleset) + - [Workloads](#workloads) + - [Fixed workloads](#fixed-workloads) + - [Randomized workloads](#randomized-workloads) + - [Qualification set](#qualification-set) + - [Scoring](#scoring) + - [Benchmarking hardware](#benchmarking-hardware) + - [Defining target performance](#defining-target-performance) + - [Benchmark score using performance 
profiles](#benchmark-score-using-performance-profiles) + - [Benchmark Procedure](#benchmark-procedure) + - [Model Track](#model-track) ## Introduction @@ -47,6 +45,8 @@ Submissions to the Training Algorithm Track can be entered under two separate ru The intention is that a training algorithm submission will be broadly applicable and useful without customization to the specific [workload](#workloads) (model, dataset, loss function). We want to discourage detecting the particular workload and doing something highly specific that isn't generally useful. In order to further discourage submissions that overfit to the particular [fixed benchmark workloads](#fixed-workloads), submissions will also be evaluated on [held-out workloads](#randomized-workloads) specified after the submission deadline. +For a description of how to submit a training algorithm to the AlgoPerf: Training Algorithms Benchmark, see the [Call for submissions](CALL_FOR_SUBMISSIONS.md), which details the entire competition process. + ### Submissions A valid submission is a piece of code that defines all of the submission functions and is able to train all benchmark workloads on the [benchmarking hardware](#benchmarking-hardware) (defined in the [Scoring](#scoring) section). Both the validation set and the test set performance will be checked regularly during training (see the [Evaluation during training](#evaluation-during-training) section). Training halts when the workload-specific [target errors](#defining-target-performance) for the validation and test sets have been reached. For each workload, the training time to reach the *test* set target error is used as input to the [scoring process](#scoring) for the submission. Submissions using [external tuning](#external-tuning-ruleset) will be tuned independently for each workload using a single workload-agnostic search space for their specified hyperparameters. The tuning trials are selected based on the time to reach the *validation* target, but only their training times to reach the *test* target will be used for scoring. Submissions under either tuning ruleset may always self-tune while on the clock. @@ -400,7 +400,7 @@ Our scoring procedure uses the held-out workloads only to penalize submissions t #### Qualification set -The qualification set is designed for submitters that may not have the compute resources to self-report on the full set of [fixed](#fixed-workloads) and [held-out workloads](#randomized-workloads). They may instead self-report numbers on this smaller qualification set. The best-performing submissions may then qualify for compute sponsorship offering a free evaluation on the full benchmark set and therefore the possibility to win [awards and prize money](#awards-and-prize-money). +The qualification set is designed for submitters that may not have the compute resources to self-report on the full set of [fixed](#fixed-workloads) and [held-out workloads](#randomized-workloads). They may instead self-report numbers on this smaller qualification set. The best-performing submissions may then qualify for compute sponsorship offering a free evaluation on the full benchmark set and therefore the possibility to win [awards and prize money](/CALL_FOR_SUBMISSIONS.md#awards-and-prize-money). The qualification set consists of the same [fixed workloads](#fixed-workloads) as mentioned above, except for both workloads on *ImageNet*, both workloads on *LibriSpeech*, and the *fastMRI* workload. 
The remaining three workloads (*WMT*, *Criteo 1TB*, and *OGBG*) form the qualification set. There are no [randomized workloads](#randomized-workloads) in the qualification set. The qualification set of workloads aims to have a combined runtime of roughly 24 hours on the [benchmarking hardware](#benchmarking-hardware). @@ -483,35 +483,7 @@ For a given workload $\bar{w}$, we define the "speedup of a submission $\bar{s}$ ### Benchmark Procedure -#### Multiple Submission - -Our benchmark allows multiple submissions by the same submitter. However, we would like to prevent submitters from circumventing the purpose of the benchmark by, for example, submitting dozens of copies of the same submission with slightly different hyperparameters. Such a bulk submission would result in an unfair advantage on the randomized workloads and is not in the spirit of the benchmark. - -We encourage multiple submissions if they differ substantially. A spirit jury will be responsible for judging whether the submissions are substantially different. This jury will apply stricter scrutiny to submitters with a larger number of submissions. In this context, a submitter refers to an individual (not the general institution or research group they belong to). The total number of submissions by a submitter is the sum of submissions they contributed to. - -##### Requesting Additional Baselines - -Submitters can both contribute and request additional baseline algorithms. This includes existing algorithms with different search spaces or learning rate schedules. These baselines will not be eligible for winning the competition or prize money. - -#### Licensing - -Submitting to the benchmark requires the following legal considerations: - -- A signed [Contributor License Agreement (CLA) "Corporate CLA"](https://mlcommons.org/en/policies/) of MLCommons. -- *Either* membership in MLCommons *or* a signed [non-member test agreement](https://mlcommons.org/en/policies/). -- A signed trademark license agreement, either the member or the non-member version, as appropriate). These license agreements are available upon request to [support@mlcommons.org](mailto:support@mlcommons.org). - -We furthermore require all submissions to be made available open source after the submission deadline under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0). - -#### Awards and prize money - -An awards committee will award a prize for the "*Best Performance*" in each ruleset as well as a "*Jury Award*". The prize for the best-performing submission will take into account the [benchmark score](#benchmark-score-using-performance-profiles) on the full benchmark. The "*Jury Award*" will favor more out-of-the-box ideas that show great potential, even though the method may not be of practical value with the current landscape of models, software, etc. - -The prize money for "*Best Performance*" in a ruleset is $20,000 each. The winner of the "*Jury Award*" will be awarded $10,000. We reserve the right to split the prize money and distribute it among multiple submissions. - -The chairs of the MLCommons Algorithms Working Group (presently *George Dahl* and *Frank Schneider*) and their institutions (currently *Google Inc.* and the *University of Tübingen*) are ineligible to receive prize money. In addition, all individuals serving on the awards committee and their institutions are ineligible to win prize money. 
A submission with at least one ineligible submitter may still win an award, but the prize money will then be awarded to the top-ranked submission that is eligible for prize money. - -Submitters may self-report the results of their submissions as long as they follow the benchmark protocol (e.g. use the time to reach the validation target for tuning, use the hyperparameter samples provided by the working group, etc.). The working group will independently verify the self-reported submissions with the highest scores. Only verified results are eligible to win the benchmark and be awarded prize money. +For a description of how to submit a training algorithm to the AlgoPerf: Training Algorithms Benchmark, see the [Call for submissions](CALL_FOR_SUBMISSIONS.md), which details the entire competition process. ## Model Track diff --git a/getting_started.md b/getting_started.md index d6dd7fcd3..2ccfdfbd7 100644 --- a/getting_started.md +++ b/getting_started.md @@ -1,52 +1,68 @@ # Getting Started Table of Contents: -- [Set up and installation](#workspace-set-up-and-installation) -- [Download the data](#download-the-data) -- [Develop your submission](#develop-your-submission) -- [Run your submission](#run-your-submission) - - [Docker](#run-your-submission-in-a-docker-container) -- [Score your submission](#score-your-submission) + +- [Getting Started](#getting-started) + - [Workspace set up and installation](#workspace-set-up-and-installation) + - [Download the data](#download-the-data) + - [Develop your submission](#develop-your-submission) + - [Set up your directory structure (Optional)](#set-up-your-directory-structure-optional) + - [Coding your submission](#coding-your-submission) + - [Run your submission](#run-your-submission) + - [Pytorch DDP](#pytorch-ddp) + - [Run your submission in a Docker container](#run-your-submission-in-a-docker-container) + - [Docker Tips](#docker-tips) + - [Score your submission](#score-your-submission) + - [Good Luck](#good-luck) ## Workspace set up and installation + To get started you will have to make a few decisions and install the repository along with its dependencies. Specifically: + 1. Decide if you would like to develop your submission in either Pytorch or Jax. - 2. Set up your workstation or VM. We recommend to use a setup similar to the [benchmarking hardware](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#benchmarking-hardware). - The specs on the benchmarking machines are: - - 8 V100 GPUs +2. Set up your workstation or VM. We recommend to use a setup similar to the [benchmarking hardware](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#benchmarking-hardware). + The specs on the benchmarking machines are: + - 8 V100 GPUs - 240 GB in RAM - - 2 TB in storage (for datasets). + - 2 TB in storage (for datasets). 3. Install the algorithmic package and dependencies, see [Installation](./README.md#installation). ## Download the data -The workloads in this benchmark use 6 different datasets across 8 workloads. You may choose to download some or all of the datasets as you are developing your submission, but your submission will be scored across all 8 workloads. For instructions on obtaining and setting up the datasets see [datasets/README](https://github.com/mlcommons/algorithmic-efficiency/blob/main/datasets/README.md#dataset-setup). +The workloads in this benchmark use 6 different datasets across 8 workloads. 
You may choose to download some or all of the datasets as you are developing your submission, but your submission will be scored across all 8 workloads. For instructions on obtaining and setting up the datasets see [datasets/README](https://github.com/mlcommons/algorithmic-efficiency/blob/main/datasets/README.md#dataset-setup). ## Develop your submission + To develop a submission you will write a python module containing your optimizer algorithm. Your optimizer must implement a set of predefined API methods for the initialization and update steps. ### Set up your directory structure (Optional) + Make a submissions subdirectory to store your submission modules e.g. `algorithmic-effiency/submissions/my_submissions`. ### Coding your submission + You can find examples of sumbission modules under `algorithmic-efficiency/baselines` and `algorithmic-efficiency/reference_algorithms`. \ A submission for the external ruleset will consist of a submission module and a tuning search space definition. + 1. Copy the template submission module `submissions/template/submission.py` into your submissions directory e.g. in `algorithmic-efficiency/my_submissions`. 2. Implement at least the methods in the template submission module. Feel free to use helper functions and/or modules as you see fit. Make sure you adhere to to the competition rules. Check out the guidelines for [allowed submissions](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#disallowed-submissions), [disallowed submissions](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#disallowed-submissions) and pay special attention to the [software dependencies rule](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#software-dependencies). 3. Add a tuning configuration e.g. `tuning_search_space.json` file to your submission directory. For the tuning search space you can either: 1. Define the set of feasible points by defining a value for "feasible_points" for the hyperparameters: - ``` + + ```JSON { "learning_rate": { "feasible_points": 0.999 }, } ``` + For a complete example see [tuning_search_space.json](https://github.com/mlcommons/algorithmic-efficiency/blob/main/reference_algorithms/target_setting_algorithms/imagenet_resnet/tuning_search_space.json). - 2. Define a range of values for quasirandom sampling by specifing a `min`, `max` and `scaling` + 2. Define a range of values for quasirandom sampling by specifing a `min`, `max` and `scaling` keys for the hyperparameter: - ``` + + ```JSON { "weight_decay": { "min": 5e-3, @@ -55,14 +71,15 @@ A submission for the external ruleset will consist of a submission module and a } } ``` - For a complete example see [tuning_search_space.json](https://github.com/mlcommons/algorithmic-efficiency/blob/main/baselines/nadamw/tuning_search_space.json). + For a complete example see [tuning_search_space.json](https://github.com/mlcommons/algorithmic-efficiency/blob/main/baselines/nadamw/tuning_search_space.json). 
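Putting the two options above together, a complete `tuning_search_space.json` can mix both styles. The sketch below is for illustration only: the hyperparameter names and values are placeholders, and the complete examples linked above remain the authoritative reference for the exact format (for instance, whether `feasible_points` is given as a single value or as a list of candidate values).

```JSON
{
  "learning_rate": {
    "min": 1e-4,
    "max": 1e-2,
    "scaling": "log"
  },
  "weight_decay": {
    "feasible_points": [1e-4]
  }
}
```

Tuning trials for the external ruleset are then sampled from exactly this search space, as described in the [tuning rules](https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md#external-tuning-ruleset).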
## Run your submission From your virtual environment or interactively running Docker container run your submission with `submission_runner.py`: -**JAX**: to score your submission on a workload, from the algorithmic-efficency directory run: +**JAX**: to score your submission on a workload, from the algorithmic-efficency directory run: + ```bash python3 submission_runner.py \ --framework=jax \ @@ -73,7 +90,8 @@ python3 submission_runner.py \ --tuning_search_space= ``` -**Pytorch**: to score your submission on a workload, from the algorithmic-efficency directory run: +**Pytorch**: to score your submission on a workload, from the algorithmic-efficency directory run: + ```bash python3 submission_runner.py \ --framework=pytorch \ @@ -85,13 +103,17 @@ python3 submission_runner.py \ ``` #### Pytorch DDP -We recommend using PyTorch's [Distributed Data Parallel (DDP)](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) -when using multiple GPUs on a single node. You can initialize ddp with torchrun. + +We recommend using PyTorch's [Distributed Data Parallel (DDP)](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) +when using multiple GPUs on a single node. You can initialize ddp with torchrun. For example, on single host with 8 GPUs simply replace `python3` in the above command by: + ```bash torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 --standalone --nnodes=1 --nproc_per_node=N_GPUS ``` + So the complete command is: + ```bash torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 \ --standalone \ @@ -109,17 +131,18 @@ torchrun --redirects 1:0,2:0,3:0,4:0,5:0,6:0,7:0 \ ### Run your submission in a Docker container The container entrypoint script provides the following flags: + - `-d` dataset: can be 'imagenet', 'fastmri', 'librispeech', 'criteo1tb', 'wmt', or 'ogbg'. Setting this flag will download data if `~/data/` does not exist on the host machine. Required for running a submission. - `-f` framework: can be either 'pytorch' or 'jax'. If you just want to download data, this flag is required for `-d imagenet` since we have two versions of data for imagenet. This flag is also required for running a submission. -- `-s` submission_path: path to submission file on container filesystem. If this flag is set, the container will run a submission, so it is required for running a submission. +- `-s` submission_path: path to submission file on container filesystem. If this flag is set, the container will run a submission, so it is required for running a submission. - `-t` tuning_search_space: path to file containing tuning search space on container filesystem. Required for running a submission. - `-e` experiment_name: name of experiment. Required for running a submission. - `-w` workload: can be 'imagenet_resnet', 'imagenet_jax', 'librispeech_deepspeech', 'librispeech_conformer', 'ogbg', 'wmt', 'fastmri' or 'criteo1tb'. Required for running a submission. - `-m` max_steps: maximum number of steps to run the workload for. Optional. -- `-b` debugging_mode: can be true or false. If `-b ` (debugging_mode) is `true` the main process on the container will persist. - +- `-b` debugging_mode: can be true or false. If `-b` (debugging_mode) is `true` the main process on the container will persist. To run the docker container that will run the submission runner run: + ```bash docker run -t -d \ -v $HOME/data/:/data/ \ @@ -136,31 +159,36 @@ docker run -t -d \ -w \ -b ``` + This will print the container ID to the terminal. 
If debugging_mode is `true` the main process on the container will persist after finishing the submission runner. #### Docker Tips #### To find the container IDs of running containers + ``` docker ps ``` To see output of the entrypoint script + ``` docker logs ``` To enter a bash session in the container + ``` docker exec -it /bin/bash ``` -## Score your submission +## Score your submission + To produce performance profile and performance table: + ```bash python3 scoring/score_submission.py --experiment_path= --output_dir= ``` - -## Good Luck! +## Good Luck From d326958d561ebe1ecc0b1e3c327712801ed4a43c Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Fri, 11 Aug 2023 10:00:20 +0200 Subject: [PATCH 02/52] update --- getting_started.md => GETTING_STARTED.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename getting_started.md => GETTING_STARTED.md (100%) diff --git a/getting_started.md b/GETTING_STARTED.md similarity index 100% rename from getting_started.md rename to GETTING_STARTED.md From 705c9a5b3e348a659f7e36e1a0835412ceadf15a Mon Sep 17 00:00:00 2001 From: Frank Date: Thu, 17 Aug 2023 21:28:23 +0200 Subject: [PATCH 03/52] Update and rename CALL_FOR_SUBMISSIONS.md to SUBMISSION_PROCESS_RULES.md --- ...MISSIONS.md => SUBMISSION_PROCESS_RULES.md | 40 +++++++++---------- 1 file changed, 19 insertions(+), 21 deletions(-) rename CALL_FOR_SUBMISSIONS.md => SUBMISSION_PROCESS_RULES.md (81%) diff --git a/CALL_FOR_SUBMISSIONS.md b/SUBMISSION_PROCESS_RULES.md similarity index 81% rename from CALL_FOR_SUBMISSIONS.md rename to SUBMISSION_PROCESS_RULES.md index 4f93c8996..49910e2b1 100644 --- a/CALL_FOR_SUBMISSIONS.md +++ b/SUBMISSION_PROCESS_RULES.md @@ -1,8 +1,8 @@ -# MLCommons™ AlgoPerf: Call for Submissions +# MLCommons™ AlgoPerf: Submission Process Rules -**Version:** 0.0.1 *(Last updated 10 August 2023)* +**Version:** 0.0.1 *(Last updated 17 August 2023)* -- [MLCommons™ AlgoPerf: Call for Submissions](#mlcommons-algoperf-call-for-submissions) +- [MLCommons™ AlgoPerf: Submission Process Rules](#mlcommons-algoperf-submission-process-rules) - [Basics](#basics) - [Schedule](#schedule) - [Dates](#dates) @@ -27,11 +27,11 @@ ## Basics -This is the call for submissions for the AlgoPerf: Training Algorithms Benchmark. The call describes the process of submitting a new training algorithm and details how it will be scored. This call applies to both the external tuning ruleset and the self-tuning ruleset although, for all intents and purposes, they are two separate competitions, with separate leaderboards. +This is the submission process rules for the AlgoPerf: Training Algorithms Benchmark. It describes the process of submitting a new training algorithm and details how it will be scored. This process applies to both the external tuning ruleset and the self-tuning ruleset although, for all intents and purposes, they are two separate competitions, with separate leaderboards. -Three additional documents complement this call for submissions: +Three additional documents complement this document: -- [**Benchmark rules**](RULES.md): While the call for submissions details the *logistical* aspects of submitting to the AlgoPerf: Training Algorithms Benchmark, the [rules document](RULES.md) describes the *scientific* rules of the competition. This includes, for example, how tuning is performed in each ruleset, what types of submissions are allowed, or how the benchmark score is computed. 
+- [**Benchmark rules**](RULES.md): While the submission process rules detail the *logistical* aspects of submitting to the AlgoPerf: Training Algorithms Benchmark, the [rules document](RULES.md) describes the *scientific* rules of the competition. This includes, for example, how tuning is performed in each ruleset, what types of submissions are allowed, or how the benchmark score is computed. - [**AlgoPerf paper**](https://arxiv.org/abs/2306.07179): The paper titled ["Benchmarking Neural Network Training Algorithms"](https://arxiv.org/abs/2306.07179) motivates the need for the benchmark, explains the rules, and justifies the specific design choices of the AlgoPerf: Training Algorithms Benchmark. Additionally, it evaluates baseline submissions, constructed using various optimizers like Adam, Shampoo, or SAM, on the benchmark, demonstrating the feasibility but also the difficulty of the benchmark. - [**Benchmark codebase**](https://github.com/mlcommons/algorithmic-efficiency): The codebase implements the rules, provides exact specifications of the workloads, and it will ultimately be used to score submissions. @@ -39,22 +39,22 @@ Three additional documents complement this call for submissions: ### Dates -- **Publication of the call for submission: 01. September 2023 (08:00 AM UTC)** -- Registration deadline for submissions: 01. November 2023 (08:00 AM UTC) -- Code freeze for the benchmark codebase: 01. December 2023 (08:00 AM UTC) -- **Submission deadline: 01. January 2024 (08:00 AM UTC)** -- Sampling the held-out workloads and hyperparameters: 02. January 2024 (08:00 AM UTC) -- Deadline for challenging submissions: 01. February 2024 (08:00 AM UTC) +- **Publication of the call for submission: 08. September 2023 (08:00 AM UTC)** +- Registration deadline for submissions: 15. November 2023 (08:00 AM UTC) +- Version freeze for the benchmark codebase: 01. December 2023 (08:00 AM UTC) +- **Submission deadline: 15. January 2024 (08:00 AM UTC)** +- Sampling the held-out workloads and hyperparameters: 16. January 2024 (08:00 AM UTC) +- Deadline for specifying the submission batch sizes for held-out workloads: 23. January 2024 (08:00 AM UTC) - Deadline for self-reporting results: 01. March 2024 (08:00 AM UTC) -- **Publication of all results: 01. April 2024 (08:00 AM UTC)** +- **[extra tentative] Publication of all results: 15. April 2024 (08:00 AM UTC)** The presented dates are subject to change and adjustments may be made by the [MLCommmons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/). -### Code freeze +### Version freeze -The benchmark code base is subject to change after the call for proposals is published. For example, while interacting with the codebase, if submitters encounter bugs or API limitations, they have the option to issue a bug report. This might lead to modifications of the benchmark codebase even after the publication of the call for submissions. +The benchmark code base is subject to change after the call for submissions is published. For example, while interacting with the codebase, if submitters encounter bugs or API limitations, they have the option to issue a bug report. This might lead to modifications of the benchmark codebase even after the publication of the call for submissions. -To ensure that all submitters can develop their submissions based on the exact same code that will be utilized for scoring, we will freeze the benchmark codebase before the submission deadline. 
By doing so, we level the playing field for everyone involved, ensuring fairness and consistency in the assessment of submissions. The code freeze also involves fixing all package versions of the codebase dependencies, such as JAX, PyTorch, etc. +To ensure that all submitters can develop their submissions based on the same code that will be utilized for scoring, we will freeze the package versions of the codebase dependencies before the submission deadline. By doing so, we level the playing field for everyone involved, ensuring fairness and consistency in the assessment of submissions. We will also try to minimize changes to the benchmark codebase as best as possible. ### Submission deadline @@ -105,7 +105,7 @@ Submitting to the AlgoPerf: Training Algorithms Benchmark requires the following - *Either* a membership in MLCommons *or* a signed [non-member test agreement](https://mlcommons.org/en/policies/). - A signed trademark license agreement, either the member or the non-member version, as appropriate. These license agreements are available upon request to [support@mlcommons.org](mailto:support@mlcommons.org). -We furthermore require all submissions to be made available open source after the submission deadline under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0). +We furthermore require all submissions to be made available open source on the submission deadline under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0). ### Multiple Submission @@ -123,7 +123,7 @@ Submitters can both contribute and request additional baseline algorithms. This Submitters are expected to self-report scores on the full benchmark set before the deadline for self-reporting results. Reporting the scores involves providing all unmodified logs that the benchmarking codebase automatically generates in a separate `/results` directory within the `/submission` folder. For submissions competing in the external tuning ruleset, this includes all the logs of the tuning trials using the [hyperparameter samples provided by the working group](#sampling-held-out-workloads-and-hyperparameters). Note, that while the tuning runs can be performed on non-competition hardware, they still need to show that the "winning hyperparameter" in each study was selected according to the [tuning rules](/RULES.md#external-tuning-ruleset), i.e. the fastest hyperparameter to reach the validation target. Additionally, the logs of the "winning hyperparameter" (or each trial, in the self-tuning ruleset) in each of the five studies need to be computed on the competition hardware, to allow wall-clock runtime comparisons. -Submitters unable to self-fund scoring costs can instead self-report only on the [qualification set of workloads](/RULES.md#qualification-set) that excludes some of the most expensive workloads. Based on this performance on the qualification set, the working group will provide compute to evaluate and score the most promising submissions. Additionally, we encourage researchers to reach out to the [working group](mailto:algorithms@mlcommons.org) to find potential collaborators with the resources to run larger, more comprehensive experiments for both developing and scoring submissions. +Submitters unable to self-fund scoring costs can instead self-report only on the [qualification set of workloads](/RULES.md#qualification-set) that excludes some of the most expensive workloads. 
Based on this performance on the qualification set, the working group will provide - as funding allows - compute to evaluate and score the most promising submissions. Additionally, we encourage researchers to reach out to the [working group](mailto:algorithms@mlcommons.org) to find potential collaborators with the resources to run larger, more comprehensive experiments for both developing and scoring submissions. #### Verifying scores @@ -141,12 +141,10 @@ The publication of the results will contain two separate leaderboards, one for t ## Sprit jury & challenging submissions -The spirit jury will be responsible for deciding whether a submission violates the "spirit of the rules". Until the deadline for challenging submissions, anyone has the right to challenge a submission, i.e. request a review by the spirit jury to determine whether a submission violates the rules of the competition. To challenge a submission, please write an email to with the subject "[Challenge] *submission_name*". The email can be written anonymously but it is required to link to the challenged submission and a detailed description of why the submission should be reviewed needs to be attached. +The spirit jury will be responsible for deciding whether a submission violates the "spirit of the rules". Submitters may challenge other submissions, i.e. request a review by the spirit jury to determine whether a submission violates the rules of the competition. To challenge a submission, please write an email to with the subject "[Challenge] *submission_name*". The email needs to link to the challenged submission and include a detailed description of why the submission should be reviewed. The spirit jury may then hear the justifications of the submitters, inspect the code, and also ask the submitters to explain how the submission was produced, for example, by disclosing their intermediate experiments. Example cases that might be reviewed by the spirit jury are cases of multiple similar submissions by the same submitter or extensive workload-specific tuning. -In the event of a review, the spirit jury will hold a vote, which will be decided by a simple majority. - **TODO Who is on the Jury?** ## Awards and prize money From 2057116bc38014d961ef8aaa311cf404425ccf14 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 3 Oct 2023 11:24:35 +0200 Subject: [PATCH 04/52] Fix filename --- RULES.md | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/RULES.md b/RULES.md index 7691f4d5c..7225a76b0 100644 --- a/RULES.md +++ b/RULES.md @@ -5,26 +5,25 @@ > **TL;DR** New training algorithms and models can make neural net training faster. > We need a rigorous training time benchmark that measures time to result given a fixed hardware configuration and stimulates algorithmic progress. We propose a [Training Algorithm Track](#training-algorithm-track) and a [Model Track](#model-track) in order to help disentangle optimizer improvements and model architecture improvements. This two-track structure lets us enforce a requirement that new optimizers work well on multiple models and that new models aren't highly specific to particular training hacks. 
-- [MLCommons™ AlgoPerf: Benchmark Rules](#mlcommons-algoperf-benchmark-rules) - - [Introduction](#introduction) - - [Training Algorithm Track](#training-algorithm-track) - - [Submissions](#submissions) - - [Specification](#specification) - - [Evaluation during training](#evaluation-during-training) - - [Valid submissions](#valid-submissions) - - [Tuning](#tuning) - - [External tuning ruleset](#external-tuning-ruleset) - - [Self-tuning ruleset](#self-tuning-ruleset) - - [Workloads](#workloads) - - [Fixed workloads](#fixed-workloads) - - [Randomized workloads](#randomized-workloads) - - [Qualification set](#qualification-set) - - [Scoring](#scoring) - - [Benchmarking hardware](#benchmarking-hardware) - - [Defining target performance](#defining-target-performance) - - [Benchmark score using performance profiles](#benchmark-score-using-performance-profiles) - - [Benchmark Procedure](#benchmark-procedure) - - [Model Track](#model-track) +- [Introduction](#introduction) +- [Training Algorithm Track](#training-algorithm-track) + - [Submissions](#submissions) + - [Specification](#specification) + - [Evaluation during training](#evaluation-during-training) + - [Valid submissions](#valid-submissions) + - [Tuning](#tuning) + - [External tuning ruleset](#external-tuning-ruleset) + - [Self-tuning ruleset](#self-tuning-ruleset) + - [Workloads](#workloads) + - [Fixed workloads](#fixed-workloads) + - [Randomized workloads](#randomized-workloads) + - [Qualification set](#qualification-set) + - [Scoring](#scoring) + - [Benchmarking hardware](#benchmarking-hardware) + - [Defining target performance](#defining-target-performance) + - [Benchmark score using performance profiles](#benchmark-score-using-performance-profiles) + - [Benchmark Procedure](#benchmark-procedure) +- [Model Track](#model-track) ## Introduction @@ -400,7 +399,7 @@ Our scoring procedure uses the held-out workloads only to penalize submissions t #### Qualification set -The qualification set is designed for submitters that may not have the compute resources to self-report on the full set of [fixed](#fixed-workloads) and [held-out workloads](#randomized-workloads). They may instead self-report numbers on this smaller qualification set. The best-performing submissions may then qualify for compute sponsorship offering a free evaluation on the full benchmark set and therefore the possibility to win [awards and prize money](/CALL_FOR_SUBMISSIONS.md#awards-and-prize-money). +The qualification set is designed for submitters that may not have the compute resources to self-report on the full set of [fixed](#fixed-workloads) and [held-out workloads](#randomized-workloads). They may instead self-report numbers on this smaller qualification set. The best-performing submissions may then qualify for compute sponsorship offering a free evaluation on the full benchmark set and therefore the possibility to win [awards and prize money](/SUBMISSION_PROCESS_RULES.md#awards-and-prize-money). The qualification set consists of the same [fixed workloads](#fixed-workloads) as mentioned above, except for both workloads on *ImageNet*, both workloads on *LibriSpeech*, and the *fastMRI* workload. The remaining three workloads (*WMT*, *Criteo 1TB*, and *OGBG*) form the qualification set. There are no [randomized workloads](#randomized-workloads) in the qualification set. The qualification set of workloads aims to have a combined runtime of roughly 24 hours on the [benchmarking hardware](#benchmarking-hardware). 
From 2ff243116a703a28c9db3c79fbf4ce55346442d3 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 3 Oct 2023 11:33:22 +0200 Subject: [PATCH 05/52] Multiple submissions & additional baselines --- SUBMISSION_PROCESS_RULES.md | 48 ++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/SUBMISSION_PROCESS_RULES.md b/SUBMISSION_PROCESS_RULES.md index 49910e2b1..308f11e05 100644 --- a/SUBMISSION_PROCESS_RULES.md +++ b/SUBMISSION_PROCESS_RULES.md @@ -2,28 +2,26 @@ **Version:** 0.0.1 *(Last updated 17 August 2023)* -- [MLCommons™ AlgoPerf: Submission Process Rules](#mlcommons-algoperf-submission-process-rules) - - [Basics](#basics) - - [Schedule](#schedule) - - [Dates](#dates) - - [Code freeze](#code-freeze) - - [Submission deadline](#submission-deadline) - - [Submission](#submission) - - [Register a submission](#register-a-submission) - - [How to submit](#how-to-submit) - - [Submission repository](#submission-repository) - - [Licensing](#licensing) - - [Multiple Submission](#multiple-submission) - - [Requesting Additional Baselines](#requesting-additional-baselines) - - [Scoring](#scoring) - - [Self-reporting scores](#self-reporting-scores) - - [Verifying scores](#verifying-scores) - - [Sampling held-out workloads and hyperparameters](#sampling-held-out-workloads-and-hyperparameters) - - [Leaderboard](#leaderboard) - - [Sprit jury \& challenging submissions](#sprit-jury--challenging-submissions) - - [Awards and prize money](#awards-and-prize-money) - - [Awards committee](#awards-committee) - - [Ineligibility and conflict of interest](#ineligibility-and-conflict-of-interest) +- [Basics](#basics) +- [Schedule](#schedule) + - [Dates](#dates) + - [Version freeze](#version-freeze) + - [Submission deadline](#submission-deadline) +- [Submission](#submission) + - [Register a submission](#register-a-submission) + - [How to submit](#how-to-submit) + - [Submission repository](#submission-repository) + - [Licensing](#licensing) + - [Multiple Submission](#multiple-submission) +- [Scoring](#scoring) + - [Self-reporting scores](#self-reporting-scores) + - [Verifying scores](#verifying-scores) + - [Sampling held-out workloads and hyperparameters](#sampling-held-out-workloads-and-hyperparameters) + - [Leaderboard](#leaderboard) +- [Sprit jury \& challenging submissions](#sprit-jury--challenging-submissions) +- [Awards and prize money](#awards-and-prize-money) + - [Awards committee](#awards-committee) +- [Ineligibility and conflict of interest](#ineligibility-and-conflict-of-interest) ## Basics @@ -111,11 +109,7 @@ We furthermore require all submissions to be made available open source on the s Our benchmark allows multiple submissions by the same submitter(s). However, we would like to prevent submitters from circumventing the purpose of the benchmark by, for example, submitting dozens of copies of the same submission with slightly different hyperparameters. Such a bulk submission would result in an unfair advantage on the randomized workloads and is not in the spirit of the benchmark. -We encourage multiple submissions if they differ substantially. The spirit jury will be responsible for judging whether the submissions are substantially different. This jury will apply stricter scrutiny to submitters with a larger number of submissions. In this context, a submitter refers to an individual (not the general institution or research group they belong to). The total number of submissions by a submitter is the sum of submissions they contributed to. 
- -### Requesting Additional Baselines - -Submitters can both contribute and request additional baseline algorithms. This includes existing algorithms with different search spaces or learning rate schedules. These baselines will not be eligible for winning the competition or prize money. +Submitters may submit algorithms marked as *baselines*. These might include existing algorithms with different search spaces or learning rate schedules. These baseline algorithms are not eligible for winning the competition or prize money but they are also not required to be "substantially different" from other submissions by the same submitters. ## Scoring From 3298b4362cf34423eb7428a0d4144a778407a146 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 3 Oct 2023 11:48:15 +0200 Subject: [PATCH 06/52] winning hyperparameter configuration --- SUBMISSION_PROCESS_RULES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SUBMISSION_PROCESS_RULES.md b/SUBMISSION_PROCESS_RULES.md index 308f11e05..c176017d5 100644 --- a/SUBMISSION_PROCESS_RULES.md +++ b/SUBMISSION_PROCESS_RULES.md @@ -115,7 +115,7 @@ Submitters may submit algorithms marked as *baselines*. These might include exis ### Self-reporting scores -Submitters are expected to self-report scores on the full benchmark set before the deadline for self-reporting results. Reporting the scores involves providing all unmodified logs that the benchmarking codebase automatically generates in a separate `/results` directory within the `/submission` folder. For submissions competing in the external tuning ruleset, this includes all the logs of the tuning trials using the [hyperparameter samples provided by the working group](#sampling-held-out-workloads-and-hyperparameters). Note, that while the tuning runs can be performed on non-competition hardware, they still need to show that the "winning hyperparameter" in each study was selected according to the [tuning rules](/RULES.md#external-tuning-ruleset), i.e. the fastest hyperparameter to reach the validation target. Additionally, the logs of the "winning hyperparameter" (or each trial, in the self-tuning ruleset) in each of the five studies need to be computed on the competition hardware, to allow wall-clock runtime comparisons. +Submitters are expected to self-report scores on the full benchmark set before the deadline for self-reporting results. Reporting the scores involves providing all unmodified logs that the benchmarking codebase automatically generates in a separate `/results` directory within the `/submission` folder. For submissions competing in the external tuning ruleset, this includes all the logs of the tuning trials using the [hyperparameter samples provided by the working group](#sampling-held-out-workloads-and-hyperparameters). Note, that while the tuning runs can be performed on non-competition hardware, they still need to show that the "winning hyperparameter configuration" in each study was selected according to the [tuning rules](/RULES.md#external-tuning-ruleset), i.e. the fastest hyperparameter to reach the validation target. Additionally, the logs of the "winning hyperparameter configuration" (or each trial, in the self-tuning ruleset) in each of the five studies need to be computed on the competition hardware, to allow wall-clock runtime comparisons. Submitters unable to self-fund scoring costs can instead self-report only on the [qualification set of workloads](/RULES.md#qualification-set) that excludes some of the most expensive workloads. 
Based on this performance on the qualification set, the working group will provide - as funding allows - compute to evaluate and score the most promising submissions. Additionally, we encourage researchers to reach out to the [working group](mailto:algorithms@mlcommons.org) to find potential collaborators with the resources to run larger, more comprehensive experiments for both developing and scoring submissions. From f9b50481678b6d884b62ed3864d52e3bd690246e Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 3 Oct 2023 11:53:23 +0200 Subject: [PATCH 07/52] Specify challenging submissions --- SUBMISSION_PROCESS_RULES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SUBMISSION_PROCESS_RULES.md b/SUBMISSION_PROCESS_RULES.md index c176017d5..fce116050 100644 --- a/SUBMISSION_PROCESS_RULES.md +++ b/SUBMISSION_PROCESS_RULES.md @@ -135,7 +135,7 @@ The publication of the results will contain two separate leaderboards, one for t ## Sprit jury & challenging submissions -The spirit jury will be responsible for deciding whether a submission violates the "spirit of the rules". Submitters may challenge other submissions, i.e. request a review by the spirit jury to determine whether a submission violates the rules of the competition. To challenge a submission, please write an email to with the subject "[Challenge] *submission_name*". The email needs to link to the challenged submission and include a detailed description of why the submission should be reviewed. +The spirit jury will be responsible for deciding whether a submission violates the "spirit of the rules". Submitters with specific concerns about a particular submission can request a review by the spirit jury to determine whether a submission violates the rules of the competition. To challenge a submission, please write an email to with the subject "[Challenge] *submission_name*". The email needs to link to the challenged submission and include a detailed description of why the submission should be reviewed. The spirit jury may then hear the justifications of the submitters, inspect the code, and also ask the submitters to explain how the submission was produced, for example, by disclosing their intermediate experiments. Example cases that might be reviewed by the spirit jury are cases of multiple similar submissions by the same submitter or extensive workload-specific tuning. From 0195e79ba8eae5440afb161cbda42aed3a4dfe46 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 3 Oct 2023 12:16:50 +0200 Subject: [PATCH 08/52] Prize money and challenge deadline --- SUBMISSION_PROCESS_RULES.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/SUBMISSION_PROCESS_RULES.md b/SUBMISSION_PROCESS_RULES.md index fce116050..713046d04 100644 --- a/SUBMISSION_PROCESS_RULES.md +++ b/SUBMISSION_PROCESS_RULES.md @@ -56,7 +56,7 @@ To ensure that all submitters can develop their submissions based on the same co ### Submission deadline -With the submission deadline, all submissions need to be available as a *public* repository with the appropriate license (see the [Licensing section](#licensing)). No changes to the submission code are allowed after the submission deadline (with the notable exception of specifying the batch size for the - at that point unknown - held-out workloads). Once the submission deadline has passed, the working group will publish a list of all submitted algorithms, along with their associated repositories. 
Until the deadline for challenging submissions, anyone has the right to challenge a submission, i.e. request a review by the spirit jury to determine whether a submission violates the rules of the competition, see the [Spirit jury section](#sprit-jury--challenging-submissions). +With the submission deadline, all submissions need to be available as a *public* repository with the appropriate license (see the [Licensing section](#licensing)). No changes to the submission code are allowed after the submission deadline (with the notable exception of specifying the batch size for the - at that point unknown - held-out workloads). Once the submission deadline has passed, the working group will publish a list of all submitted algorithms, along with their associated repositories. Anyone has the right to challenge a submission, i.e. request a review by the spirit jury to determine whether a submission violates the rules of the competition, see the [Spirit jury section](#sprit-jury--challenging-submissions). Directly after the submission deadline, all randomized aspects of the competition are fixed. This includes sampling the held-out workloads from the set of randomized workloads, as well as, sampling the hyperparameters for each submission in the external tuning ruleset (for more details see the [Sampling held-out workloads and hyperparameters section](#sampling-held-out-workloads-and-hyperparameters)). After that, submitters can now ascertain the appropriate batch size of their submission on each held-out workload and self-report scores on either the qualification set or the full benchmarking set of workloads including both fixed and held-out workloads (see the [Self-reporting scores section](#self-reporting-scores)). @@ -135,7 +135,7 @@ The publication of the results will contain two separate leaderboards, one for t ## Sprit jury & challenging submissions -The spirit jury will be responsible for deciding whether a submission violates the "spirit of the rules". Submitters with specific concerns about a particular submission can request a review by the spirit jury to determine whether a submission violates the rules of the competition. To challenge a submission, please write an email to with the subject "[Challenge] *submission_name*". The email needs to link to the challenged submission and include a detailed description of why the submission should be reviewed. +The spirit jury, consisting of selected active members of the working group, will be responsible for deciding whether a submission violates the "spirit of the rules". Submitters with specific concerns about a particular submission can request a review by the spirit jury to determine whether a submission violates the rules of the competition. To challenge a submission, please write an email to with the subject "[Challenge] *submission_name*". The email needs to link to the challenged submission and include a detailed description of why the submission should be reviewed. This request must be made reasonably in advance of the publication deadline to allow the Spirit Jury sufficient time to conduct a thorough review. The spirit jury may then hear the justifications of the submitters, inspect the code, and also ask the submitters to explain how the submission was produced, for example, by disclosing their intermediate experiments. Example cases that might be reviewed by the spirit jury are cases of multiple similar submissions by the same submitter or extensive workload-specific tuning. 
@@ -147,9 +147,11 @@ An awards committee will award a prize for the "*Best Performance*" in each rule The prize money for "*Best Performance*" in a ruleset is $20,000 each. The winner of the "*Jury Award*" will be awarded $10,000. We reserve the right to split the prize money and distribute it among multiple submissions. +If a submission is ineligible to win prize money it can still win an award. The prize money will then go to the highest-ranking eligible submission. + ### Awards committee -The awards committee will be responsible for awarding prize money to submissions. Members of the awards committee can suggest submissions to be considered for the awards. The committee will vote on the winning submissions, the submission with the most votes in each respective category wins the awards, and if eligible, the prize money. +The awards committee will be responsible for awarding prize money to submissions. The committee will try to reach a consensus on how to award prize money and settle disagreements by majority vote, if necessary. **TODO Who is on the Awards committee?** From e2043eaafbf761b830b7c791d9ef41c532f3b9d9 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 3 Oct 2023 12:19:09 +0200 Subject: [PATCH 09/52] Publication -> Announcement of results --- SUBMISSION_PROCESS_RULES.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/SUBMISSION_PROCESS_RULES.md b/SUBMISSION_PROCESS_RULES.md index 713046d04..7b019e8a3 100644 --- a/SUBMISSION_PROCESS_RULES.md +++ b/SUBMISSION_PROCESS_RULES.md @@ -44,7 +44,7 @@ Three additional documents complement this document: - Sampling the held-out workloads and hyperparameters: 16. January 2024 (08:00 AM UTC) - Deadline for specifying the submission batch sizes for held-out workloads: 23. January 2024 (08:00 AM UTC) - Deadline for self-reporting results: 01. March 2024 (08:00 AM UTC) -- **[extra tentative] Publication of all results: 15. April 2024 (08:00 AM UTC)** +- **[extra tentative] Announcement of all results: 15. April 2024 (08:00 AM UTC)** The presented dates are subject to change and adjustments may be made by the [MLCommmons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/). @@ -131,11 +131,11 @@ The sampling code for the held-out workloads and the hyperparameters is publicly ### Leaderboard -The publication of the results will contain two separate leaderboards, one for the self-tuning and one for the external tuning ruleset. All valid submissions will be ranked by the benchmark score, taking into account all workloads, including the held-out ones. The leaderboard will clearly mark scores that were verified by the working group. +The announcement of the results will contain two separate leaderboards, one for the self-tuning and one for the external tuning ruleset. All valid submissions will be ranked by the benchmark score, taking into account all workloads, including the held-out ones. The leaderboard will clearly mark scores that were verified by the working group. ## Sprit jury & challenging submissions -The spirit jury, consisting of selected active members of the working group, will be responsible for deciding whether a submission violates the "spirit of the rules". Submitters with specific concerns about a particular submission can request a review by the spirit jury to determine whether a submission violates the rules of the competition. To challenge a submission, please write an email to with the subject "[Challenge] *submission_name*". 
The email needs to link to the challenged submission and include a detailed description of why the submission should be reviewed. This request must be made reasonably in advance of the publication deadline to allow the Spirit Jury sufficient time to conduct a thorough review. +The spirit jury, consisting of selected active members of the working group, will be responsible for deciding whether a submission violates the "spirit of the rules". Submitters with specific concerns about a particular submission can request a review by the spirit jury to determine whether a submission violates the rules of the competition. To challenge a submission, please write an email to with the subject "[Challenge] *submission_name*". The email needs to link to the challenged submission and include a detailed description of why the submission should be reviewed. This request must be made reasonably in advance of the results announcement deadline to allow the Spirit Jury sufficient time to conduct a thorough review. The spirit jury may then hear the justifications of the submitters, inspect the code, and also ask the submitters to explain how the submission was produced, for example, by disclosing their intermediate experiments. Example cases that might be reviewed by the spirit jury are cases of multiple similar submissions by the same submitter or extensive workload-specific tuning. From 0f106a35fcb6b2982d87a16af336c9e5cd87b215 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 3 Oct 2023 12:22:22 +0200 Subject: [PATCH 10/52] Remove todo for spirit jury --- SUBMISSION_PROCESS_RULES.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/SUBMISSION_PROCESS_RULES.md b/SUBMISSION_PROCESS_RULES.md index 7b019e8a3..ff65bf3c8 100644 --- a/SUBMISSION_PROCESS_RULES.md +++ b/SUBMISSION_PROCESS_RULES.md @@ -139,8 +139,6 @@ The spirit jury, consisting of selected active members of the working group, wil The spirit jury may then hear the justifications of the submitters, inspect the code, and also ask the submitters to explain how the submission was produced, for example, by disclosing their intermediate experiments. Example cases that might be reviewed by the spirit jury are cases of multiple similar submissions by the same submitter or extensive workload-specific tuning. -**TODO Who is on the Jury?** - ## Awards and prize money An awards committee will award a prize for the "*Best Performance*" in each ruleset as well as a "*Jury Award*". The prize for the best-performing submission will take into account the [benchmark score](RULES.md#benchmark-score-using-performance-profiles) on the full benchmark. The "*Jury Award*" will favor more out-of-the-box ideas that show great potential, even though the method may not be of practical value with the current landscape of models, software, etc. 
From e3f445d39f33328c2acf1f40f51d895fd1ccf3a6 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 3 Oct 2023 13:50:15 +0200 Subject: [PATCH 11/52] Update dates --- SUBMISSION_PROCESS_RULES.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/SUBMISSION_PROCESS_RULES.md b/SUBMISSION_PROCESS_RULES.md index ff65bf3c8..2d7a891bd 100644 --- a/SUBMISSION_PROCESS_RULES.md +++ b/SUBMISSION_PROCESS_RULES.md @@ -1,6 +1,6 @@ # MLCommons™ AlgoPerf: Submission Process Rules -**Version:** 0.0.1 *(Last updated 17 August 2023)* +**Version:** 0.0.2 *(Last updated 03 Oktober 2023)* - [Basics](#basics) - [Schedule](#schedule) @@ -37,14 +37,14 @@ Three additional documents complement this document: ### Dates -- **Publication of the call for submission: 08. September 2023 (08:00 AM UTC)** -- Registration deadline for submissions: 15. November 2023 (08:00 AM UTC) -- Version freeze for the benchmark codebase: 01. December 2023 (08:00 AM UTC) -- **Submission deadline: 15. January 2024 (08:00 AM UTC)** -- Sampling the held-out workloads and hyperparameters: 16. January 2024 (08:00 AM UTC) -- Deadline for specifying the submission batch sizes for held-out workloads: 23. January 2024 (08:00 AM UTC) -- Deadline for self-reporting results: 01. March 2024 (08:00 AM UTC) -- **[extra tentative] Announcement of all results: 15. April 2024 (08:00 AM UTC)** +- **Publication of the call for submission: 17. Oktober 2023 (08:00 AM UTC)** +- Registration deadline for submissions: 15. December 2023 (08:00 AM UTC) +- Version freeze for the benchmark codebase: 17. January 2024 (08:00 AM UTC) +- **Submission deadline: 15. February 2024 (08:00 AM UTC)** +- Sampling the held-out workloads and hyperparameters: 16. February 2024 (08:00 AM UTC) +- Deadline for specifying the submission batch sizes for held-out workloads: 28. February 2024 (08:00 AM UTC) +- Deadline for self-reporting results: 10. April 2024 (08:00 AM UTC) +- **[extra tentative] Announcement of all results: 22. May 2024 (08:00 AM UTC)** The presented dates are subject to change and adjustments may be made by the [MLCommmons Algorithms Working Group](https://mlcommons.org/en/groups/research-algorithms/). 
From 4b18cb7df28d1b453932f15743df02d350a103e9 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 3 Oct 2023 14:05:07 +0200 Subject: [PATCH 12/52] Add CfS placeholder --- CALL_FOR_SUBMISSIONS.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 CALL_FOR_SUBMISSIONS.md diff --git a/CALL_FOR_SUBMISSIONS.md b/CALL_FOR_SUBMISSIONS.md new file mode 100644 index 000000000..ecc7840e7 --- /dev/null +++ b/CALL_FOR_SUBMISSIONS.md @@ -0,0 +1,3 @@ +# MLCommons™ AlgoPerf: Call for Submissions + +🚧 **Coming soon!** 🚧 From 155101800af0eb31baf6bf3cf155a2a1e79201a6 Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 3 Oct 2023 14:11:51 +0200 Subject: [PATCH 13/52] Formatting --- CONTRIBUTING.md | 146 +++++++++++++++++++++++++++++++----------------- 1 file changed, 95 insertions(+), 51 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 33a14f83c..771e77f0a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,28 @@ -# Contributing +# MLCommons™ AlgoPerf: Contributing + +## Table of Contents + +- [Setup](#setup) + - [Setting up a Linux VM on GCP](#setting-up-a-linux-vm-on-gcp) + - [Installing GPU Drivers](#installing-gpu-drivers) + - [Authentication for Google Cloud Container Registry](#authentication-for-google-cloud-container-registry) +- [Installation](#installation) +- [Docker workflows](#docker-workflows) + - [Pre-built Images on Google Cloud Container Registry](#pre-built-images-on-google-cloud-container-registry) + - [Trigger rebuild and push of maintained images](#trigger-rebuild-and-push-of-maintained-images) + - [Trigger build and push of images on other branch](#trigger-build-and-push-of-images-on-other-branch) + - [GCP Data and Experiment Integration](#gcp-data-and-experiment-integration) + - [Downloading Data from GCP](#downloading-data-from-gcp) + - [Saving Experiments to GCP](#saving-experiments-to-gcp) + - [Getting Information from a Container](#getting-information-from-a-container) + - [Mounting Local Repository](#mounting-local-repository) +- [Submitting PRs](#submitting-prs) +- [Testing](#testing) + - [Style Testing](#style-testing) + - [Unit and integration tests](#unit-and-integration-tests) + - [Regression tests](#regression-tests) + +We invite everyone to look through our rules and codebase and submit issues and pull requests, e.g. for rules changes, clarifications, or any bugs you might encounter. If you are interested in contributing to the work of the working group and influence the benchmark's design decisions, please [join the weekly meetings](https://mlcommons.org/en/groups/research-algorithms/) and consider becoming a member of the working group. The best way to contribute to the MLCommons is to get involved with one of our many project communities. You find more information about getting involved with MLCommons [here](https://mlcommons.org/en/get-involved/#getting-started). @@ -8,29 +32,25 @@ To get started contributing code, you or your organization needs to sign the MLC MLCommons project work is tracked with issue trackers and pull requests. Modify the project in your own fork and issue a pull request once you want other developers to take a look at what you have done and discuss the proposed changes. Ensure that cla-bot and other checks pass for your Pull requests. 
-# Table of Contents -- [Setup](#setup) -- [Installation](#installation) -- [Docker workflows](#docker-workflows) -- [Submitting PRs](#submitting-prs) -- [Testing](#testing) +## Setup +### Setting up a Linux VM on GCP -# Setup -## Setting up a Linux VM on GCP If you want to run containers on GCP VMs or store and retrieve Docker images from the Google Cloud Container Registry, please read ahead. If you'd like to use a Linux VM, you will have to install the correct GPU drivers and the NVIDIA Docker toolkit. We recommmend to use the Deep Learning on Linux image. Further instructions are based on that. ### Installing GPU Drivers + You can use the `scripts/cloud-startup.sh` as a startup script for the VM. This will automate the installation of the NVIDIA GPU Drivers and NVIDIA Docker toolkit. ### Authentication for Google Cloud Container Registry + To access the Google Cloud Container Registry, you will have to authenticate to the repository whenever you use Docker. Use the gcloud credential helper as documented [here](https://cloud.google.com/artifact-registry/docs/docker/pushing-and-pulling#cred-helper). +## Installation -# Installation If you have not installed the package and dependencies yet see [Installation](./README.md#installation). To use the development tools such as `pytest` or `pylint` use the `dev` option: @@ -42,39 +62,34 @@ pre-commit install To get an installation with the requirements for all workloads and development, use the argument `[full_dev]`. +## Docker workflows +We recommend developing in our Docker image to ensure a consistent environment between developing, testing and scoring submissions. -# Docker workflows -We recommend developing in our Docker image to ensure a consistent environment between developing, testing and scoring submissions. +To get started see also: -To get started see: -- [Installation with Docker](./README.md#docker) +- [Installation with Docker](./README.md#docker) - [Running a submission inside a Docker Container](./getting_started.md#run-your-submission-in-a-docker-container) -Other resources: -- [Pre-built Images on Google Cloud Container Registry](#pre-built-images-on-google-cloud-container-registry) -- [GCP Data and Experiment Integration](#gcp-integration) - - [Downloading Data from GCP](#downloading-data-from-gcp) - - [Saving Experiments Results to GCP](#saving-experiments-to-gcp) -- [Getting Information from a Container](#getting-information-from-a-container) -- [Mounting local repository](#mounting-local-repository) +### Pre-built Images on Google Cloud Container Registry - -## Pre-built Images on Google Cloud Container Registry If you want to maintain or use images stored on our Google Cloud Container Registry read this section. You will have to use an authentication helper to set up permissions to access the repository: -``` + +```bash ARTIFACT_REGISTRY_URL=us-central1-docker.pkg.dev gcloud auth configure-docker $ARTIFACT_REGISTRY_URL ``` To pull the latest prebuilt image: -``` +```bash docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/ ``` -The naming convention for `image_name` is `algoperf__`. + +The naming convention for `image_name` is `algoperf__`. Currently maintained images on the repository are: + - `algoperf_jax_main` - `algoperf_pytorch_main` - `algoperf_both_main` @@ -82,32 +97,40 @@ Currently maintained images on the repository are: - `algoperf_pytorch_dev` - `algoperf_both_dev` -To reference the pulled image you will have to use the full `image_path`, e.g. 
+To reference the pulled image you will have to use the full `image_path`, e.g. `us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main`. ### Trigger rebuild and push of maintained images + To build and push all images (`pytorch`, `jax`, `both`) on maintained branches (`dev`, `main`). -``` + +```bash bash docker/build_docker_images.sh -b ``` #### Trigger build and push of images on other branch -You can also use the above script to build images from a different branch. + +You can also use the above script to build images from a different branch. + 1. Push the branch to `mlcommons/algorithmic-efficiency` repository. 2. Run - ``` + + ```bash bash docker/build_docker_images.sh -b ``` -## GCP Data and Experiment Integration -The Docker entrypoint script can transfer data to and from +### GCP Data and Experiment Integration + +The Docker entrypoint script can transfer data to and from our GCP buckets on our internal GCP project. If -you are an approved contributor you can get access to these resources to automatically download the datasets and upload experiment results. +you are an approved contributor you can get access to these resources to automatically download the datasets and upload experiment results. You can use these features by setting the `--internal_contributor` flag to 'true' for the Docker entrypoint script. ### Downloading Data from GCP + To run a docker container that will only download data (if not found on host) -``` + +```bash docker run -t -d \ -v $HOME/data/:/data/ \ -v $HOME/experiment_runs/:/experiment_runs \ @@ -120,15 +143,18 @@ docker run -t -d \ --keep_container_alive \ --internal_contributor true ``` + If `keep_container_alive` is `true` the main process on the container will persist after finishing the data download. -This run command is useful if you are developing or debugging. +This run command is useful if you are developing or debugging. ### Saving Experiments to GCP + If you set the internal collaborator mode to true experiments will also be automatically uploaded to our GCP bucket under `gs://mlcommons-runs/ ``` To enter a bash session in the container -``` + +```bash docker exec -it /bin/bash ``` -## Mounting Local Repository +### Mounting Local Repository + Rebuilding the docker image can become tedious if you are making frequent changes to the code. -To have changes in your local copy of the algorithmic-efficiency repo be reflected inside the container you can mount the local repository with the `-v` flag. -``` +To have changes in your local copy of the algorithmic-efficiency repo be reflected inside the container you can mount the local repository with the `-v` flag. + +```bash docker run -t -d \ -v $HOME/data/:/data/ \ -v $HOME/experiment_runs/:/experiment_runs \ @@ -178,33 +210,40 @@ docker run -t -d \ --keep_container_alive true ``` -# Submitting PRs +## Submitting PRs + New PRs will be merged on the dev branch by default, given that they pass the presubmits. -# Testing +## Testing + We run tests with GitHub Actions, configured in the [.github/workflows](https://github.com/mlcommons/algorithmic-efficiency/tree/main/.github/workflows) folder. -## Style Testing +### Style Testing + We run yapf and linting tests on PRs. You can view and fix offending errors with these instructions. To run the below commands, use the versions installed via `pip install -e '.[dev]'`. 
To automatically fix formatting errors, run the following (*WARNING:* this will edit your code, so it is suggested to make a git commit first!):
+
```bash
yapf -i -r -vv -p algorithmic_efficiency baselines datasets reference_algorithms tests *.py
```

To sort all import orderings, run the following:
+
```bash
isort .
```

To just print out all offending import orderings, run the following:
+
```bash
isort . --check --diff
```

To print out all offending pylint issues, run the following:
+
```bash
pylint algorithmic_efficiency
pylint baselines
@@ -214,20 +253,25 @@ pylint submission_runner.py
pylint tests
```

-## Unit and integration tests
-We run unit tests and integration tests as part of the GitHub Actions as well.
+### Unit and integration tests
+
+We run unit tests and integration tests as part of the GitHub Actions as well.
You can also use `python tests/reference_algorithm_tests.py` to run a single model update and two model evals for each workload using the reference algorithm in `reference_algorithms/development_algorithms/`.

-## Regression tests
+### Regression tests
+
We also have regression tests available in [.github/workflows/regression_tests.yml](https://github.com/mlcommons/algorithmic-efficiency/tree/main/.github/workflows/regression_tests.yml) that can be run semi-automatically.
-The regression tests are shorter end-to-end submissions run in a containerized environment across all 8 workloads, in both the JAX and PyTorch frameworks.
+The regression tests are shorter end-to-end submissions run in a containerized environment across all 8 workloads, in both the JAX and PyTorch frameworks.
The regression tests run on self-hosted runners and are triggered for pull requests that target the main branch. Typically, these PRs will be from the `dev` branch, so the tests will run containers based on images built from the `dev` branch.
To run a regression test:
+
1. Build and upload the latest Docker images from the `dev` branch.
-   ```
+
+   ```bash
   bash ~/algorithmic-efficiency/docker/build_docker_images.sh -b dev
   ```
+
2. Turn on the self-hosted runner.
3. Run the self-hosted runner application for the runner to accept jobs.
4. Open a pull request into main to trigger the workflow.

From ca921d184cd1a7d2fb787c130a282d587c462858 Mon Sep 17 00:00:00 2001
From: Frank Schneider
Date: Tue, 3 Oct 2023 14:12:03 +0200
Subject: [PATCH 14/52] Formatting
---
 SUBMISSION_PROCESS_RULES.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/SUBMISSION_PROCESS_RULES.md b/SUBMISSION_PROCESS_RULES.md
index 2d7a891bd..19e51d4d8 100644
--- a/SUBMISSION_PROCESS_RULES.md
+++ b/SUBMISSION_PROCESS_RULES.md
@@ -2,6 +2,8 @@ # MLCommons™ AlgoPerf: Submission Process Rules

**Version:** 0.0.2 *(Last updated 03 October 2023)*

+## Table of Contents
+
- [Basics](#basics)
- [Schedule](#schedule)
  - [Dates](#dates)

From 60bdde21ec13298f3ad5fc9bb3a199ec65b592ff Mon Sep 17 00:00:00 2001
From: Frank Schneider
Date: Tue, 3 Oct 2023 14:12:40 +0200
Subject: [PATCH 15/52] Formatting & increment
---
 RULES.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/RULES.md b/RULES.md
index 7225a76b0..b944ecc94 100644
--- a/RULES.md
+++ b/RULES.md
@@ -1,10 +1,12 @@
# MLCommons™ AlgoPerf: Benchmark Rules

-**Version:** 0.0.17 *(Last updated 10 August 2023)*
+**Version:** 0.0.18 *(Last updated 03 October 2023)*

> **TL;DR** New training algorithms and models can make neural net training faster.
> We need a rigorous training time benchmark that measures time to result given a fixed hardware configuration and stimulates algorithmic progress.
We propose a [Training Algorithm Track](#training-algorithm-track) and a [Model Track](#model-track) in order to help disentangle optimizer improvements and model architecture improvements. This two-track structure lets us enforce a requirement that new optimizers work well on multiple models and that new models aren't highly specific to particular training hacks.

+## Table of Contents
+
- [Introduction](#introduction)
- [Training Algorithm Track](#training-algorithm-track)
  - [Submissions](#submissions)

From a8506ddbf83718c95a2d6488cdf4da3be5f25049 Mon Sep 17 00:00:00 2001
From: Frank Schneider
Date: Tue, 3 Oct 2023 14:13:58 +0200
Subject: [PATCH 16/52] Formatting
---
 GETTING_STARTED.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md
index 8cab3959c..1369f5cc7 100644
--- a/GETTING_STARTED.md
+++ b/GETTING_STARTED.md
@@ -1,4 +1,6 @@
-# Getting Started
+# MLCommons™ AlgoPerf: Getting Started
+
+## Table of Contents

- [Set up and installation](#set-up-and-installation)
- [Download the data](#download-the-data)

From c6e52d73bb55ca5eb85d1689e65a01d6c342e33b Mon Sep 17 00:00:00 2001
From: Frank Schneider
Date: Tue, 3 Oct 2023 14:15:35 +0200
Subject: [PATCH 17/52] Update ToC
---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index 86d196208..58a62ebd5 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@
  - [Building Docker Image](#building-docker-image)
  - [Running Docker Container (Interactive)](#running-docker-container-interactive)
  - [Running Docker Container (End-to-end)](#running-docker-container-end-to-end)
+  - [Using Singularity/Apptainer instead of Docker](#using-singularityapptainer-instead-of-docker)
- [Getting Started](#getting-started)
  - [Running a workload](#running-a-workload)
    - [JAX](#jax)
@@ -157,22 +158,29 @@ To use the Docker container as an interactive virtual environment, you can run a
To run a submission end-to-end in a containerized environment, see the [Getting Started Document](./getting_started.md#run-your-submission-in-a-docker-container).

### Using Singularity/Apptainer instead of Docker
+
Since many compute clusters don't allow the usage of Docker due to security concerns and instead encourage the use of [Singularity/Apptainer](https://github.com/apptainer/apptainer) (formerly Singularity, now called Apptainer), we also provide instructions on how to build an Apptainer container based on the Dockerfile provided here.
To convert the Dockerfile into an Apptainer definition file, we will use [spython](https://github.com/singularityhub/singularity-cli):
+
```bash
pip3 install spython
cd algorithmic-efficiency/docker
spython recipe Dockerfile &> Singularity.def
```
+
Now we can build the Apptainer image by running
+
```bash
singularity build --fakeroot <singularity_image_name>.sif Singularity.def
```
+
To start a shell session with GPU support (by using the `--nv` flag), we can run
+
```bash
singularity shell --nv <singularity_image_name>.sif
```
+
## Getting Started From 5a39cf8998c3c8c09a3e9b7953b056484b4563dd Mon Sep 17 00:00:00 2001 From: Frank Schneider Date: Tue, 3 Oct 2023 14:25:52 +0200 Subject: [PATCH 18/52] Update rules to exclude test set in scoring --- RULES.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/RULES.md b/RULES.md index b944ecc94..d74525244 100644 --- a/RULES.md +++ b/RULES.md @@ -50,7 +50,7 @@ For a description of how to submit a training algorithm to the AlgoPerf: Trainin ### Submissions -A valid submission is a piece of code that defines all of the submission functions and is able to train all benchmark workloads on the [benchmarking hardware](#benchmarking-hardware) (defined in the [Scoring](#scoring) section). Both the validation set and the test set performance will be checked regularly during training (see the [Evaluation during training](#evaluation-during-training) section). Training halts when the workload-specific [target errors](#defining-target-performance) for the validation and test sets have been reached. For each workload, the training time to reach the *test* set target error is used as input to the [scoring process](#scoring) for the submission. Submissions using [external tuning](#external-tuning-ruleset) will be tuned independently for each workload using a single workload-agnostic search space for their specified hyperparameters. The tuning trials are selected based on the time to reach the *validation* target, but only their training times to reach the *test* target will be used for scoring. Submissions under either tuning ruleset may always self-tune while on the clock. +A valid submission is a piece of code that defines all of the submission functions and is able to train all benchmark workloads on the [benchmarking hardware](#benchmarking-hardware) (defined in the [Scoring](#scoring) section). Both the validation set and the test set performance will be checked regularly during training (see the [Evaluation during training](#evaluation-during-training) section), however, only the validation performance is relevant for scoring. Training halts when the workload-specific [target errors](#defining-target-performance) for the validation and test sets have been reached. For each workload, only the training time to reach the *validation* set target error is used as input to the [scoring process](#scoring) for the submission. Submissions using [external tuning](#external-tuning-ruleset) will be tuned independently for each workload using a single workload-agnostic search space for their specified hyperparameters. The tuning trials are selected based on the time to reach the *validation* target. Submissions under either tuning ruleset may always self-tune while on the clock. #### Specification @@ -356,17 +356,17 @@ Tuning will be substantially different for the [external](#external-tuning-rules For each workload, the hyperparameters are tuned using $O=20$ tuning **trials**. To estimate the variance of the results, this tuning will be repeated for $S=5$ **studies**, for a total of $S\cdot O = 100$ different hyperparameter settings. The submitters will provide a workload-agnostic search space and the working group will then return $100$ hyperparameters settings obtained using [(quasi)random search](https://arxiv.org/abs/1706.03200). The working group will also randomly partition these $100$ trials into $5$ studies of $20$ trials each. 
In lieu of independent samples from a search space, submissions can instead supply a fixed list of $20$ hyper-parameter points that will be sampled without replacement. -In each trial, the tuning trial with the fastest training time to achieve the *validation target* is determined among the $O=20$ hyperparameter settings. For scoring, however, we use the training time to reach the *test targets* of those $5$ selected runs. The median of these $5$ per-study training times will be the final training time for the submission on this workload and is used in the scoring procedure (see the "[Scoring submissions](#scoring)" section). In other words: We use the *validation performance* for tuning and selecting the best hyperparameter but use the *test performance* when measuring the training speed. Runs that do not reach the target performance of the evaluation metric have an infinite time. Submissions are always free to perform additional self-tuning while being timed. +In each trial, the tuning trial with the fastest training time to achieve the *validation target* is determined among the $O=20$ hyperparameter settings. For scoring, we use this required training time to reach the *validation targets* of those $5$ selected runs. The median of these $5$ per-study training times will be the final training time for the submission on this workload and is used in the scoring procedure (see the "[Scoring submissions](#scoring)" section). Runs that do not reach the target performance of the evaluation metric have an infinite time. Submissions are always free to perform additional self-tuning while being timed. #### Self-tuning ruleset Submissions to this ruleset are not allowed to have user-defined hyperparameters. This ruleset allows both submissions that use the same hyperparameters for all workloads, including the randomized ones (e.g. Adam with default parameters), as well as submissions that perform inner-loop tuning during their training run (e.g. SGD with line searches). -Submissions will run on one instance of the [benchmarking hardware](#benchmarking-hardware). As always, submissions are allowed to perform inner-loop tuning (e.g. for their learning rate) but the tuning efforts will be part of their score. A submission will run *S=5* times and its score will be the median time to reach the target evaluation metric value on the test set. To account for the lack of external tuning, submissions have a longer time budget to reach the target performance. Compared to the [external tuning ruleset](#external-tuning-ruleset), the `max_runtime` is tripled. Runs that do not reach the target performance of the evaluation metric within this allotted time budget have an infinite time. +Submissions will run on one instance of the [benchmarking hardware](#benchmarking-hardware). As always, submissions are allowed to perform inner-loop tuning (e.g. for their learning rate) but the tuning efforts will be part of their score. A submission will run *S=5* times and its score will be the median time to reach the target evaluation metric value on the validation set. To account for the lack of external tuning, submissions have a longer time budget to reach the target performance. Compared to the [external tuning ruleset](#external-tuning-ruleset), the `max_runtime` is tripled. Runs that do not reach the target performance of the evaluation metric within this allotted time budget have an infinite time. 
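To make the aggregation described above concrete, here is a minimal, illustrative sketch of the per-workload timing logic under the external tuning ruleset. It is not part of the benchmark codebase, and the data layout and function names are invented for illustration only:

```python
# Illustrative sketch only -- not part of the benchmark codebase.
# Assumed (invented) data layout: each study is a list of trials, and each trial
# is a dict with "reached_validation_target" (bool) and
# "time_to_validation_target" (seconds).
import math
from statistics import median


def per_study_time(trials):
    """Training time of the winning trial of one study: the fastest trial to
    reach the validation target, or infinity if no trial reached it."""
    times = [
        trial["time_to_validation_target"]
        for trial in trials
        if trial["reached_validation_target"]
    ]
    return min(times) if times else math.inf


def workload_training_time(studies):
    """External tuning ruleset: the median of the S=5 per-study training times
    is the submission's training time on this workload."""
    return median(per_study_time(trials) for trials in studies)
```

Under the self-tuning ruleset, the same median would simply be taken over the five timed runs of the submission.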
### Workloads -For the purposes of the Training Algorithm Track, we consider a workload the combination of a `dataset`, `model`, `loss_fn`, along with a target that is defined over some evaluation metric. E.g., ResNet50 on ImageNet using the cross-entropy loss until a target error of 34.6% on the test set has been reached, would constitute a workload. The evaluation metric, in this example the misclassification error rate, is directly implied by the dataset/task. +For the purposes of the Training Algorithm Track, we consider a workload the combination of a `dataset`, `model`, `loss_fn`, along with a target that is defined over some evaluation metric. E.g., ResNet50 on ImageNet using the cross-entropy loss until a target error of 22.6% on the validation set has been reached, would constitute a workload. The evaluation metric, in this example the misclassification error rate, is directly implied by the dataset/task. Submissions will be scored based on their performance on the [fixed workload](#fixed-workloads). However, additionally submissions must also perform resonably well on a set of [held-out workloads](#randomized-workloads) in order for their score on the fixed workload to count (for full details see the [Scoring](#scoring) section). These held-out workloads will be generated after the submission deadline, but their randomized generating process is publicly available with the call for submissions (see "[Randomized workloads](#randomized-workloads)" section). @@ -409,9 +409,9 @@ For the [external tuning ruleset](#external-tuning-ruleset), we will only use $1 ### Scoring -Submissions will be scored based on their required training time to reach the target performance on the test set of each workload. This target performance metric can be the same as the loss function but might also be a different workload-specific metric such as the error rate or BLEU score. The target performance was defined using four standard training algorithms, see the "[Defining target performance](#defining-target-performance)" section for more details. The training time of a submission includes the compilation times for computation graphs and ops that could happen just-in-time during training; all our benchmarks should be fast enough to compile so as not to dramatically impact overall performance. The overall ranking is then determined by summarizing the performances across all [fixed workloads](#fixed-workloads), using [performance profiles](#benchmark-score-using-performance-profiles), as explained below. +Submissions will be scored based on their required training time to reach the target performance on the validation set of each workload. This target performance metric can be the same as the loss function but might also be a different workload-specific metric such as the error rate or BLEU score. The target performance was defined using four standard training algorithms, see the "[Defining target performance](#defining-target-performance)" section for more details. The training time of a submission includes the compilation times for computation graphs and ops that could happen just-in-time during training; all our benchmarks should be fast enough to compile so as not to dramatically impact overall performance. The overall ranking is then determined by summarizing the performances across all [fixed workloads](#fixed-workloads), using [performance profiles](#benchmark-score-using-performance-profiles), as explained below. 
-While the training time to the *test set* target is used for scoring, we use the training time to the *validation set* target for tuning. This is only relevant for submissions in the [external tuning ruleset](#external-tuning-ruleset) but is also enforced for self-reported results (i.e. submissions in the self-reported ruleset must also reach the validation target in time but only the time to the test target is used for scoring). Submitters must select the hyperparameter setting that reached the *validation* target the fastest, irrespective of its training time to achieve the *test* target. This ensures a fair and practical procedure. +The training time until the target performance on the test set was reached is not used in the scoring procedure but might be used for additional analysis of the competition results. #### Benchmarking hardware @@ -430,7 +430,7 @@ Both [tuning rulesets](#tuning) will use the same target performances. The runti We will aggregate the training times of a submission on all fixed workloads using [Performance Profiles](http://www.argmin.net/2018/03/26/performance-profiles/) (originally from [Dolan and Moré](https://arxiv.org/abs/cs/0102001)). Below we surface several relevant definitions from their work for easier readability, before explaining how we integrate the performance profiles to reach a scalar benchmark score that will be used for ranking submissions. -*Notation:* We have a set $\mathcal{S} = \{s_1, s_2, \dots, s_k\}$ of in total $k$ submissions that we evaluate on a set of $n$ fixed workloads: $\mathcal{W} = \{w_1, w_2, \dots, w_n\}$. For each submission $s$ and each workload $w$ we have a training time score $t_{s,w} \in [0,\infty)$. This is the time it took the submission to reach the test target performance on this particular workload. +*Notation:* We have a set $\mathcal{S} = \{s_1, s_2, \dots, s_k\}$ of in total $k$ submissions that we evaluate on a set of $n$ fixed workloads: $\mathcal{W} = \{w_1, w_2, \dots, w_n\}$. For each submission $s$ and each workload $w$ we have a training time score $t_{s,w} \in [0,\infty)$. This is the time it took the submission to reach the validation target performance on this particular workload. ##### Computing performance ratios @@ -466,10 +466,10 @@ The integral is normalized by the total integration area, with higher benchmark For the benchmark score, we compute and integrate the performance profiles using the training times of only the fixed workloads. But we use the submission's performance on the held-out workloads to penalize submissions. Specifically, if a submission is unable to train a held-out workload, we score the submission on the corresponding fixed workload as if that submission did not reach the target. In other words, for a submission to receive a finite training time on a fixed workload, it needs to: -- Reach the validation and test target on the fixed workload within the maximum runtime. -- Reach the validation and test target fixed workload within 4x of the fastest submission. -- Reach the validation and test target on the held-out workload (corresponding to the fixed workload) within the maximum runtime. -- Reach the validation and test target on the held-out workload (corresponding to the fixed workload) within 4x of the fastest submission. To determine the fastest submission on a held-out workload, we only consider submissions that reached the target on the corresponding fixed workload. 
This protects us against extremely fast submissions that only work on a specific held-out workload and are useless as general algorithms.
+- Reach the validation target on the fixed workload within the maximum runtime.
+- Reach the validation target on the fixed workload within 4x of the fastest submission.
+- Reach the validation target on the held-out workload (corresponding to the fixed workload) within the maximum runtime.
+- Reach the validation target on the held-out workload (corresponding to the fixed workload) within 4x of the fastest submission. To determine the fastest submission on a held-out workload, we only consider submissions that reached the target on the corresponding fixed workload. This protects us against extremely fast submissions that only work on a specific held-out workload and are useless as general algorithms.
 
 Only if all four requirements are met, does the submission get a finite score. Otherwise, a submission will receive a training time of infinity.
 

From f0e280a3f0797838545b1a78250c67fa46c27565 Mon Sep 17 00:00:00 2001
From: Frank Schneider
Date: Tue, 3 Oct 2023 18:54:03 +0200
Subject: [PATCH 19/52] Add link to Google Form

---
 SUBMISSION_PROCESS_RULES.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SUBMISSION_PROCESS_RULES.md b/SUBMISSION_PROCESS_RULES.md
index 2d7a891bd..0049ac3a1 100644
--- a/SUBMISSION_PROCESS_RULES.md
+++ b/SUBMISSION_PROCESS_RULES.md
@@ -70,7 +70,7 @@ In the following, we describe the logistical steps required to submit a training
 
 All submitters need to register an intent to submit before the submission registration deadline. This registration is mandatory, i.e. required for all submissions, but not binding, i.e. you don't have to submit a registered submission. This registration is necessary, to estimate the number of submissions and provide support for potential submitters.
 
-To register a submission, please write an email to with the subject "[Registration] *submission_name*" and the following information:
+To register a submission, please fill out this [online form](https://forms.gle/iY1bUhwSjj1JZ4fa9) with the following information
 
 - Name of the submission (e.g. name of the algorithm, or any other arbitrary identifier).
 - Ruleset under which the submission will be scored.

From e7a907cf21770eabf8bc4520983b9c9e2c6c5995 Mon Sep 17 00:00:00 2001
From: Frank Schneider
Date: Tue, 10 Oct 2023 16:18:10 +0200
Subject: [PATCH 20/52] Rename Jury Award

---
 SUBMISSION_PROCESS_RULES.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/SUBMISSION_PROCESS_RULES.md b/SUBMISSION_PROCESS_RULES.md
index 0049ac3a1..1fc5e7061 100644
--- a/SUBMISSION_PROCESS_RULES.md
+++ b/SUBMISSION_PROCESS_RULES.md
@@ -1,6 +1,6 @@
 # MLCommons™ AlgoPerf: Submission Process Rules
 
-**Version:** 0.0.2 *(Last updated 03 Oktober 2023)*
+**Version:** 0.0.3 *(Last updated 10 October 2023)*
 
 - [Basics](#basics)
 - [Schedule](#schedule)
@@ -37,7 +37,7 @@ Three additional documents complement this document:
 
 ### Dates
 
-- **Publication of the call for submission: 17. Oktober 2023 (08:00 AM UTC)**
+- **Publication of the call for submission: 17. October 2023 (08:00 AM UTC)**
 - Registration deadline for submissions: 15. December 2023 (08:00 AM UTC)
 - Version freeze for the benchmark codebase: 17. January 2024 (08:00 AM UTC)
 - **Submission deadline: 15. February 2024 (08:00 AM UTC)**
@@ -141,9 +141,9 @@ The spirit jury may then hear the justifications of the submitters, inspect the
 
 ## Awards and prize money
 
-An awards committee will award a prize for the "*Best Performance*" in each ruleset as well as a "*Jury Award*". The prize for the best-performing submission will take into account the [benchmark score](RULES.md#benchmark-score-using-performance-profiles) on the full benchmark. The "*Jury Award*" will favor more out-of-the-box ideas that show great potential, even though the method may not be of practical value with the current landscape of models, software, etc.
+An awards committee will award a prize for the "*Best Performance*" in each ruleset as well as an "*Innovative Submission Award*". The prize for the best-performing submission will take into account the [benchmark score](RULES.md#benchmark-score-using-performance-profiles) on the full benchmark. The "*Innovative Submission Award*" will favor more out-of-the-box ideas that show great potential, even though the method may not be of practical value with the current landscape of models, software, etc.
 
-The prize money for "*Best Performance*" in a ruleset is $20,000 each. The winner of the "*Jury Award*" will be awarded $10,000. We reserve the right to split the prize money and distribute it among multiple submissions.
+The prize money for "*Best Performance*" in a ruleset is $20,000 each. The winner of the "*Innovative Submission Award*" will be awarded $10,000. We reserve the right to split the prize money and distribute it among multiple submissions.
 
 If a submission is ineligible to win prize money it can still win an award. The prize money will then go to the highest-ranking eligible submission.

From 7976442ccf7ad7cc0f74d8d6e906cf70e89f02fb Mon Sep 17 00:00:00 2001
From: Frank Schneider
Date: Tue, 10 Oct 2023 16:19:30 +0200
Subject: [PATCH 21/52] specify ineligible entities and associated institutions

---
 SUBMISSION_PROCESS_RULES.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/SUBMISSION_PROCESS_RULES.md b/SUBMISSION_PROCESS_RULES.md
index 1fc5e7061..a07f664b1 100644
--- a/SUBMISSION_PROCESS_RULES.md
+++ b/SUBMISSION_PROCESS_RULES.md
@@ -157,10 +157,10 @@ The awards committee will be responsible for awarding prize money to submissions
 
 To ensure a fair process and avoid conflicts of interest, some individuals and institutions are ineligible to win prize money. This includes:
 
-- The chairs of the MLCommons Algorithms Working Group (presently *George Dahl* and *Frank Schneider*) and their institutions (currently *Google Inc.* and the *University of Tübingen*)
-- All individuals serving on the awards committee and their institutions.
+- The chairs of the MLCommons Algorithms Working Group (presently *George Dahl* and *Frank Schneider*) and their associated institutions (currently *Google Inc.* and the *University of Tübingen*)
+- All individuals serving on the awards committee and their associated institutions.
 
-A submission with at least one ineligible submitter may still win an award, but the prize money will then be awarded to the top-ranked submission that is eligible for prize money.
+A submission with at least one participating ineligible entity may still win an award, but the prize money will then be given to the top-ranked submission that does not contain ineligible entities.
 
 Additionally, we require members of the spirit jury to abstain from being involved in a review if:

From 1bb385439cc424430b82b29ce91552ddb47df Mon Sep 17 00:00:00 2001
From: Frank
Date: Wed, 18 Oct 2023 13:19:43 +0200
Subject: [PATCH 22/52] rephrase "register submission" to "intent to submit"

---
 SUBMISSION_PROCESS_RULES.md | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/SUBMISSION_PROCESS_RULES.md b/SUBMISSION_PROCESS_RULES.md
index a07f664b1..51aeff043 100644
--- a/SUBMISSION_PROCESS_RULES.md
+++ b/SUBMISSION_PROCESS_RULES.md
@@ -38,7 +38,7 @@
 ### Dates
 
 - **Publication of the call for submission: 17. October 2023 (08:00 AM UTC)**
-- Registration deadline for submissions: 15. December 2023 (08:00 AM UTC)
+- Registration deadline to express non-binding intent to submit: 15. December 2023 (08:00 AM UTC)
 - Version freeze for the benchmark codebase: 17. January 2024 (08:00 AM UTC)
 - **Submission deadline: 15. February 2024 (08:00 AM UTC)**
 - Sampling the held-out workloads and hyperparameters: 16. February 2024 (08:00 AM UTC)
@@ -66,19 +66,18 @@ For a guide on the technical steps and details on how to write a submission, ple
 
 In the following, we describe the logistical steps required to submit a training algorithm to the AlgoPerf: Training Algorithms Benchmark.
 
-### Register a submission
+### Register an intent to submit
 
 All submitters need to register an intent to submit before the submission registration deadline. This registration is mandatory, i.e. required for all submissions, but not binding, i.e. you don't have to submit a registered submission. This registration is necessary, to estimate the number of submissions and provide support for potential submitters.
 
-To register a submission, please fill out this [online form](https://forms.gle/iY1bUhwSjj1JZ4fa9) with the following information
+To register an intent to submit, please fill out this [online form](https://forms.gle/iY1bUhwSjj1JZ4fa9) with the following information:
 
 - Name of the submission (e.g. name of the algorithm, or any other arbitrary identifier).
 - Ruleset under which the submission will be scored.
-- Name of all submitters associated with this submission.
-- Email of all submitters associated with this submission.
-- Affiliations of all submitters associated with this submission.
+- Name, email, and affiliations of all submitters associated with this submission.
+- Interest in compute support.
 
-In return, the submission will be issued a unique **submission ID** that will be used throughout the submission process.
+The submission will be issued a unique **submission ID** that will be used throughout the submission process.
### How to submit From 515dc092a803505448ddc1ed0b692df2ad854daf Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 24 Oct 2023 00:01:28 +0000 Subject: [PATCH 23/52] add loss metric to min_eval_metrics registry --- scoring/scoring.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scoring/scoring.py b/scoring/scoring.py index 12aae1357..3fd2a5f83 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -47,6 +47,7 @@ 'ctc_loss', 'wer', 'l1_loss', + 'loss', ] MAX_EVAL_METRICS = ['average_precision', 'ssim', 'accuracy', 'bleu_score'] From 1b8c1dc7d529d596a938ee1829394a9d11840f25 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 24 Oct 2023 00:12:10 +0000 Subject: [PATCH 24/52] debugging --- scoring/scoring.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scoring/scoring.py b/scoring/scoring.py index 3fd2a5f83..959965d8d 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -47,7 +47,6 @@ 'ctc_loss', 'wer', 'l1_loss', - 'loss', ] MAX_EVAL_METRICS = ['average_precision', 'ssim', 'accuracy', 'bleu_score'] @@ -129,7 +128,7 @@ def get_index_that_reaches_target(workload_df, op = operator.le if is_minimized else operator.ge validation_target_reached = validation_series.apply( lambda x: op(x, validation_target)) - + print(validation_target_reached) target_reached = pd.Series(validation_target_reached[0]) # Remove trials that never reach the target target_reached = target_reached[target_reached.apply(np.any)] From 4441f321a3b9c89991d289d701e827a602cec3f9 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 24 Oct 2023 00:15:17 +0000 Subject: [PATCH 25/52] add loss to scoring registry --- scoring/scoring.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scoring/scoring.py b/scoring/scoring.py index 959965d8d..2594efef6 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -47,6 +47,7 @@ 'ctc_loss', 'wer', 'l1_loss', + 'loss', ] MAX_EVAL_METRICS = ['average_precision', 'ssim', 'accuracy', 'bleu_score'] From 5fd528a1aae1208147b1323e682efc4c383124f9 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 24 Oct 2023 00:16:40 +0000 Subject: [PATCH 26/52] fix index --- scoring/scoring.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/scoring.py b/scoring/scoring.py index 2594efef6..7d48a42c4 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -130,7 +130,7 @@ def get_index_that_reaches_target(workload_df, validation_target_reached = validation_series.apply( lambda x: op(x, validation_target)) print(validation_target_reached) - target_reached = pd.Series(validation_target_reached[0]) + target_reached = pd.Series(validation_target_reached) # Remove trials that never reach the target target_reached = target_reached[target_reached.apply(np.any)] From 0146ecb89a63ed981c4d3b4b5fe6847bd642f587 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 24 Oct 2023 00:18:55 +0000 Subject: [PATCH 27/52] add map to max eval metrics --- scoring/scoring.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/scoring.py b/scoring/scoring.py index 7d48a42c4..c3fa79c52 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -50,7 +50,7 @@ 'loss', ] -MAX_EVAL_METRICS = ['average_precision', 'ssim', 'accuracy', 'bleu_score'] +MAX_EVAL_METRICS = ['mean_average_precision', 'ssim', 'accuracy', 'bleu_score'] def generate_eval_cols(metrics): From a415e57d944858b321617e7efb04fb4f22c5360e Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 24 Oct 2023 00:20:55 +0000 Subject: [PATCH 28/52] add blue to max eval 
metrics --- scoring/scoring.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/scoring.py b/scoring/scoring.py index c3fa79c52..6a9974433 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -50,7 +50,7 @@ 'loss', ] -MAX_EVAL_METRICS = ['mean_average_precision', 'ssim', 'accuracy', 'bleu_score'] +MAX_EVAL_METRICS = ['mean_average_precision', 'ssim', 'accuracy', 'bleu'] def generate_eval_cols(metrics): From 0d836cfba9fc3ed6b8f9fc3894248920e27dd77f Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 24 Oct 2023 00:21:47 +0000 Subject: [PATCH 29/52] remove print statement --- scoring/scoring.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scoring/scoring.py b/scoring/scoring.py index 6a9974433..7e52bd08c 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -129,7 +129,6 @@ def get_index_that_reaches_target(workload_df, op = operator.le if is_minimized else operator.ge validation_target_reached = validation_series.apply( lambda x: op(x, validation_target)) - print(validation_target_reached) target_reached = pd.Series(validation_target_reached) # Remove trials that never reach the target target_reached = target_reached[target_reached.apply(np.any)] From 8a4f8fbb18ddfee676713cb6abfebfe5f32333e9 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 24 Oct 2023 00:40:58 +0000 Subject: [PATCH 30/52] debugging --- scoring/scoring.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scoring/scoring.py b/scoring/scoring.py index 7e52bd08c..3c18cbcb3 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -355,6 +355,7 @@ def plot_performance_profiles(perf_df, Returns: None. If a valid save_dir is provided, save both the plot and perf_df. """ + print(perf_df) fig = perf_df.T.plot(figsize=figsize) df_col_display = f'log10({df_col})' if scale == 'log' else df_col fig.set_xlabel( From d70f4321885ad42a5c05606d55fa6547a2c0db56 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 24 Oct 2023 00:45:01 +0000 Subject: [PATCH 31/52] df --- scoring/scoring.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scoring/scoring.py b/scoring/scoring.py index 3c18cbcb3..9b62f985a 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -34,6 +34,7 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +from tabulate import tabulate import algorithmic_efficiency.workloads.workloads as workloads_registry @@ -355,7 +356,7 @@ def plot_performance_profiles(perf_df, Returns: None. If a valid save_dir is provided, save both the plot and perf_df. 
""" - print(perf_df) + print(tabulate(pef_df, headers='keys', tablefmt='psql')) fig = perf_df.T.plot(figsize=figsize) df_col_display = f'log10({df_col})' if scale == 'log' else df_col fig.set_xlabel( From 03ad1df9bf0a80b396c9ff1002dd82ee201da6ea Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 24 Oct 2023 00:47:24 +0000 Subject: [PATCH 32/52] fix --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index a7ce5ebb2..4c2d9e6d3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,6 +37,7 @@ install_requires = absl-py==1.4.0 numpy>=1.23 pandas>=2.0.1 + tabulate==0.9.0 tensorflow==2.12.0 tensorflow-datasets==4.9.2 tensorflow-probability==0.20.0 From 8a937ee02819f6a455f5aa6f31fbafbd84779934 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 24 Oct 2023 00:48:34 +0000 Subject: [PATCH 33/52] fix --- scoring/scoring.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/scoring.py b/scoring/scoring.py index 9b62f985a..b51be9bf5 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -356,7 +356,7 @@ def plot_performance_profiles(perf_df, Returns: None. If a valid save_dir is provided, save both the plot and perf_df. """ - print(tabulate(pef_df, headers='keys', tablefmt='psql')) + print(tabulate(perf_df, headers='keys', tablefmt='psql')) fig = perf_df.T.plot(figsize=figsize) df_col_display = f'log10({df_col})' if scale == 'log' else df_col fig.set_xlabel( From 87e0762b1d5ee824da1e3be8c6631479e9ce1e68 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 24 Oct 2023 01:00:26 +0000 Subject: [PATCH 34/52] verbosity --- scoring/score_submission.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/score_submission.py b/scoring/score_submission.py index 42a605dac..c25e6f960 100644 --- a/scoring/score_submission.py +++ b/scoring/score_submission.py @@ -32,7 +32,7 @@ def main(_): reference_submission_tag=None, num_points=100, scale='linear', - verbosity=0) + verbosity=1) if not os.path.exists(FLAGS.output_dir): os.mkdir(FLAGS.output_dir) scoring.plot_performance_profiles( From 67232324d964e5f418dfc7fc5171d10a1993d860 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Tue, 24 Oct 2023 01:14:38 +0000 Subject: [PATCH 35/52] fix --- scoring/score_submission.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/score_submission.py b/scoring/score_submission.py index c25e6f960..4523966eb 100644 --- a/scoring/score_submission.py +++ b/scoring/score_submission.py @@ -32,7 +32,7 @@ def main(_): reference_submission_tag=None, num_points=100, scale='linear', - verbosity=1) + ) if not os.path.exists(FLAGS.output_dir): os.mkdir(FLAGS.output_dir) scoring.plot_performance_profiles( From e9d3c7d7ff0d78bdd942db6fcaf75bf5c116eb22 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 25 Oct 2023 20:35:46 +0000 Subject: [PATCH 36/52] debugging print statements --- scoring/score_submission.py | 1 + scoring/scoring.py | 23 +++++++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/scoring/score_submission.py b/scoring/score_submission.py index 4523966eb..b51886cf5 100644 --- a/scoring/score_submission.py +++ b/scoring/score_submission.py @@ -21,6 +21,7 @@ def main(_): df = scoring_utils.get_experiment_df(FLAGS.experiment_path) + print(df) results = { FLAGS.submission_tag: df, } diff --git a/scoring/scoring.py b/scoring/scoring.py index b51be9bf5..fcf813b8b 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -140,7 +140,10 @@ def get_index_that_reaches_target(workload_df, 
return -1, -1 else: index_reached = target_reached.apply(np.argmax) + print(index_reached) trial = index_reached.idxmin() + print(trial) + print(index_reached) return trial, index_reached[trial] @@ -165,6 +168,7 @@ def get_times_for_submission(submission, for workload, group in submission.groupby('workload'): workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1) + print(workload_name) framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2) workload_metadata = WORKLOADS[workload_name] @@ -268,18 +272,19 @@ def compute_performance_profiles(results, if verbosity > 0: print(f'\n`{time_col}` to reach target normalized to best:') - with pd.option_context('display.max_rows', - None, - 'display.max_columns', - None, - 'display.width', - 1000): - print(df) + # with pd.option_context('display.max_rows', + # None, + # 'display.max_columns', + # None, + # 'display.width', + # 1000): + # print(df) # If no max_tau is supplied, choose the value of tau that would plot all non # inf or nan data. if max_tau is None: max_tau = df.replace(float('inf'), -1).replace(np.nan, -1).values.max() + print(f"MAX TAU: {max_tau}") if scale == 'linear': points = np.linspace(min_tau, max_tau, num=num_points) @@ -356,7 +361,9 @@ def plot_performance_profiles(perf_df, Returns: None. If a valid save_dir is provided, save both the plot and perf_df. """ - print(tabulate(perf_df, headers='keys', tablefmt='psql')) + print("PERF DF") + print(perf_df.columns) + print(perf_df.T) fig = perf_df.T.plot(figsize=figsize) df_col_display = f'log10({df_col})' if scale == 'log' else df_col fig.set_xlabel( From 4df526b7b293719855be42bddd98db6be20960a0 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Mon, 30 Oct 2023 20:17:33 +0000 Subject: [PATCH 37/52] remove debugging print statements --- scoring/score_submission.py | 3 +-- scoring/scoring.py | 22 +++++++--------------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/scoring/score_submission.py b/scoring/score_submission.py index b51886cf5..42a605dac 100644 --- a/scoring/score_submission.py +++ b/scoring/score_submission.py @@ -21,7 +21,6 @@ def main(_): df = scoring_utils.get_experiment_df(FLAGS.experiment_path) - print(df) results = { FLAGS.submission_tag: df, } @@ -33,7 +32,7 @@ def main(_): reference_submission_tag=None, num_points=100, scale='linear', - ) + verbosity=0) if not os.path.exists(FLAGS.output_dir): os.mkdir(FLAGS.output_dir) scoring.plot_performance_profiles( diff --git a/scoring/scoring.py b/scoring/scoring.py index fcf813b8b..0c076bea4 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -140,10 +140,7 @@ def get_index_that_reaches_target(workload_df, return -1, -1 else: index_reached = target_reached.apply(np.argmax) - print(index_reached) trial = index_reached.idxmin() - print(trial) - print(index_reached) return trial, index_reached[trial] @@ -168,7 +165,6 @@ def get_times_for_submission(submission, for workload, group in submission.groupby('workload'): workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1) - print(workload_name) framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2) workload_metadata = WORKLOADS[workload_name] @@ -272,19 +268,18 @@ def compute_performance_profiles(results, if verbosity > 0: print(f'\n`{time_col}` to reach target normalized to best:') - # with pd.option_context('display.max_rows', - # None, - # 'display.max_columns', - # None, - # 'display.width', - # 1000): - # print(df) + with pd.option_context('display.max_rows', + None, + 'display.max_columns', + None, + 
'display.width', + 1000): + print(df) # If no max_tau is supplied, choose the value of tau that would plot all non # inf or nan data. if max_tau is None: max_tau = df.replace(float('inf'), -1).replace(np.nan, -1).values.max() - print(f"MAX TAU: {max_tau}") if scale == 'linear': points = np.linspace(min_tau, max_tau, num=num_points) @@ -361,9 +356,6 @@ def plot_performance_profiles(perf_df, Returns: None. If a valid save_dir is provided, save both the plot and perf_df. """ - print("PERF DF") - print(perf_df.columns) - print(perf_df.T) fig = perf_df.T.plot(figsize=figsize) df_col_display = f'log10({df_col})' if scale == 'log' else df_col fig.set_xlabel( From 152cf64a47f7fb60819f1c97cda7de78fa52f35e Mon Sep 17 00:00:00 2001 From: priyakasimbeg Date: Mon, 30 Oct 2023 14:18:48 -0700 Subject: [PATCH 38/52] update fastmri targets (#548) * update fastmri targets * update targets * update targets --- algorithmic_efficiency/workloads/fastmri/workload.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/algorithmic_efficiency/workloads/fastmri/workload.py b/algorithmic_efficiency/workloads/fastmri/workload.py index 4677dc2bb..d1d07e70e 100644 --- a/algorithmic_efficiency/workloads/fastmri/workload.py +++ b/algorithmic_efficiency/workloads/fastmri/workload.py @@ -19,14 +19,14 @@ def has_reached_validation_target(self, eval_result: float) -> bool: @property def validation_target_value(self) -> float: - return 0.7344 + return 0.726999 def has_reached_test_target(self, eval_result: float) -> bool: return eval_result['test/ssim'] > self.test_target_value @property def test_target_value(self) -> float: - return 0.741652 + return 0.744254 @property def loss_type(self) -> spec.LossType: From 119f8d7b784690f1d14425f84fac8ce92abc14b0 Mon Sep 17 00:00:00 2001 From: priyakasimbeg Date: Tue, 31 Oct 2023 23:39:54 +0000 Subject: [PATCH 39/52] add flag for setting max split size --- README.md | 5 +++++ submission_runner.py | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/README.md b/README.md index 6ffbab6f7..de8ea060d 100644 --- a/README.md +++ b/README.md @@ -126,8 +126,13 @@ To use the Docker container as an interactive virtual environment, you can run a -v $HOME/algorithmic-efficiency:/algorithmic-efficiency \ --gpus all \ --ipc=host \ +<<<<<<< HEAD + \ + -keep_container_alive true +======= \ --keep_container_alive true +>>>>>>> ba5c6f6175a0ce12f23a7f035613d9d1edc0b74a ``` Note: You may have to use double quotes around `algorithmic-efficiency` [path] in the mounting `-v` flag. If the above command fails try replacing the following line: ```bash diff --git a/submission_runner.py b/submission_runner.py index 656599a42..6d4cc98e2 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -149,6 +149,11 @@ None, 'Value of rng seed. If None, a random seed will' 'be generated from hardware.') +flags.DEFINE_boolean( + 'set_pytorch_max_split_size', + None, + 'If true, set pytorch max_split_size_mb to 256' +) FLAGS = flags.FLAGS USE_PYTORCH_DDP, RANK, DEVICE, N_GPUS = pytorch_setup() @@ -601,6 +606,9 @@ def main(_): # Prevent OOM on librispeech conformer. if FLAGS.workload == 'librispeech_conformer': os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '0.85' + + if FLAGS.set_pytorch_max_split_size is True: + os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:256' # Extend path according to framework. 
workload_metadata['workload_path'] = os.path.join(

From de45bf7fe4d90af16abc58bb685103722fbec44d Mon Sep 17 00:00:00 2001
From: priyakasimbeg
Date: Tue, 31 Oct 2023 23:46:49 +0000
Subject: [PATCH 40/52] add documentation

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index de8ea060d..289a93dec 100644
--- a/README.md
+++ b/README.md
@@ -246,6 +246,11 @@ The JAX and PyTorch versions of the Criteo, FastMRI, Librispeech, OGBG, and WMT
 
 Since we use PyTorch's [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel) implementation, there is one Python process for each device. Depending on the hardware and the settings of the cluster, running a TensorFlow input pipeline in each Python process can lead to errors, since too many threads are created in each process. See [this PR thread](https://github.com/mlcommons/algorithmic-efficiency/pull/85) for more details. While this issue might not affect all setups, we currently implement a different strategy: we only run the TensorFlow input pipeline in one Python process (with `rank == 0`), and [broadcast](https://pytorch.org/docs/stable/distributed.html#torch.distributed.broadcast) the batches to all other devices. This introduces an additional communication overhead for each batch. See the [implementation for the WMT workload](https://github.com/mlcommons/algorithmic-efficiency/blob/main/algorithmic_efficiency/workloads/wmt/wmt_pytorch/workload.py#L215-L288) as an example.
 
+## PyTorch Conformer CUDA OOM
+
+The PyTorch Conformer workload may run out of memory in its current state. Please set the `submission_runner.py` flag `set_pytorch_max_split_size` to `True` as a temporary workaround if you encounter this issue. This will set `PYTORCH_CUDA_ALLOC_CONF` to `max_split_size_mb:256`. Note that this will adversely impact the performance of the submission on this workload. See [tracking issue](https://github.com/mlcommons/algorithmic-efficiency/issues/497).
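As a rough, hypothetical illustration of why this workaround has to be applied so early (the snippet below is not part of the repository, and the tensor allocation and printout are only for demonstration): `PYTORCH_CUDA_ALLOC_CONF` is typically only picked up when PyTorch initializes its CUDA caching allocator, i.e. around the first CUDA allocation, which is why the `submission_runner.py` change earlier in this series assigns it in `main()` before any workload is constructed.

```python
# Minimal sketch, not part of the AlgoPerf codebase: the allocator setting has
# to be in the environment before the first CUDA allocation happens.
import os

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:256'

import torch  # Imported after setting the env var on purpose.

if torch.cuda.is_available():
  # The first CUDA allocation initializes the caching allocator, which reads
  # PYTORCH_CUDA_ALLOC_CONF at this point.
  _ = torch.zeros(1, device='cuda')
  print(torch.cuda.memory_summary(abbreviated=True))
```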
+ + # FAQS ## Setup and Platform From fa23fe840364ba54f081c4eccd3b52c1752e1744 Mon Sep 17 00:00:00 2001 From: priyakasimbeg Date: Tue, 31 Oct 2023 23:50:46 +0000 Subject: [PATCH 41/52] formatting --- .../workloads/fastmri/fastmri_pytorch/workload.py | 4 +--- .../imagenet_resnet/imagenet_pytorch/workload.py | 4 +--- .../librispeech_jax/spectrum_augmenter.py | 4 ++-- .../librispeech_pytorch/workload.py | 9 ++++----- algorithmic_efficiency/workloads/mnist/workload.py | 4 +--- .../workloads/wmt/wmt_pytorch/models.py | 4 ++-- baselines/shampoo/jax/distributed_shampoo.py | 12 ++++++------ submission_runner.py | 14 ++++++-------- 8 files changed, 23 insertions(+), 32 deletions(-) diff --git a/algorithmic_efficiency/workloads/fastmri/fastmri_pytorch/workload.py b/algorithmic_efficiency/workloads/fastmri/fastmri_pytorch/workload.py index daaea9e10..c3252feb8 100644 --- a/algorithmic_efficiency/workloads/fastmri/fastmri_pytorch/workload.py +++ b/algorithmic_efficiency/workloads/fastmri/fastmri_pytorch/workload.py @@ -247,9 +247,7 @@ def _eval_model_on_split(self, for _ in range(num_batches): batch = next(self._eval_iters[split]) batch_metrics = self._eval_model(params, batch, model_rng) - total_metrics = { - k: v + batch_metrics[k] for k, v in total_metrics.items() - } + total_metrics = {k: v + batch_metrics[k] for k, v in total_metrics.items()} if USE_PYTORCH_DDP: for metric in total_metrics.values(): dist.all_reduce(metric) diff --git a/algorithmic_efficiency/workloads/imagenet_resnet/imagenet_pytorch/workload.py b/algorithmic_efficiency/workloads/imagenet_resnet/imagenet_pytorch/workload.py index c0fcaaef3..cc9d2febc 100644 --- a/algorithmic_efficiency/workloads/imagenet_resnet/imagenet_pytorch/workload.py +++ b/algorithmic_efficiency/workloads/imagenet_resnet/imagenet_pytorch/workload.py @@ -282,9 +282,7 @@ def _eval_model_on_split(self, update_batch_norm=False) weights = batch.get('weights') batch_metrics = self._compute_metrics(logits, batch['targets'], weights) - total_metrics = { - k: v + batch_metrics[k] for k, v in total_metrics.items() - } + total_metrics = {k: v + batch_metrics[k] for k, v in total_metrics.items()} if USE_PYTORCH_DDP: for metric in total_metrics.values(): dist.all_reduce(metric) diff --git a/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_jax/spectrum_augmenter.py b/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_jax/spectrum_augmenter.py index 2a6f73d4d..c16740629 100644 --- a/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_jax/spectrum_augmenter.py +++ b/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_jax/spectrum_augmenter.py @@ -81,8 +81,8 @@ def _get_mask(self, jnp.expand_dims(jnp.arange(multiplicity, dtype=jnp.int32), 0), [batch_size, 1]) multiplicity_tensor = masks_per_frame * choose_range - multiplicity_weights = (multiplicity_weights < - multiplicity_tensor).astype(jnp.int32) + multiplicity_weights = (multiplicity_weights + < multiplicity_tensor).astype(jnp.int32) pre_mask = jnp.einsum('bmt,bm->bt', pre_mask, multiplicity_weights) else: pre_mask = jnp.einsum('bmt->bt', pre_mask) diff --git a/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_pytorch/workload.py b/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_pytorch/workload.py index c4f4a1247..d2774d3b9 100644 --- a/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_pytorch/workload.py +++ b/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_pytorch/workload.py @@ 
-227,8 +227,9 @@ def greedy_decode( idxs = torch.arange( fin_result.numel(), device=result.device).view(*fin_result.shape) mask = torch.arange( - fin_result.shape[1], device=result.device).view( - 1, -1) < result.count_nonzero(dim=1).view(-1, 1) + fin_result.shape[1], + device=result.device).view(1, -1) < result.count_nonzero(dim=1).view( + -1, 1) fin_result.view(-1)[idxs[mask != 0]] = result[result != blank_id] padding = fin_result == 0 return fin_result, padding @@ -296,9 +297,7 @@ def _eval_model_on_split(self, 'word_errors': word_errors, 'num_words': num_words, } - total_metrics = { - k: v + batch_metrics[k] for k, v in total_metrics.items() - } + total_metrics = {k: v + batch_metrics[k] for k, v in total_metrics.items()} if USE_PYTORCH_DDP: for metric in total_metrics.values(): dist.all_reduce(metric) diff --git a/algorithmic_efficiency/workloads/mnist/workload.py b/algorithmic_efficiency/workloads/mnist/workload.py index dcc195170..959228755 100644 --- a/algorithmic_efficiency/workloads/mnist/workload.py +++ b/algorithmic_efficiency/workloads/mnist/workload.py @@ -214,8 +214,6 @@ def _eval_model_on_split(self, batch, model_state, per_device_model_rngs) - total_metrics = { - k: v + batch_metrics[k] for k, v in total_metrics.items() - } + total_metrics = {k: v + batch_metrics[k] for k, v in total_metrics.items()} return self._normalize_eval_metrics(num_examples, total_metrics) diff --git a/algorithmic_efficiency/workloads/wmt/wmt_pytorch/models.py b/algorithmic_efficiency/workloads/wmt/wmt_pytorch/models.py index b787785a1..dc8ebea90 100644 --- a/algorithmic_efficiency/workloads/wmt/wmt_pytorch/models.py +++ b/algorithmic_efficiency/workloads/wmt/wmt_pytorch/models.py @@ -912,8 +912,8 @@ def forward(self, # not the remaining zero elements. if attn_mask is not None: raise ValueError('Attention mask has to be None for decode == True.') - attn_mask = (torch.arange(max_len, device=k.device) >= - cache_index).reshape(1, max_len) + attn_mask = (torch.arange(max_len, device=k.device) + >= cache_index).reshape(1, max_len) # Update sequence length to account for complete sequence. seq_len = k.size(1) diff --git a/baselines/shampoo/jax/distributed_shampoo.py b/baselines/shampoo/jax/distributed_shampoo.py index 725529cae..21f088c1b 100644 --- a/baselines/shampoo/jax/distributed_shampoo.py +++ b/baselines/shampoo/jax/distributed_shampoo.py @@ -595,8 +595,8 @@ def matrix_inverse_pth_root( if padding_start is not None: # Zero out padding in identity as well for convergence checks. 
- ix = (jnp.arange(matrix_size, dtype=jnp.int32) < padding_start).astype( - matrix.dtype) + ix = (jnp.arange(matrix_size, dtype=jnp.int32) + < padding_start).astype(matrix.dtype) matrix *= ix[jnp.newaxis, :] matrix *= ix[:, jnp.newaxis] identity *= ix @@ -815,8 +815,8 @@ def matrix_inverse_pth_root_eigh( alpha = jnp.asarray(-1.0 / p, _MAT_INV_PTH_ROOT_DTYPE) identity = jnp.eye(matrix_size, dtype=_MAT_INV_PTH_ROOT_DTYPE) if padding_start is not None: - ix = (jnp.arange(matrix_size, dtype=jnp.int32) < padding_start).astype( - matrix.dtype) + ix = (jnp.arange(matrix_size, dtype=jnp.int32) + < padding_start).astype(matrix.dtype) matrix *= ix[jnp.newaxis, :] matrix *= ix[:, jnp.newaxis] identity *= ix @@ -1923,8 +1923,8 @@ def _internal_inverse_pth_root_all(): errors = metrics.inverse_pth_root_errors errors = errors.reshape((-1, 1, 1)) predicate = jnp.logical_or( - jnp.isnan(errors), - errors >= inverse_failure_threshold).astype(new_preconditioners.dtype) + jnp.isnan(errors), errors + >= inverse_failure_threshold).astype(new_preconditioners.dtype) # TODO(rohananil): Check for numerical instabilities. new_conditional_preconditioners = ( predicate * global_stats.preconditioners + diff --git a/submission_runner.py b/submission_runner.py index 6d4cc98e2..fc826b407 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -149,11 +149,9 @@ None, 'Value of rng seed. If None, a random seed will' 'be generated from hardware.') -flags.DEFINE_boolean( - 'set_pytorch_max_split_size', - None, - 'If true, set pytorch max_split_size_mb to 256' -) +flags.DEFINE_boolean('set_pytorch_max_split_size', + None, + 'If true, set pytorch max_split_size_mb to 256') FLAGS = flags.FLAGS USE_PYTORCH_DDP, RANK, DEVICE, N_GPUS = pytorch_setup() @@ -352,8 +350,8 @@ def train_once( train_state['is_time_remaining'] = ( train_state['accumulated_submission_time'] < max_allowed_runtime_sec) # Check if submission is eligible for an untimed eval. - if ((train_step_end_time - train_state['last_eval_time']) >= - workload.eval_period_time_sec or train_state['training_complete']): + if ((train_step_end_time - train_state['last_eval_time']) + >= workload.eval_period_time_sec or train_state['training_complete']): with profiler.profile('Evaluation'): del batch _reset_cuda_mem() @@ -606,7 +604,7 @@ def main(_): # Prevent OOM on librispeech conformer. 
if FLAGS.workload == 'librispeech_conformer': os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '0.85' - + if FLAGS.set_pytorch_max_split_size is True: os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:256' From 9b958c36f8630042ca198ae3451239b5832dd90a Mon Sep 17 00:00:00 2001 From: priyakasimbeg Date: Tue, 31 Oct 2023 23:54:37 +0000 Subject: [PATCH 42/52] revert formatting --- README.md | 5 ----- .../workloads/fastmri/fastmri_pytorch/workload.py | 4 +++- .../workloads/imagenet_resnet/imagenet_pytorch/workload.py | 4 +++- .../librispeech_jax/spectrum_augmenter.py | 4 ++-- .../librispeech_conformer/librispeech_pytorch/workload.py | 5 ++--- algorithmic_efficiency/workloads/mnist/workload.py | 4 +++- algorithmic_efficiency/workloads/wmt/wmt_pytorch/models.py | 4 ++-- baselines/shampoo/jax/distributed_shampoo.py | 4 ++-- submission_runner.py | 4 ++-- 9 files changed, 19 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 289a93dec..197ba8a61 100644 --- a/README.md +++ b/README.md @@ -126,13 +126,8 @@ To use the Docker container as an interactive virtual environment, you can run a -v $HOME/algorithmic-efficiency:/algorithmic-efficiency \ --gpus all \ --ipc=host \ -<<<<<<< HEAD - \ - -keep_container_alive true -======= \ --keep_container_alive true ->>>>>>> ba5c6f6175a0ce12f23a7f035613d9d1edc0b74a ``` Note: You may have to use double quotes around `algorithmic-efficiency` [path] in the mounting `-v` flag. If the above command fails try replacing the following line: ```bash diff --git a/algorithmic_efficiency/workloads/fastmri/fastmri_pytorch/workload.py b/algorithmic_efficiency/workloads/fastmri/fastmri_pytorch/workload.py index c3252feb8..daaea9e10 100644 --- a/algorithmic_efficiency/workloads/fastmri/fastmri_pytorch/workload.py +++ b/algorithmic_efficiency/workloads/fastmri/fastmri_pytorch/workload.py @@ -247,7 +247,9 @@ def _eval_model_on_split(self, for _ in range(num_batches): batch = next(self._eval_iters[split]) batch_metrics = self._eval_model(params, batch, model_rng) - total_metrics = {k: v + batch_metrics[k] for k, v in total_metrics.items()} + total_metrics = { + k: v + batch_metrics[k] for k, v in total_metrics.items() + } if USE_PYTORCH_DDP: for metric in total_metrics.values(): dist.all_reduce(metric) diff --git a/algorithmic_efficiency/workloads/imagenet_resnet/imagenet_pytorch/workload.py b/algorithmic_efficiency/workloads/imagenet_resnet/imagenet_pytorch/workload.py index cc9d2febc..c0fcaaef3 100644 --- a/algorithmic_efficiency/workloads/imagenet_resnet/imagenet_pytorch/workload.py +++ b/algorithmic_efficiency/workloads/imagenet_resnet/imagenet_pytorch/workload.py @@ -282,7 +282,9 @@ def _eval_model_on_split(self, update_batch_norm=False) weights = batch.get('weights') batch_metrics = self._compute_metrics(logits, batch['targets'], weights) - total_metrics = {k: v + batch_metrics[k] for k, v in total_metrics.items()} + total_metrics = { + k: v + batch_metrics[k] for k, v in total_metrics.items() + } if USE_PYTORCH_DDP: for metric in total_metrics.values(): dist.all_reduce(metric) diff --git a/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_jax/spectrum_augmenter.py b/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_jax/spectrum_augmenter.py index c16740629..2a6f73d4d 100644 --- a/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_jax/spectrum_augmenter.py +++ b/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_jax/spectrum_augmenter.py @@ -81,8 +81,8 @@ def _get_mask(self, 
jnp.expand_dims(jnp.arange(multiplicity, dtype=jnp.int32), 0), [batch_size, 1]) multiplicity_tensor = masks_per_frame * choose_range - multiplicity_weights = (multiplicity_weights - < multiplicity_tensor).astype(jnp.int32) + multiplicity_weights = (multiplicity_weights < + multiplicity_tensor).astype(jnp.int32) pre_mask = jnp.einsum('bmt,bm->bt', pre_mask, multiplicity_weights) else: pre_mask = jnp.einsum('bmt->bt', pre_mask) diff --git a/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_pytorch/workload.py b/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_pytorch/workload.py index d2774d3b9..167332ed0 100644 --- a/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_pytorch/workload.py +++ b/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_pytorch/workload.py @@ -227,9 +227,8 @@ def greedy_decode( idxs = torch.arange( fin_result.numel(), device=result.device).view(*fin_result.shape) mask = torch.arange( - fin_result.shape[1], - device=result.device).view(1, -1) < result.count_nonzero(dim=1).view( - -1, 1) + fin_result.shape[1], device=result.device).view( + 1, -1) < result.count_nonzero(dim=1).view(-1, 1) fin_result.view(-1)[idxs[mask != 0]] = result[result != blank_id] padding = fin_result == 0 return fin_result, padding diff --git a/algorithmic_efficiency/workloads/mnist/workload.py b/algorithmic_efficiency/workloads/mnist/workload.py index 959228755..dcc195170 100644 --- a/algorithmic_efficiency/workloads/mnist/workload.py +++ b/algorithmic_efficiency/workloads/mnist/workload.py @@ -214,6 +214,8 @@ def _eval_model_on_split(self, batch, model_state, per_device_model_rngs) - total_metrics = {k: v + batch_metrics[k] for k, v in total_metrics.items()} + total_metrics = { + k: v + batch_metrics[k] for k, v in total_metrics.items() + } return self._normalize_eval_metrics(num_examples, total_metrics) diff --git a/algorithmic_efficiency/workloads/wmt/wmt_pytorch/models.py b/algorithmic_efficiency/workloads/wmt/wmt_pytorch/models.py index dc8ebea90..b787785a1 100644 --- a/algorithmic_efficiency/workloads/wmt/wmt_pytorch/models.py +++ b/algorithmic_efficiency/workloads/wmt/wmt_pytorch/models.py @@ -912,8 +912,8 @@ def forward(self, # not the remaining zero elements. if attn_mask is not None: raise ValueError('Attention mask has to be None for decode == True.') - attn_mask = (torch.arange(max_len, device=k.device) - >= cache_index).reshape(1, max_len) + attn_mask = (torch.arange(max_len, device=k.device) >= + cache_index).reshape(1, max_len) # Update sequence length to account for complete sequence. seq_len = k.size(1) diff --git a/baselines/shampoo/jax/distributed_shampoo.py b/baselines/shampoo/jax/distributed_shampoo.py index 21f088c1b..225454b2c 100644 --- a/baselines/shampoo/jax/distributed_shampoo.py +++ b/baselines/shampoo/jax/distributed_shampoo.py @@ -595,8 +595,8 @@ def matrix_inverse_pth_root( if padding_start is not None: # Zero out padding in identity as well for convergence checks. 
- ix = (jnp.arange(matrix_size, dtype=jnp.int32) - < padding_start).astype(matrix.dtype) + ix = (jnp.arange(matrix_size, dtype=jnp.int32) < padding_start).astype( + matrix.dtype) matrix *= ix[jnp.newaxis, :] matrix *= ix[:, jnp.newaxis] identity *= ix diff --git a/submission_runner.py b/submission_runner.py index fc826b407..a40e2090b 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -350,8 +350,8 @@ def train_once( train_state['is_time_remaining'] = ( train_state['accumulated_submission_time'] < max_allowed_runtime_sec) # Check if submission is eligible for an untimed eval. - if ((train_step_end_time - train_state['last_eval_time']) - >= workload.eval_period_time_sec or train_state['training_complete']): + if ((train_step_end_time - train_state['last_eval_time']) >= + workload.eval_period_time_sec or train_state['training_complete']): with profiler.profile('Evaluation'): del batch _reset_cuda_mem() From ec876fa045079fbbfad924acf528b3eead257248 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 2 Nov 2023 17:41:35 +0000 Subject: [PATCH 43/52] formatting --- .../librispeech_conformer/librispeech_pytorch/workload.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_pytorch/workload.py b/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_pytorch/workload.py index 167332ed0..c4f4a1247 100644 --- a/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_pytorch/workload.py +++ b/algorithmic_efficiency/workloads/librispeech_conformer/librispeech_pytorch/workload.py @@ -296,7 +296,9 @@ def _eval_model_on_split(self, 'word_errors': word_errors, 'num_words': num_words, } - total_metrics = {k: v + batch_metrics[k] for k, v in total_metrics.items()} + total_metrics = { + k: v + batch_metrics[k] for k, v in total_metrics.items() + } if USE_PYTORCH_DDP: for metric in total_metrics.values(): dist.all_reduce(metric) From 691e2c81ab2821531a6a90b89cb88703a363518f Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 2 Nov 2023 18:20:03 +0000 Subject: [PATCH 44/52] nits --- baselines/shampoo/jax/distributed_shampoo.py | 8 ++++---- submission_runner.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/baselines/shampoo/jax/distributed_shampoo.py b/baselines/shampoo/jax/distributed_shampoo.py index 225454b2c..725529cae 100644 --- a/baselines/shampoo/jax/distributed_shampoo.py +++ b/baselines/shampoo/jax/distributed_shampoo.py @@ -815,8 +815,8 @@ def matrix_inverse_pth_root_eigh( alpha = jnp.asarray(-1.0 / p, _MAT_INV_PTH_ROOT_DTYPE) identity = jnp.eye(matrix_size, dtype=_MAT_INV_PTH_ROOT_DTYPE) if padding_start is not None: - ix = (jnp.arange(matrix_size, dtype=jnp.int32) - < padding_start).astype(matrix.dtype) + ix = (jnp.arange(matrix_size, dtype=jnp.int32) < padding_start).astype( + matrix.dtype) matrix *= ix[jnp.newaxis, :] matrix *= ix[:, jnp.newaxis] identity *= ix @@ -1923,8 +1923,8 @@ def _internal_inverse_pth_root_all(): errors = metrics.inverse_pth_root_errors errors = errors.reshape((-1, 1, 1)) predicate = jnp.logical_or( - jnp.isnan(errors), errors - >= inverse_failure_threshold).astype(new_preconditioners.dtype) + jnp.isnan(errors), + errors >= inverse_failure_threshold).astype(new_preconditioners.dtype) # TODO(rohananil): Check for numerical instabilities. 
new_conditional_preconditioners = ( predicate * global_stats.preconditioners + diff --git a/submission_runner.py b/submission_runner.py index a40e2090b..d92732145 100644 --- a/submission_runner.py +++ b/submission_runner.py @@ -150,7 +150,7 @@ 'Value of rng seed. If None, a random seed will' 'be generated from hardware.') flags.DEFINE_boolean('set_pytorch_max_split_size', - None, + False, 'If true, set pytorch max_split_size_mb to 256') FLAGS = flags.FLAGS USE_PYTORCH_DDP, RANK, DEVICE, N_GPUS = pytorch_setup() @@ -605,7 +605,7 @@ def main(_): if FLAGS.workload == 'librispeech_conformer': os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '0.85' - if FLAGS.set_pytorch_max_split_size is True: + if FLAGS.set_pytorch_max_split_size: os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:256' # Extend path according to framework. From 1cb02ca0c2e31165241136b952b4c266f49c039e Mon Sep 17 00:00:00 2001 From: priyakasimbeg Date: Thu, 2 Nov 2023 15:19:55 -0700 Subject: [PATCH 45/52] remove tabulate import --- scoring/scoring.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scoring/scoring.py b/scoring/scoring.py index 0c076bea4..7e52bd08c 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -34,7 +34,6 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from tabulate import tabulate import algorithmic_efficiency.workloads.workloads as workloads_registry From dfb7701d6c92d58f651ca929881b1957b76c9991 Mon Sep 17 00:00:00 2001 From: priyakasimbeg Date: Thu, 2 Nov 2023 16:31:53 -0700 Subject: [PATCH 46/52] Update README.md --- datasets/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/README.md b/datasets/README.md index 5ff0e18a7..586895022 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -28,7 +28,7 @@ make sure the data directory is mounted to a directory on your host with -v flag. If you are following instructions from the README you will have used the `-v $HOME/data:/data` flag in the `docker run` command. This will mount the `$HOME/data` directory to the `/data` directory in the container. -In this case set --data_dir to `\data`. +In this case set --data_dir to `/data`. 
```bash DATA_DIR='/data' ``` From 9bbd933344fb7ea2a5eb54f124268deb67cebaba Mon Sep 17 00:00:00 2001 From: runame Date: Fri, 3 Nov 2023 17:34:44 +0100 Subject: [PATCH 47/52] Remove tabulate requirement --- setup.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 4c2d9e6d3..a7ce5ebb2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,7 +37,6 @@ install_requires = absl-py==1.4.0 numpy>=1.23 pandas>=2.0.1 - tabulate==0.9.0 tensorflow==2.12.0 tensorflow-datasets==4.9.2 tensorflow-probability==0.20.0 From ea2e7fcf4d5555a4bf6eb17cad16ce6cae9ff9d9 Mon Sep 17 00:00:00 2001 From: runame Date: Fri, 3 Nov 2023 17:37:21 +0100 Subject: [PATCH 48/52] Test warnings in get_experiment_df --- scoring/test_scoring_utils.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/scoring/test_scoring_utils.py b/scoring/test_scoring_utils.py index b766a04d7..fbb21958c 100644 --- a/scoring/test_scoring_utils.py +++ b/scoring/test_scoring_utils.py @@ -1,8 +1,11 @@ from absl.testing import absltest -import scoring_utils -TEST_LOGFILE = 'test_data/adamw_fastmri_jax_04-18-2023-13-10-58.log' -TEST_DIR = 'test_data/experiment_dir' +from scoring import scoring_utils +from scoring.scoring import NUM_TRIALS +from scoring.scoring import NUM_WORKLOADS + +TEST_LOGFILE = 'scoring/test_data/adamw_fastmri_jax_04-18-2023-13-10-58.log' +TEST_DIR = 'scoring/test_data/experiment_dir' NUM_EVALS = 18 @@ -14,8 +17,7 @@ def test_get_trials_dict(self): def test_get_trials_df_dict(self): trials_dict = scoring_utils.get_trials_df_dict(TEST_LOGFILE) - for trial in trials_dict: - df = trials_dict[trial] + for df in trials_dict.values(): self.assertEqual(len(df.index), NUM_EVALS) def test_get_trials_df(self): @@ -24,7 +26,18 @@ def test_get_trials_df(self): self.assertEqual(len(df.at['1', column]), NUM_EVALS) def test_get_experiment_df(self): - df = scoring_utils.get_experiment_df(TEST_DIR) + _ = scoring_utils.get_experiment_df(TEST_DIR) + self.assertWarnsRegex( + Warning, + f'There should be {NUM_WORKLOADS} workloads but there are 1.', + scoring_utils.get_experiment_df, + TEST_DIR) + self.assertWarnsRegex( + Warning, + f'There should be {NUM_TRIALS} trials for workload mnist_jax but there ' + 'are only 1.', + scoring_utils.get_experiment_df, + TEST_DIR) if __name__ == '__main__': From 74b961b8a01029c0e7b771b7f6965c528a6b57b2 Mon Sep 17 00:00:00 2001 From: runame Date: Fri, 3 Nov 2023 17:38:42 +0100 Subject: [PATCH 49/52] Add warnings when not all workloads or trials are present --- scoring/scoring_utils.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py index 37db73dd4..1a15db2f5 100644 --- a/scoring/scoring_utils.py +++ b/scoring/scoring_utils.py @@ -1,10 +1,14 @@ import json import os import re +import warnings from absl import logging import pandas as pd +from scoring.scoring import NUM_TRIALS +from scoring.scoring import NUM_WORKLOADS + TRIAL_LINE_REGEX = '(.*) --- Tuning run (\d+)/(\d+) ---' METRICS_LINE_REGEX = '(.*) Metrics: ({.*})' TRIAL_DIR_REGEX = 'trial_(\d+)' @@ -103,8 +107,7 @@ def get_trials_df_dict(logfile): """ trials_dict = get_trials_dict(logfile) trials_df_dict = {} - for trial in trials_dict.keys(): - metrics = trials_dict[trial] + for trial, metrics in trials_dict.items(): trials_df_dict[trial] = pd.DataFrame(metrics) return trials_df_dict @@ -156,6 +159,10 @@ def get_experiment_df(experiment_dir): """ df = pd.DataFrame() workload_dirs = os.listdir(experiment_dir) + 
num_workloads = len(workload_dirs) + if num_workloads != NUM_WORKLOADS: + warnings.warn(f'There should be {NUM_WORKLOADS} workloads but there are ' + f'{num_workloads}.') for workload in workload_dirs: data = { 'workload': workload, @@ -164,6 +171,7 @@ def get_experiment_df(experiment_dir): t for t in os.listdir(os.path.join(experiment_dir, workload)) if re.match(TRIAL_DIR_REGEX, t) ] + workload_df = pd.DataFrame() for trial in trial_dirs: eval_measurements_filepath = os.path.join( experiment_dir, @@ -173,7 +181,7 @@ def get_experiment_df(experiment_dir): ) try: trial_df = pd.read_csv(eval_measurements_filepath) - except FileNotFoundError as e: + except FileNotFoundError: logging.info(f'Could not read {eval_measurements_filepath}') continue data['trial'] = trial @@ -181,5 +189,10 @@ def get_experiment_df(experiment_dir): values = trial_df[column].to_numpy() data[column] = values trial_df = pd.DataFrame([data]) - df = pd.concat([df, trial_df], ignore_index=True) + workload_df = pd.concat([workload_df, trial_df], ignore_index=True) + num_trials = len(workload_df) + if num_trials != NUM_TRIALS: + warnings.warn(f'There should be {NUM_TRIALS} trials for workload ' + f'{workload} but there are only {num_trials}.') + df = pd.concat([df, workload_df], ignore_index=True) return df From c3a6f43428619622238257e3b3b5817086d10a04 Mon Sep 17 00:00:00 2001 From: runame Date: Fri, 3 Nov 2023 17:39:42 +0100 Subject: [PATCH 50/52] Fix bugs in scoring calculation --- scoring/scoring.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/scoring/scoring.py b/scoring/scoring.py index 7e52bd08c..dba254233 100644 --- a/scoring/scoring.py +++ b/scoring/scoring.py @@ -40,6 +40,12 @@ WORKLOADS = workloads_registry.WORKLOADS WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)' BASE_WORKLOADS_DIR = 'algorithmic_efficiency/workloads/' +# These global variables have to be set according to the current set of +# workloads and rules for the scoring to be correct. +# We do not use the workload registry since it contains test and development +# workloads as well. +NUM_WORKLOADS = 8 +NUM_TRIALS = 5 MIN_EVAL_METRICS = [ 'ce_loss', @@ -133,9 +139,10 @@ def get_index_that_reaches_target(workload_df, # Remove trials that never reach the target target_reached = target_reached[target_reached.apply(np.any)] - # If we have no trials that have reached the target, return -1. Else, return - # the eval index of the earliest point the target is reached. - if target_reached.empty: + # If less than 3 trials reach the target, the submission will be scored as + # missing the target on this workload; return -1. Else, return the eval index + # of the earliest point the target is reached. 
+ if len(target_reached) < 3: return -1, -1 else: index_reached = target_reached.apply(np.argmax) @@ -287,7 +294,7 @@ def compute_performance_profiles(results, np.log10(min_tau), np.log10(max_tau), num=num_points, base=10.0) def rho(r, tau): - return (r <= tau).sum(axis=1) / len(r.columns) + return (r <= tau).sum(axis=1) / NUM_WORKLOADS perf_df = pd.concat([rho(df, tau) for tau in points], axis=1) From 4151e09c43d28a7431f2844603161a74a5469e3f Mon Sep 17 00:00:00 2001 From: runame Date: Fri, 3 Nov 2023 17:40:19 +0100 Subject: [PATCH 51/52] Fix imports --- scoring/score_submission.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scoring/score_submission.py b/scoring/score_submission.py index 42a605dac..e8a6ac010 100644 --- a/scoring/score_submission.py +++ b/scoring/score_submission.py @@ -5,8 +5,7 @@ from absl import logging import scoring_utils -from algorithmic_efficiency import workloads -import scoring +from scoring import scoring flags.DEFINE_string( 'experiment_path', From d0551020fc31f3855570e218916f0b97a7a0eb78 Mon Sep 17 00:00:00 2001 From: runame Date: Sat, 4 Nov 2023 17:41:57 +0100 Subject: [PATCH 52/52] Remove unused hparam from ogbg target-setting run config --- .../target_setting_algorithms/ogbg/tuning_search_space.json | 5 ----- 1 file changed, 5 deletions(-) diff --git a/reference_algorithms/target_setting_algorithms/ogbg/tuning_search_space.json b/reference_algorithms/target_setting_algorithms/ogbg/tuning_search_space.json index 0ca3b935d..0f365a183 100644 --- a/reference_algorithms/target_setting_algorithms/ogbg/tuning_search_space.json +++ b/reference_algorithms/target_setting_algorithms/ogbg/tuning_search_space.json @@ -9,11 +9,6 @@ 0.9449369031171744 ] }, - "beta2": { - "feasible_points": [ - 0.9978504782314613 - ] - }, "warmup_steps": { "feasible_points": [ 3000