From c4c72324b307af4becb4f1c5428c8bc3293cda69 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Sat, 4 Nov 2023 01:04:09 +0000 Subject: [PATCH 1/8] rename scoring module to fix import errors --- scoring/{scoring.py => score_profile.py} | 24 ++++++++++----- scoring/score_submission.py | 38 ++++++++++++++---------- scoring/scoring_utils.py | 19 +++++------- setup.cfg | 1 + 4 files changed, 47 insertions(+), 35 deletions(-) rename scoring/{scoring.py => score_profile.py} (94%) diff --git a/scoring/scoring.py b/scoring/score_profile.py similarity index 94% rename from scoring/scoring.py rename to scoring/score_profile.py index dba254233..713ac87d2 100644 --- a/scoring/scoring.py +++ b/scoring/score_profile.py @@ -25,7 +25,7 @@ The keys in this dictionary should match the workload identifiers used in the dictionary of submissions. """ - +from absl import logging import itertools import operator import os @@ -153,7 +153,8 @@ def get_index_that_reaches_target(workload_df, def get_times_for_submission(submission, submission_tag, time_col='global_step', - verbosity=1): + verbosity=1, + self_tuning_ruleset=False): """Get times to target for each workload in a submission. Args: @@ -168,8 +169,15 @@ def get_times_for_submission(submission, """ workloads = [] submission_name = submission_tag.split('.')[1] - + num_workloads = len(submission.groupby('workload')) + if num_workloads != NUM_WORKLOADS: + logging.warning(f'Expecting {NUM_WORKLOADS} workloads ' + f'but found {num_workloads} workloads.') for workload, group in submission.groupby('workload'): + num_trials = len(group) + if num_trials != NUM_TRIALS and not self_tuning_ruleset: + logging.warning(f'Expecting {NUM_TRIALS} trials for workload ' + f'{workload} but found {num_trials} trials.') workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1) framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2) workload_metadata = WORKLOADS[workload_name] @@ -250,21 +258,21 @@ def compute_performance_profiles(results, dfs = [] for submission_tag, result in results.items(): - print(f'\nComputing performance profile with respect to `{time_col}` for ' + logging.info(f'\nComputing performance profile with respect to `{time_col}` for ' f'{submission_tag}') dfs.append( get_times_for_submission(result, submission_tag, time_col, verbosity)) df = pd.concat(dfs) if verbosity > 0: - print(f'\n`{time_col}` to reach target:') + logging.info('\n`{time_col}` to reach target:') with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000): - print(df) + logging.info(df) # Divide by the fastest. if reference_submission_tag is None: @@ -273,14 +281,14 @@ def compute_performance_profiles(results, df.update(df.div(df.loc[reference_submission_tag, :], axis=1)) if verbosity > 0: - print(f'\n`{time_col}` to reach target normalized to best:') + logging.info('\n`{time_col}` to reach target normalized to best:') with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000): - print(df) + logging.info(df) # If no max_tau is supplied, choose the value of tau that would plot all non # inf or nan data. 
diff --git a/scoring/score_submission.py b/scoring/score_submission.py index e8a6ac010..211de3db6 100644 --- a/scoring/score_submission.py +++ b/scoring/score_submission.py @@ -5,7 +5,8 @@ from absl import logging import scoring_utils -from scoring import scoring +from scoring import score_profile +import tabulate flags.DEFINE_string( 'experiment_path', @@ -15,6 +16,9 @@ flags.DEFINE_string('output_dir', 'scoring_results', 'Path to save performance profile table and plot.') +flags.DEFINE_boolean('compute_performance_profiles', + True, + 'Whether or not to compute the performance profiles.') FLAGS = flags.FLAGS @@ -23,21 +27,23 @@ def main(_): results = { FLAGS.submission_tag: df, } - performance_profile_df = scoring.compute_performance_profiles( - results, - time_col='score', - min_tau=1.0, - max_tau=None, - reference_submission_tag=None, - num_points=100, - scale='linear', - verbosity=0) - if not os.path.exists(FLAGS.output_dir): - os.mkdir(FLAGS.output_dir) - scoring.plot_performance_profiles( - performance_profile_df, 'score', save_dir=FLAGS.output_dir) - - logging.info(performance_profile_df) + + if FLAGS.compute_performance_profiles: + performance_profile_df = score_profile.compute_performance_profiles( + results, + time_col='score', + min_tau=1.0, + max_tau=None, + reference_submission_tag=None, + num_points=100, + scale='linear', + verbosity=0) + if not os.path.exists(FLAGS.output_dir): + os.mkdir(FLAGS.output_dir) + score_profile.plot_performance_profiles( + performance_profile_df, 'score', save_dir=FLAGS.output_dir) + + logging.info(performance_profile_df) if __name__ == '__main__': diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py index 1a15db2f5..86b0c36b6 100644 --- a/scoring/scoring_utils.py +++ b/scoring/scoring_utils.py @@ -1,13 +1,14 @@ import json import os import re -import warnings from absl import logging import pandas as pd -from scoring.scoring import NUM_TRIALS -from scoring.scoring import NUM_WORKLOADS + +from algorithmic_efficiency import spec +from scoring.score_profile import NUM_TRIALS +from scoring.score_profile import NUM_WORKLOADS TRIAL_LINE_REGEX = '(.*) --- Tuning run (\d+)/(\d+) ---' METRICS_LINE_REGEX = '(.*) Metrics: ({.*})' @@ -137,7 +138,7 @@ def get_trials_df(logfile): def get_experiment_df(experiment_dir): """Gets a df of per trial results from an experiment dir. The output df can be provided as input to - scoring.compute_performance_profiles. + score_profilecompute_performance_profiles. Args: experiment_dir: path to experiment directory containing results for workloads. 
@@ -160,9 +161,9 @@ def get_experiment_df(experiment_dir): df = pd.DataFrame() workload_dirs = os.listdir(experiment_dir) num_workloads = len(workload_dirs) - if num_workloads != NUM_WORKLOADS: - warnings.warn(f'There should be {NUM_WORKLOADS} workloads but there are ' - f'{num_workloads}.') + # if num_workloads != NUM_WORKLOADS: + # warnings.warn(f'There should be {NUM_WORKLOADS} workloads but there are ' + # f'{num_workloads}.') for workload in workload_dirs: data = { 'workload': workload, @@ -190,9 +191,5 @@ def get_experiment_df(experiment_dir): data[column] = values trial_df = pd.DataFrame([data]) workload_df = pd.concat([workload_df, trial_df], ignore_index=True) - num_trials = len(workload_df) - if num_trials != NUM_TRIALS: - warnings.warn(f'There should be {NUM_TRIALS} trials for workload ' - f'{workload} but there are only {num_trials}.') df = pd.concat([df, workload_df], ignore_index=True) return df diff --git a/setup.cfg b/setup.cfg index a7ce5ebb2..9aa4ffb5f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -45,6 +45,7 @@ install_requires = psutil==5.9.5 clu==0.0.7 matplotlib>=3.7.2 + tabulate==0.9.0 python_requires = >=3.8 From 26f962c2780578992e5ead74c41dd93537660e99 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Sat, 4 Nov 2023 02:06:35 +0000 Subject: [PATCH 2/8] refactor scoring --- scoring/score_profile.py | 20 +++----------------- scoring/score_submission.py | 15 +++++++++++---- scoring/scoring_utils.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 21 deletions(-) diff --git a/scoring/score_profile.py b/scoring/score_profile.py index 713ac87d2..bf6f01298 100644 --- a/scoring/score_profile.py +++ b/scoring/score_profile.py @@ -34,6 +34,7 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd +import scoring_utils import algorithmic_efficiency.workloads.workloads as workloads_registry @@ -178,23 +179,8 @@ def get_times_for_submission(submission, if num_trials != NUM_TRIALS and not self_tuning_ruleset: logging.warning(f'Expecting {NUM_TRIALS} trials for workload ' f'{workload} but found {num_trials} trials.') - workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1) - framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2) - workload_metadata = WORKLOADS[workload_name] - - # Extend path according to framework. 
- workload_metadata['workload_path'] = os.path.join( - BASE_WORKLOADS_DIR, - workload_metadata['workload_path'] + f'{framework}', - 'workload.py') - workload_init_kwargs = {} - workload_obj = workloads_registry.import_workload( - workload_path=workload_metadata['workload_path'], - workload_class_name=workload_metadata['workload_class_name'], - workload_init_kwargs=workload_init_kwargs) - metric_name = workload_obj.target_metric_name - validation_metric = f'validation/{metric_name}' - validation_target = workload_obj.validation_target_value + validation_metric, validation_target = scoring_utils.get_workload_validation_target(workload) + validation_target = validation_target trial_idx, time_idx = get_index_that_reaches_target( group, validation_metric, validation_target) diff --git a/scoring/score_submission.py b/scoring/score_submission.py index 211de3db6..bd8e44f3b 100644 --- a/scoring/score_submission.py +++ b/scoring/score_submission.py @@ -4,9 +4,8 @@ from absl import flags from absl import logging import scoring_utils - +from tabulate import tabulate from scoring import score_profile -import tabulate flags.DEFINE_string( 'experiment_path', @@ -27,6 +26,14 @@ def main(_): results = { FLAGS.submission_tag: df, } + table = tabulate(df, headers='keys', tablefmt='psql') + logging.info(df) + for workload, group in df.groupby('workload'): + target_metric_name, target_metric_value = scoring_utils.get_workload_validation_target(workload) + print(target_metric_name) + print(target_metric_value) + # print(workload) + # print(group) if FLAGS.compute_performance_profiles: performance_profile_df = score_profile.compute_performance_profiles( @@ -42,8 +49,8 @@ def main(_): os.mkdir(FLAGS.output_dir) score_profile.plot_performance_profiles( performance_profile_df, 'score', save_dir=FLAGS.output_dir) - - logging.info(performance_profile_df) + perf_df = tabulate(performance_profile_df.T, headers='keys', tablefmt='psql') + logging.info(f'Performance profile:\n {perf_df}') if __name__ == '__main__': diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py index 86b0c36b6..bc0703a4c 100644 --- a/scoring/scoring_utils.py +++ b/scoring/scoring_utils.py @@ -1,6 +1,7 @@ import json import os import re +import copy from absl import logging import pandas as pd @@ -9,12 +10,16 @@ from algorithmic_efficiency import spec from scoring.score_profile import NUM_TRIALS from scoring.score_profile import NUM_WORKLOADS +import algorithmic_efficiency.workloads.workloads as workloads_registry TRIAL_LINE_REGEX = '(.*) --- Tuning run (\d+)/(\d+) ---' METRICS_LINE_REGEX = '(.*) Metrics: ({.*})' TRIAL_DIR_REGEX = 'trial_(\d+)' MEASUREMENTS_FILENAME = 'eval_measurements.csv' +WORKLOADS = workloads_registry.WORKLOADS +WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)' +BASE_WORKLOADS_DIR = 'algorithmic_efficiency/workloads/' #### File IO helper functions ### def get_logfile_paths(logdir): @@ -193,3 +198,30 @@ def get_experiment_df(experiment_dir): workload_df = pd.concat([workload_df, trial_df], ignore_index=True) df = pd.concat([df, workload_df], ignore_index=True) return df + + +## Get workload properties +def get_workload_validation_target(workload): + """Returns workload target metric name and value. + """ + print(workload) + workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1) + framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2) + workload_metadata = copy.copy(WORKLOADS[workload_name]) + print(workload_metadata) + + # Extend path according to framework. 
+ workload_metadata['workload_path'] = os.path.join( + BASE_WORKLOADS_DIR, + workload_metadata['workload_path'] + f'{framework}', + 'workload.py') + workload_init_kwargs = {} + print(workload_metadata['workload_path']) + workload_obj = workloads_registry.import_workload( + workload_path=workload_metadata['workload_path'], + workload_class_name=workload_metadata['workload_class_name'], + workload_init_kwargs=workload_init_kwargs) + metric_name = workload_obj.target_metric_name + validation_metric = f'validation/{metric_name}' + validation_target = workload_obj.validation_target_value + return validation_metric, validation_target \ No newline at end of file From aa75f3eaeb4ca71957311b4d6c66f8a325398b98 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 8 Nov 2023 01:03:34 +0000 Subject: [PATCH 3/8] add scoring table --- scoring/score_profile.py | 368 ------------------------------------ scoring/score_submission.py | 59 ++++-- scoring/scoring_utils.py | 13 +- 3 files changed, 49 insertions(+), 391 deletions(-) delete mode 100644 scoring/score_profile.py diff --git a/scoring/score_profile.py b/scoring/score_profile.py deleted file mode 100644 index bf6f01298..000000000 --- a/scoring/score_profile.py +++ /dev/null @@ -1,368 +0,0 @@ -"""Performance and scoring code. - -The three primary methods exposed by the `scoring` module are: -- `compute_performance_profiles`: generates performance profiles for a set of - submissions over all workloads as defined in the scoring rules: - https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md -- `compute_leaderboard_score`: computes final scores from performance profiles. -- `plot_performance_profiles`: plot performance profiles for a set of - submissions. - -The two primary inputs to `compute_performance_profiles` are -1. A dictionary of pandas DataFrames, where each key is a globally unique - identifier for a submission and each value is a DataFrame containing one row - per trial per workload in that submission. At minimum, this DataFrame should - include a column of np.arrays indicating time (e.g., 'global_step'), a column - of np.arrays indicating performance (e.g., 'validation/accuracy') for each - workload and a column 'workload' that indicates the workload identifier. -2. A dictionary of workload metadata describing each workload in the form: - { - 'workload_identifier': { - 'target': VALUE, - 'metric': 'validation/error_rate', - } - } - The keys in this dictionary should match the workload identifiers used in - the dictionary of submissions. -""" -from absl import logging -import itertools -import operator -import os -import re - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import scoring_utils - -import algorithmic_efficiency.workloads.workloads as workloads_registry - -WORKLOADS = workloads_registry.WORKLOADS -WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)' -BASE_WORKLOADS_DIR = 'algorithmic_efficiency/workloads/' -# These global variables have to be set according to the current set of -# workloads and rules for the scoring to be correct. -# We do not use the workload registry since it contains test and development -# workloads as well. 
-NUM_WORKLOADS = 8 -NUM_TRIALS = 5 - -MIN_EVAL_METRICS = [ - 'ce_loss', - 'error_rate', - 'ctc_loss', - 'wer', - 'l1_loss', - 'loss', -] - -MAX_EVAL_METRICS = ['mean_average_precision', 'ssim', 'accuracy', 'bleu'] - - -def generate_eval_cols(metrics): - splits = ['train', 'validation'] - return [f'{split}/{col}' for split, col in itertools.product(splits, metrics)] - - -MINIMIZE_REGISTRY = {k: True for k in generate_eval_cols(MIN_EVAL_METRICS)} -MINIMIZE_REGISTRY.update( - {k: False for k in generate_eval_cols(MAX_EVAL_METRICS)}) -MINIMIZE_REGISTRY['train_cost'] = True - - -def check_if_minimized(col_name): - """Guess if the eval metric column name should be minimized or not.""" - for prefix in ['best_', 'final_']: - col_name = col_name.replace(prefix, '') - for col in MINIMIZE_REGISTRY: - if col in col_name: - return MINIMIZE_REGISTRY[col] - - raise ValueError(f'Column {col_name} not found in `MINIMIZE_REGISTRY` as ' - 'either a column name or a substring of a column name.') - - -def get_index_that_reaches_best(workload_df, metric_col): - """Get the eval index in which a workload reaches the best on metric_col. - - Args: - workload_df: A subset of a submission's trials DataFrame that - includes only the trials in a single workload. - metric_col: Name of array column in workload_df - (e.g., `validation/l1_loss`). - - Returns: - Tuple of trial index, time index, and best value where the workload - reached the best metric_col. Return (-1, -1, -1) if no undiverged trials. - """ - is_minimized = check_if_minimized(metric_col) - series = workload_df[metric_col] - - series = series[series != np.nan] - - op = np.min if is_minimized else np.max - best = series.apply(op) - - op_idx = np.argmin if is_minimized else np.argmax - best_idx = series.apply(op_idx) - - if best.empty: - return -1, -1, -1 - else: - trial = best.idxmin() if is_minimized else best.idxmax() - return trial, best_idx[trial], best[trial] - - -def get_index_that_reaches_target(workload_df, - validation_metric, - validation_target): - """Get the eval index in which a workload reaches the target metric_col. - - Args: - workload_df: A subset of a submission's trials DataFrame that - includes only the trials in a single workload. - metric_col: Name of array column in workload_df (e.g. `validation/l1_loss`). - target: Target value for metric_col. - - Returns: - Tuple of trial index and time index where the workload reached the target - metric_col. Return (-1, -1) if not reached. - """ - is_minimized = check_if_minimized(validation_metric) - validation_series = workload_df[validation_metric] - validation_series = validation_series[validation_series != np.nan] - - op = operator.le if is_minimized else operator.ge - validation_target_reached = validation_series.apply( - lambda x: op(x, validation_target)) - target_reached = pd.Series(validation_target_reached) - # Remove trials that never reach the target - target_reached = target_reached[target_reached.apply(np.any)] - - # If less than 3 trials reach the target, the submission will be scored as - # missing the target on this workload; return -1. Else, return the eval index - # of the earliest point the target is reached. - if len(target_reached) < 3: - return -1, -1 - else: - index_reached = target_reached.apply(np.argmax) - trial = index_reached.idxmin() - return trial, index_reached[trial] - - -def get_times_for_submission(submission, - submission_tag, - time_col='global_step', - verbosity=1, - self_tuning_ruleset=False): - """Get times to target for each workload in a submission. 
- - Args: - submission: A DataFrame containing one row for each trial in each workload - for a given submission. - submission_tag: Globally unique identified for a submission. - time_col: A string indicating which column to use for time. - verbosity: Debug level of information; choice of (1, 2, 3). - - Returns: - DataFrame with columns `submission`, `workload`, and time_col. - """ - workloads = [] - submission_name = submission_tag.split('.')[1] - num_workloads = len(submission.groupby('workload')) - if num_workloads != NUM_WORKLOADS: - logging.warning(f'Expecting {NUM_WORKLOADS} workloads ' - f'but found {num_workloads} workloads.') - for workload, group in submission.groupby('workload'): - num_trials = len(group) - if num_trials != NUM_TRIALS and not self_tuning_ruleset: - logging.warning(f'Expecting {NUM_TRIALS} trials for workload ' - f'{workload} but found {num_trials} trials.') - validation_metric, validation_target = scoring_utils.get_workload_validation_target(workload) - validation_target = validation_target - - trial_idx, time_idx = get_index_that_reaches_target( - group, validation_metric, validation_target) - if time_idx > -1: - time_val = group[time_col].loc[trial_idx][time_idx] - else: - time_val = float('inf') - - workloads.append({ - 'submission': submission_name, - 'workload': workload, - time_col: time_val, - }) - - if verbosity > 0: - print(' hparams:') - if time_idx > -1: - hparams = group.loc[trial_idx, 'hparams'] - for key, val in hparams.items(): - print(f' - {key}: {val}') - else: - print('Submission did not reach target') - df = pd.DataFrame.from_records(workloads) - df = df.pivot(index='submission', columns='workload', values=time_col) - - return df - - -def compute_performance_profiles(results, - time_col='global_step', - min_tau=1.0, - max_tau=None, - reference_submission_tag=None, - num_points=100, - scale='linear', - verbosity=0): - """Compute performance profiles for a set of submission by some time column. - - Args: - results: Dict where keys are submission names and values are a DataFrame of - trials where each row is a trial and each column is a field for a given - trial. Results should contain keys for each workload's metric, time_col, - 'workload'. See file header comment for more details. - time_col: A string indicating which column to use for time. - min_tau: Minimum tau to use for plotting. - max_tau: Maximum tau to use for plotting. - reference_submission_tag: If specified, must be an element of - `submission_tags`. Used as the denominator for computing tau. Otherwise, - the minimum time to target is computed per-workload and used as the - denominator for tau. - num_points: Number of points to use for plotting. - scale: Linear or log scale for the x-axis. - verbosity: Debug level of information; choice of (1, 2, 3). - - Returns: - A DataFrame of performance profiles for the set of submissions given in - `results` based on `time_col`. Each row represents a submission and each - column represents rho(tau) for some value of tau (df.volumns are the - different values of tau). 
- """ - dfs = [] - - for submission_tag, result in results.items(): - logging.info(f'\nComputing performance profile with respect to `{time_col}` for ' - f'{submission_tag}') - dfs.append( - get_times_for_submission(result, submission_tag, time_col, verbosity)) - df = pd.concat(dfs) - - if verbosity > 0: - logging.info('\n`{time_col}` to reach target:') - with pd.option_context('display.max_rows', - None, - 'display.max_columns', - None, - 'display.width', - 1000): - logging.info(df) - - # Divide by the fastest. - if reference_submission_tag is None: - df.update(df.div(df.min(axis=0), axis=1)) - else: - df.update(df.div(df.loc[reference_submission_tag, :], axis=1)) - - if verbosity > 0: - logging.info('\n`{time_col}` to reach target normalized to best:') - with pd.option_context('display.max_rows', - None, - 'display.max_columns', - None, - 'display.width', - 1000): - logging.info(df) - - # If no max_tau is supplied, choose the value of tau that would plot all non - # inf or nan data. - if max_tau is None: - max_tau = df.replace(float('inf'), -1).replace(np.nan, -1).values.max() - - if scale == 'linear': - points = np.linspace(min_tau, max_tau, num=num_points) - elif scale == 'log': - points = np.logspace( - np.log10(min_tau), np.log10(max_tau), num=num_points, base=10.0) - - def rho(r, tau): - return (r <= tau).sum(axis=1) / NUM_WORKLOADS - - perf_df = pd.concat([rho(df, tau) for tau in points], axis=1) - - cols = points - if scale == 'log': - cols = np.log10(points) - perf_df.columns = cols - - return perf_df - - -def compute_leaderboard_score(df, normalize=False): - """Compute leaderboard score by taking integral of performance profile. - - Args: - df: pd.DataFrame returned from `compute_performance_profiles`. - normalize: divide by the range of the performance profile's tau. - - Returns: - pd.DataFrame with one column of scores indexed by submission. - """ - scores = np.trapz(df, x=df.columns) - if normalize: - scores /= df.columns.max() - df.columns.min() - return pd.DataFrame(scores, columns=['score'], index=df.index) - - -def maybe_save_figure(save_dir, name, ext='pdf'): - """Maybe save the current matplotlib.pyplot figure.""" - if save_dir: - path = os.path.join(save_dir, f'{name}.{ext}') - with open(path, 'wb') as fout: - plt.savefig(fout, format=ext) - - -def maybe_save_df_to_csv(save_dir, df, path, **to_csv_kwargs): - if save_dir: - path = os.path.join(save_dir, path) - with open(path, 'w') as fout: - df.to_csv(fout, **to_csv_kwargs) - - -def plot_performance_profiles(perf_df, - df_col, - scale='linear', - save_dir=None, - figsize=(30, 10), - font_size=18): - """Plot performance profiles. - - Args: - perf_df: A DataFrame of performance profiles where each row represents a - submission and each column represents rho(tau) for some value of tau - (df.volumns are the different values of tau). - df_col: The column in the original submission results DataFrame used to - compute the performance profile. This argument is only used for axis - and file naming. - scale: Whether or not the data in perf_df is on a linear or log scale. This - argument is only used for axis and file naming. - save_dir: If a valid directory is provided, save both the plot and perf_df - to the provided directory. - figsize: The size of the plot. - font_size: The font size to use for the legend. - - Returns: - None. If a valid save_dir is provided, save both the plot and perf_df. 
- """ - fig = perf_df.T.plot(figsize=figsize) - df_col_display = f'log10({df_col})' if scale == 'log' else df_col - fig.set_xlabel( - f'Ratio of `{df_col_display}` to best submission', size=font_size) - fig.set_ylabel('Proportion of workloads', size=font_size) - fig.legend(prop={'size': font_size}, bbox_to_anchor=(1.0, 1.0)) - maybe_save_figure(save_dir, f'performance_profile_by_{df_col_display}') - maybe_save_df_to_csv(save_dir, - perf_df, - f'performance_profile_{df_col_display}.csv') diff --git a/scoring/score_submission.py b/scoring/score_submission.py index bd8e44f3b..ad6573589 100644 --- a/scoring/score_submission.py +++ b/scoring/score_submission.py @@ -3,9 +3,14 @@ from absl import app from absl import flags from absl import logging +import pandas as pd +import numpy as np +import operator + import scoring_utils from tabulate import tabulate -from scoring import score_profile +from scoring import performance_profile +from scoring.performance_profile import check_if_minimized flags.DEFINE_string( 'experiment_path', @@ -16,27 +21,54 @@ 'scoring_results', 'Path to save performance profile table and plot.') flags.DEFINE_boolean('compute_performance_profiles', - True, - 'Whether or not to compute the performance profiles.') + False, + 'Whether or not to compute the performance profiles.') FLAGS = flags.FLAGS +def get_summary_df(workload, workload_df): + validation_metric, validation_target = scoring_utils.get_workload_validation_target(workload) + is_minimized = check_if_minimized(validation_metric) + target_op = operator.le if is_minimized else operator.ge + best_op = min if is_minimized else max + idx_op = np.argmin if is_minimized else np.argmax + + summary_df = pd.DataFrame() + summary_df['workload'] = workload_df['workload'] + summary_df['trial'] = workload_df['trial'] + summary_df['target metric name'] = validation_metric + summary_df['target metric value'] = validation_target + + summary_df['target reached'] = workload_df[validation_metric].apply( + lambda x: target_op(x, validation_target)).apply(np.any) + summary_df['best target'] = workload_df[validation_metric].apply( + lambda x: best_op(x)) + workload_df['index best eval'] = workload_df[validation_metric].apply( + lambda x: idx_op(x)) + summary_df['submission time'] = workload_df.apply( + lambda x: x['accumulated_submission_time'][x['index best eval']], axis=1) + summary_df['score'] = summary_df.apply( + lambda x: x['submission time'] if x['target reached'] else np.inf, axis=1) + + return summary_df + + def main(_): df = scoring_utils.get_experiment_df(FLAGS.experiment_path) results = { FLAGS.submission_tag: df, } - table = tabulate(df, headers='keys', tablefmt='psql') - logging.info(df) + + dfs = [] for workload, group in df.groupby('workload'): - target_metric_name, target_metric_value = scoring_utils.get_workload_validation_target(workload) - print(target_metric_name) - print(target_metric_value) - # print(workload) - # print(group) + summary_df = get_summary_df(workload, group) + dfs.append(summary_df) + + df = pd.concat(dfs) + print(tabulate(df, headers='keys', tablefmt='psql')) if FLAGS.compute_performance_profiles: - performance_profile_df = score_profile.compute_performance_profiles( + performance_profile_df = performance_profile.compute_performance_profiles( results, time_col='score', min_tau=1.0, @@ -47,9 +79,10 @@ def main(_): verbosity=0) if not os.path.exists(FLAGS.output_dir): os.mkdir(FLAGS.output_dir) - score_profile.plot_performance_profiles( + performance_profile.plot_performance_profiles( 
performance_profile_df, 'score', save_dir=FLAGS.output_dir) - perf_df = tabulate(performance_profile_df.T, headers='keys', tablefmt='psql') + perf_df = tabulate( + performance_profile_df.T, headers='keys', tablefmt='psql') logging.info(f'Performance profile:\n {perf_df}') diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py index bc0703a4c..99a58747a 100644 --- a/scoring/scoring_utils.py +++ b/scoring/scoring_utils.py @@ -6,10 +6,7 @@ from absl import logging import pandas as pd - from algorithmic_efficiency import spec -from scoring.score_profile import NUM_TRIALS -from scoring.score_profile import NUM_WORKLOADS import algorithmic_efficiency.workloads.workloads as workloads_registry TRIAL_LINE_REGEX = '(.*) --- Tuning run (\d+)/(\d+) ---' @@ -21,6 +18,7 @@ WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)' BASE_WORKLOADS_DIR = 'algorithmic_efficiency/workloads/' + #### File IO helper functions ### def get_logfile_paths(logdir): """Gets all files ending in .log in logdir @@ -166,9 +164,6 @@ def get_experiment_df(experiment_dir): df = pd.DataFrame() workload_dirs = os.listdir(experiment_dir) num_workloads = len(workload_dirs) - # if num_workloads != NUM_WORKLOADS: - # warnings.warn(f'There should be {NUM_WORKLOADS} workloads but there are ' - # f'{num_workloads}.') for workload in workload_dirs: data = { 'workload': workload, @@ -203,12 +198,10 @@ def get_experiment_df(experiment_dir): ## Get workload properties def get_workload_validation_target(workload): """Returns workload target metric name and value. - """ - print(workload) + """ workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1) framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2) workload_metadata = copy.copy(WORKLOADS[workload_name]) - print(workload_metadata) # Extend path according to framework. 
workload_metadata['workload_path'] = os.path.join( @@ -224,4 +217,4 @@ def get_workload_validation_target(workload): metric_name = workload_obj.target_metric_name validation_metric = f'validation/{metric_name}' validation_target = workload_obj.validation_target_value - return validation_metric, validation_target \ No newline at end of file + return validation_metric, validation_target From 217e5041b3b39fd7649f6b8dd19136fd2b14b84e Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 8 Nov 2023 01:16:28 +0000 Subject: [PATCH 4/8] linting --- scoring/score_submission.py | 6 +++--- scoring/scoring_utils.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scoring/score_submission.py b/scoring/score_submission.py index ad6573589..f284044a3 100644 --- a/scoring/score_submission.py +++ b/scoring/score_submission.py @@ -1,14 +1,14 @@ +import operator import os from absl import app from absl import flags from absl import logging -import pandas as pd import numpy as np -import operator - +import pandas as pd import scoring_utils from tabulate import tabulate + from scoring import performance_profile from scoring.performance_profile import check_if_minimized diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py index 99a58747a..70fb9b05f 100644 --- a/scoring/scoring_utils.py +++ b/scoring/scoring_utils.py @@ -1,7 +1,7 @@ +import copy import json import os import re -import copy from absl import logging import pandas as pd From 6a9048d47f8941192388f01e04ddbccc8d49e747 Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 8 Nov 2023 01:51:49 +0000 Subject: [PATCH 5/8] remove unused import --- scoring/scoring_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scoring/scoring_utils.py b/scoring/scoring_utils.py index 70fb9b05f..45665c011 100644 --- a/scoring/scoring_utils.py +++ b/scoring/scoring_utils.py @@ -6,7 +6,6 @@ from absl import logging import pandas as pd -from algorithmic_efficiency import spec import algorithmic_efficiency.workloads.workloads as workloads_registry TRIAL_LINE_REGEX = '(.*) --- Tuning run (\d+)/(\d+) ---' From c668c1cee4a81a6b362dadb8d9d8585f6ed1a24a Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 8 Nov 2023 20:02:34 +0000 Subject: [PATCH 6/8] add performance_profile.py --- scoring/performance_profile.py | 369 +++++++++++++++++++++++++++++++++ 1 file changed, 369 insertions(+) create mode 100644 scoring/performance_profile.py diff --git a/scoring/performance_profile.py b/scoring/performance_profile.py new file mode 100644 index 000000000..3aa531e26 --- /dev/null +++ b/scoring/performance_profile.py @@ -0,0 +1,369 @@ +"""Performance and scoring code. + +The three primary methods exposed by the `scoring` module are: +- `compute_performance_profiles`: generates performance profiles for a set of + submissions over all workloads as defined in the scoring rules: + https://github.com/mlcommons/algorithmic-efficiency/blob/main/RULES.md +- `compute_leaderboard_score`: computes final scores from performance profiles. +- `plot_performance_profiles`: plot performance profiles for a set of + submissions. + +The two primary inputs to `compute_performance_profiles` are +1. A dictionary of pandas DataFrames, where each key is a globally unique + identifier for a submission and each value is a DataFrame containing one row + per trial per workload in that submission. 
At minimum, this DataFrame should + include a column of np.arrays indicating time (e.g., 'global_step'), a column + of np.arrays indicating performance (e.g., 'validation/accuracy') for each + workload and a column 'workload' that indicates the workload identifier. +2. A dictionary of workload metadata describing each workload in the form: + { + 'workload_identifier': { + 'target': VALUE, + 'metric': 'validation/error_rate', + } + } + The keys in this dictionary should match the workload identifiers used in + the dictionary of submissions. +""" +import itertools +import operator +import os +import re + +from absl import logging +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import scoring_utils + +import algorithmic_efficiency.workloads.workloads as workloads_registry + +WORKLOADS = workloads_registry.WORKLOADS +WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)' +BASE_WORKLOADS_DIR = 'algorithmic_efficiency/workloads/' +# These global variables have to be set according to the current set of +# workloads and rules for the scoring to be correct. +# We do not use the workload registry since it contains test and development +# workloads as well. +NUM_WORKLOADS = 8 +NUM_TRIALS = 5 + +MIN_EVAL_METRICS = [ + 'ce_loss', + 'error_rate', + 'ctc_loss', + 'wer', + 'l1_loss', + 'loss', +] + +MAX_EVAL_METRICS = ['mean_average_precision', 'ssim', 'accuracy', 'bleu'] + + +def generate_eval_cols(metrics): + splits = ['train', 'validation'] + return [f'{split}/{col}' for split, col in itertools.product(splits, metrics)] + + +MINIMIZE_REGISTRY = {k: True for k in generate_eval_cols(MIN_EVAL_METRICS)} +MINIMIZE_REGISTRY.update( + {k: False for k in generate_eval_cols(MAX_EVAL_METRICS)}) +MINIMIZE_REGISTRY['train_cost'] = True + + +def check_if_minimized(col_name): + """Guess if the eval metric column name should be minimized or not.""" + for prefix in ['best_', 'final_']: + col_name = col_name.replace(prefix, '') + for col in MINIMIZE_REGISTRY: + if col in col_name: + return MINIMIZE_REGISTRY[col] + + raise ValueError(f'Column {col_name} not found in `MINIMIZE_REGISTRY` as ' + 'either a column name or a substring of a column name.') + + +def get_index_that_reaches_best(workload_df, metric_col): + """Get the eval index in which a workload reaches the best on metric_col. + + Args: + workload_df: A subset of a submission's trials DataFrame that + includes only the trials in a single workload. + metric_col: Name of array column in workload_df + (e.g., `validation/l1_loss`). + + Returns: + Tuple of trial index, time index, and best value where the workload + reached the best metric_col. Return (-1, -1, -1) if no undiverged trials. + """ + is_minimized = check_if_minimized(metric_col) + series = workload_df[metric_col] + + series = series[series != np.nan] + + op = np.min if is_minimized else np.max + best = series.apply(op) + + op_idx = np.argmin if is_minimized else np.argmax + best_idx = series.apply(op_idx) + + if best.empty: + return -1, -1, -1 + else: + trial = best.idxmin() if is_minimized else best.idxmax() + return trial, best_idx[trial], best[trial] + + +def get_index_that_reaches_target(workload_df, + validation_metric, + validation_target): + """Get the eval index in which a workload reaches the target metric_col. + + Args: + workload_df: A subset of a submission's trials DataFrame that + includes only the trials in a single workload. + metric_col: Name of array column in workload_df (e.g. `validation/l1_loss`). + target: Target value for metric_col. 
+ + Returns: + Tuple of trial index and time index where the workload reached the target + metric_col. Return (-1, -1) if not reached. + """ + is_minimized = check_if_minimized(validation_metric) + validation_series = workload_df[validation_metric] + validation_series = validation_series[validation_series != np.nan] + + op = operator.le if is_minimized else operator.ge + validation_target_reached = validation_series.apply( + lambda x: op(x, validation_target)) + target_reached = pd.Series(validation_target_reached) + # Remove trials that never reach the target + target_reached = target_reached[target_reached.apply(np.any)] + + # If less than 3 trials reach the target, the submission will be scored as + # missing the target on this workload; return -1. Else, return the eval index + # of the earliest point the target is reached. + if len(target_reached) < 3: + return -1, -1 + else: + index_reached = target_reached.apply(np.argmax) + trial = index_reached.idxmin() + return trial, index_reached[trial] + + +def get_times_for_submission(submission, + submission_tag, + time_col='global_step', + verbosity=1, + self_tuning_ruleset=False): + """Get times to target for each workload in a submission. + + Args: + submission: A DataFrame containing one row for each trial in each workload + for a given submission. + submission_tag: Globally unique identified for a submission. + time_col: A string indicating which column to use for time. + verbosity: Debug level of information; choice of (1, 2, 3). + + Returns: + DataFrame with columns `submission`, `workload`, and time_col. + """ + workloads = [] + submission_name = submission_tag.split('.')[1] + num_workloads = len(submission.groupby('workload')) + if num_workloads != NUM_WORKLOADS: + logging.warning(f'Expecting {NUM_WORKLOADS} workloads ' + f'but found {num_workloads} workloads.') + for workload, group in submission.groupby('workload'): + num_trials = len(group) + if num_trials != NUM_TRIALS and not self_tuning_ruleset: + logging.warning(f'Expecting {NUM_TRIALS} trials for workload ' + f'{workload} but found {num_trials} trials.') + validation_metric, validation_target = scoring_utils.get_workload_validation_target(workload) + validation_target = validation_target + + trial_idx, time_idx = get_index_that_reaches_target( + group, validation_metric, validation_target) + if time_idx > -1: + time_val = group[time_col].loc[trial_idx][time_idx] + else: + time_val = float('inf') + + workloads.append({ + 'submission': submission_name, + 'workload': workload, + time_col: time_val, + }) + + if verbosity > 0: + print(' hparams:') + if time_idx > -1: + hparams = group.loc[trial_idx, 'hparams'] + for key, val in hparams.items(): + print(f' - {key}: {val}') + else: + print('Submission did not reach target') + df = pd.DataFrame.from_records(workloads) + df = df.pivot(index='submission', columns='workload', values=time_col) + + return df + + +def compute_performance_profiles(results, + time_col='global_step', + min_tau=1.0, + max_tau=None, + reference_submission_tag=None, + num_points=100, + scale='linear', + verbosity=0): + """Compute performance profiles for a set of submission by some time column. + + Args: + results: Dict where keys are submission names and values are a DataFrame of + trials where each row is a trial and each column is a field for a given + trial. Results should contain keys for each workload's metric, time_col, + 'workload'. See file header comment for more details. + time_col: A string indicating which column to use for time. 
+ min_tau: Minimum tau to use for plotting. + max_tau: Maximum tau to use for plotting. + reference_submission_tag: If specified, must be an element of + `submission_tags`. Used as the denominator for computing tau. Otherwise, + the minimum time to target is computed per-workload and used as the + denominator for tau. + num_points: Number of points to use for plotting. + scale: Linear or log scale for the x-axis. + verbosity: Debug level of information; choice of (1, 2, 3). + + Returns: + A DataFrame of performance profiles for the set of submissions given in + `results` based on `time_col`. Each row represents a submission and each + column represents rho(tau) for some value of tau (df.volumns are the + different values of tau). + """ + dfs = [] + + for submission_tag, result in results.items(): + logging.info( + f'\nComputing performance profile with respect to `{time_col}` for ' + f'{submission_tag}') + dfs.append( + get_times_for_submission(result, submission_tag, time_col, verbosity)) + df = pd.concat(dfs) + + if verbosity > 0: + logging.info('\n`{time_col}` to reach target:') + with pd.option_context('display.max_rows', + None, + 'display.max_columns', + None, + 'display.width', + 1000): + logging.info(df) + + # Divide by the fastest. + if reference_submission_tag is None: + df.update(df.div(df.min(axis=0), axis=1)) + else: + df.update(df.div(df.loc[reference_submission_tag, :], axis=1)) + + if verbosity > 0: + logging.info('\n`{time_col}` to reach target normalized to best:') + with pd.option_context('display.max_rows', + None, + 'display.max_columns', + None, + 'display.width', + 1000): + logging.info(df) + + # If no max_tau is supplied, choose the value of tau that would plot all non + # inf or nan data. + if max_tau is None: + max_tau = df.replace(float('inf'), -1).replace(np.nan, -1).values.max() + + if scale == 'linear': + points = np.linspace(min_tau, max_tau, num=num_points) + elif scale == 'log': + points = np.logspace( + np.log10(min_tau), np.log10(max_tau), num=num_points, base=10.0) + + def rho(r, tau): + return (r <= tau).sum(axis=1) / NUM_WORKLOADS + + perf_df = pd.concat([rho(df, tau) for tau in points], axis=1) + + cols = points + if scale == 'log': + cols = np.log10(points) + perf_df.columns = cols + + return perf_df + + +def compute_leaderboard_score(df, normalize=False): + """Compute leaderboard score by taking integral of performance profile. + + Args: + df: pd.DataFrame returned from `compute_performance_profiles`. + normalize: divide by the range of the performance profile's tau. + + Returns: + pd.DataFrame with one column of scores indexed by submission. + """ + scores = np.trapz(df, x=df.columns) + if normalize: + scores /= df.columns.max() - df.columns.min() + return pd.DataFrame(scores, columns=['score'], index=df.index) + + +def maybe_save_figure(save_dir, name, ext='pdf'): + """Maybe save the current matplotlib.pyplot figure.""" + if save_dir: + path = os.path.join(save_dir, f'{name}.{ext}') + with open(path, 'wb') as fout: + plt.savefig(fout, format=ext) + + +def maybe_save_df_to_csv(save_dir, df, path, **to_csv_kwargs): + if save_dir: + path = os.path.join(save_dir, path) + with open(path, 'w') as fout: + df.to_csv(fout, **to_csv_kwargs) + + +def plot_performance_profiles(perf_df, + df_col, + scale='linear', + save_dir=None, + figsize=(30, 10), + font_size=18): + """Plot performance profiles. 
+ + Args: + perf_df: A DataFrame of performance profiles where each row represents a + submission and each column represents rho(tau) for some value of tau + (df.volumns are the different values of tau). + df_col: The column in the original submission results DataFrame used to + compute the performance profile. This argument is only used for axis + and file naming. + scale: Whether or not the data in perf_df is on a linear or log scale. This + argument is only used for axis and file naming. + save_dir: If a valid directory is provided, save both the plot and perf_df + to the provided directory. + figsize: The size of the plot. + font_size: The font size to use for the legend. + + Returns: + None. If a valid save_dir is provided, save both the plot and perf_df. + """ + fig = perf_df.T.plot(figsize=figsize) + df_col_display = f'log10({df_col})' if scale == 'log' else df_col + fig.set_xlabel( + f'Ratio of `{df_col_display}` to best submission', size=font_size) + fig.set_ylabel('Proportion of workloads', size=font_size) + fig.legend(prop={'size': font_size}, bbox_to_anchor=(1.0, 1.0)) + maybe_save_figure(save_dir, f'performance_profile_by_{df_col_display}') + maybe_save_df_to_csv(save_dir, + perf_df, + f'performance_profile_{df_col_display}.csv') From 11e70c0f91915eb8d4f5a9059967941fbcb1820c Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 9 Nov 2023 21:06:56 +0000 Subject: [PATCH 7/8] style fixes --- scoring/performance_profile.py | 3 +-- scoring/score_submission.py | 5 ++--- scoring/scoring_utils.py | 6 ++---- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/scoring/performance_profile.py b/scoring/performance_profile.py index 3aa531e26..01241f95d 100644 --- a/scoring/performance_profile.py +++ b/scoring/performance_profile.py @@ -34,7 +34,7 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -import scoring_utils +from scoring import scoring_utils import algorithmic_efficiency.workloads.workloads as workloads_registry @@ -180,7 +180,6 @@ def get_times_for_submission(submission, logging.warning(f'Expecting {NUM_TRIALS} trials for workload ' f'{workload} but found {num_trials} trials.') validation_metric, validation_target = scoring_utils.get_workload_validation_target(workload) - validation_target = validation_target trial_idx, time_idx = get_index_that_reaches_target( group, validation_metric, validation_target) diff --git a/scoring/score_submission.py b/scoring/score_submission.py index f284044a3..0dd84ff55 100644 --- a/scoring/score_submission.py +++ b/scoring/score_submission.py @@ -10,7 +10,6 @@ from tabulate import tabulate from scoring import performance_profile -from scoring.performance_profile import check_if_minimized flags.DEFINE_string( 'experiment_path', @@ -28,7 +27,7 @@ def get_summary_df(workload, workload_df): validation_metric, validation_target = scoring_utils.get_workload_validation_target(workload) - is_minimized = check_if_minimized(validation_metric) + is_minimized = performance_profile.check_if_minimized(validation_metric) target_op = operator.le if is_minimized else operator.ge best_op = min if is_minimized else max idx_op = np.argmin if is_minimized else np.argmax @@ -65,7 +64,7 @@ def main(_): dfs.append(summary_df) df = pd.concat(dfs) - print(tabulate(df, headers='keys', tablefmt='psql')) + logging.info(tabulate(df, headers='keys', tablefmt='psql')) if FLAGS.compute_performance_profiles: performance_profile_df = performance_profile.compute_performance_profiles( diff --git a/scoring/scoring_utils.py 
b/scoring/scoring_utils.py index 45665c011..d10617896 100644 --- a/scoring/scoring_utils.py +++ b/scoring/scoring_utils.py @@ -140,7 +140,7 @@ def get_trials_df(logfile): def get_experiment_df(experiment_dir): """Gets a df of per trial results from an experiment dir. The output df can be provided as input to - score_profilecompute_performance_profiles. + performance_profile.compute_performance_profiles. Args: experiment_dir: path to experiment directory containing results for workloads. @@ -196,8 +196,7 @@ def get_experiment_df(experiment_dir): ## Get workload properties def get_workload_validation_target(workload): - """Returns workload target metric name and value. - """ + """Returns workload target metric name and value.""" workload_name = re.match(WORKLOAD_NAME_PATTERN, workload).group(1) framework = re.match(WORKLOAD_NAME_PATTERN, workload).group(2) workload_metadata = copy.copy(WORKLOADS[workload_name]) @@ -208,7 +207,6 @@ def get_workload_validation_target(workload): workload_metadata['workload_path'] + f'{framework}', 'workload.py') workload_init_kwargs = {} - print(workload_metadata['workload_path']) workload_obj = workloads_registry.import_workload( workload_path=workload_metadata['workload_path'], workload_class_name=workload_metadata['workload_class_name'], From dca6a26f7c50cccbce661db74626a82dea15d4ff Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Thu, 9 Nov 2023 21:22:14 +0000 Subject: [PATCH 8/8] import fix --- scoring/performance_profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scoring/performance_profile.py b/scoring/performance_profile.py index 01241f95d..e62e8e18e 100644 --- a/scoring/performance_profile.py +++ b/scoring/performance_profile.py @@ -34,9 +34,9 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from scoring import scoring_utils import algorithmic_efficiency.workloads.workloads as workloads_registry +from scoring import scoring_utils WORKLOADS = workloads_registry.WORKLOADS WORKLOAD_NAME_PATTERN = '(.*)(_jax|_pytorch)'
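For reference, below is a minimal usage sketch (not part of any patch in this series) showing how the modules fit together once all eight patches are applied. It mirrors the flow of `scoring/score_submission.py` after the refactor, but calls the library functions directly; the experiment directory and submission tag are hypothetical placeholders, and the `compute_leaderboard_score` call at the end is optional — the patched script itself stops at the performance-profile table and plot.

from tabulate import tabulate

from scoring import performance_profile
from scoring import scoring_utils

# Hypothetical inputs; substitute a real experiment directory and tag.
# The tag is split on '.' internally, so it should contain a dot.
experiment_dir = 'experiments/my_experiment'
submission_tag = 'my_experiment.my_submission'

# One row per trial per workload, parsed from each trial's
# eval_measurements.csv under the experiment directory.
df = scoring_utils.get_experiment_df(experiment_dir)
results = {submission_tag: df}

# Performance profiles: one row per submission, one column per value of tau
# (same arguments as the call in score_submission.py).
profile_df = performance_profile.compute_performance_profiles(
    results,
    time_col='score',
    min_tau=1.0,
    max_tau=None,
    reference_submission_tag=None,
    num_points=100,
    scale='linear',
    verbosity=0)
print(tabulate(profile_df.T, headers='keys', tablefmt='psql'))

# The leaderboard score is the (optionally normalized) integral of the
# performance profile over tau.
print(performance_profile.compute_leaderboard_score(profile_df, normalize=True))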