From 9367151634325f42c2d9cf2e865f117399ad68b4 Mon Sep 17 00:00:00 2001
From: Aliaksandr Ivanou <aivanou@fb.com>
Date: Thu, 14 Oct 2021 11:36:21 -0700
Subject: [PATCH 01/14] Add interpret docs to example component, remove `test`
 arg from cv trainer

---
 torchx/components/interpret.py                |   2 +-
 .../apps/lightning_classy_vision/component.py | 106 ++++++++++--------
 .../apps/lightning_classy_vision/data.py      |   2 +-
 .../apps/lightning_classy_vision/interpret.py |  12 +-
 .../test/train_test.py                        |  29 +++++
 .../apps/lightning_classy_vision/train.py     |   9 +-
 6 files changed, 101 insertions(+), 59 deletions(-)
 create mode 100644 torchx/examples/apps/lightning_classy_vision/test/train_test.py

diff --git a/torchx/components/interpret.py b/torchx/components/interpret.py
index 6b10c5e29..178da14e9 100644
--- a/torchx/components/interpret.py
+++ b/torchx/components/interpret.py
@@ -16,6 +16,6 @@
 See the
 :ref:`examples_apps/lightning_classy_vision/interpret:Model Interpretability App Example`
 and the corresponding
-:ref:`Interpret component definition<examples_apps/lightning_classy_vision/component:Trainer Component Examples>`
+:ref:`Interpret component definition<examples_apps/lightning_classy_vision/component:Model Interpret>`
 for an example of how to use Captum with TorchX.
 """
diff --git a/torchx/examples/apps/lightning_classy_vision/component.py b/torchx/examples/apps/lightning_classy_vision/component.py
index 820777ae9..ba886d47d 100644
--- a/torchx/examples/apps/lightning_classy_vision/component.py
+++ b/torchx/examples/apps/lightning_classy_vision/component.py
@@ -46,24 +46,24 @@
 #
 #    torchx run --scheduler local_cwd \
 #    ./torchx/examples/apps/lightning_classy_vision/component.py:trainer \
-#    --output_path /tmp
+#    --output_path /tmp/$USER
 #
 # Single trainer component code:
 
 
 def trainer(
-    output_path: str,
-    image: str = TORCHX_IMAGE,
-    data_path: Optional[str] = None,
-    load_path: str = "",
-    log_path: str = "/tmp/logs",
-    resource: Optional[str] = None,
-    env: Optional[Dict[str, str]] = None,
-    skip_export: bool = False,
-    epochs: int = 1,
-    layers: Optional[List[int]] = None,
-    learning_rate: Optional[float] = None,
-    num_samples: int = 200,
+        output_path: str,
+        image: str = TORCHX_IMAGE,
+        data_path: Optional[str] = None,
+        load_path: str = "",
+        log_path: str = "/tmp/logs",
+        resource: Optional[str] = None,
+        env: Optional[Dict[str, str]] = None,
+        skip_export: bool = False,
+        epochs: int = 1,
+        layers: Optional[List[int]] = None,
+        learning_rate: Optional[float] = None,
+        num_samples: int = 200,
 ) -> torchx.AppDef:
     """Runs the example lightning_classy_vision app.
 
@@ -170,19 +170,19 @@ def trainer(
 
 
 def trainer_dist(
-    output_path: str,
-    image: str = TORCHX_IMAGE,
-    data_path: Optional[str] = None,
-    load_path: str = "",
-    log_path: str = "/tmp/logs",
-    resource: Optional[str] = None,
-    env: Optional[Dict[str, str]] = None,
-    skip_export: bool = False,
-    epochs: int = 1,
-    nnodes: int = 1,
-    nproc_per_node: int = 1,
-    rdzv_backend: str = "etcd",
-    rdzv_endpoint: str = "etcd-server:2379",
+        output_path: str,
+        image: str = TORCHX_IMAGE,
+        data_path: Optional[str] = None,
+        load_path: str = "",
+        log_path: str = "/tmp/logs",
+        resource: Optional[str] = None,
+        env: Optional[Dict[str, str]] = None,
+        skip_export: bool = False,
+        epochs: int = 1,
+        nnodes: int = 1,
+        nproc_per_node: int = 1,
+        rdzv_backend: str = "etcd",
+        rdzv_endpoint: str = "etcd-server:2379",
 ) -> torchx.AppDef:
     """Runs the example lightning_classy_vision app.
 
@@ -258,44 +258,58 @@ def trainer_dist(
 
 
 # %%
-# Model Interpretability
-# #######################
-# TODO(aivanou): add documentation
+# Model Interpret
+# #################
+# Defines interpret component
+#
+# First train a model with the single trainer example (:ref:`examples_apps/lightning_classy_vision/component:Single Trainer Component`),
+# then use the following command to try out the interpret component:
+#
+# .. code:: bash
+#
+#    torchx run --scheduler local_cwd \
+#    ./torchx/examples/apps/lightning_classy_vision/component.py:interpret \
+#    --output_path /tmp/$USER/interpret --load_path /tmp/$USER/last.ckpt
 
 
 def interpret(
-    image: str,
-    load_path: str,
-    data_path: str,
-    output_path: str,
-    resource: Optional[str] = None,
+        load_path: str,
+        output_path: str,
+        data_path: Optional[str] = None,
+        image: str = TORCHX_IMAGE,
+        resource: Optional[str] = None,
 ) -> torchx.AppDef:
     """Runs the model interpretability app on the model outputted by the training
     component.
 
     Args:
-        image: image to run (e.g. foobar:latest)
         load_path: path to load pretrained model from
-        data_path: path to the data to load
         output_path: output path for model checkpoints (e.g. file:///foo/bar)
+        data_path: path to the data to load
+        image: image to run (e.g. foobar:latest)
         resource: the resources to use
     """
+    args = [
+        "-m",
+        "torchx.examples.apps.lightning_classy_vision.interpret",
+        "--load_path",
+        load_path,
+        "--output_path",
+        output_path,
+    ]
+    if data_path:
+        args += [
+            "--data_path",
+            data_path,
+        ]
+
     return torchx.AppDef(
         name="cv-interpret",
         roles=[
             torchx.Role(
                 name="worker",
                 entrypoint="python",
-                args=[
-                    "-m",
-                    "torchx.examples.apps.lightning_classy_vision.interpret",
-                    "--load_path",
-                    load_path,
-                    "--data_path",
-                    data_path,
-                    "--output_path",
-                    output_path,
-                ],
+                args=args,
                 image=image,
                 resource=named_resources[resource]
                 if resource
diff --git a/torchx/examples/apps/lightning_classy_vision/data.py b/torchx/examples/apps/lightning_classy_vision/data.py
index 56ceb9e6a..68aeced43 100644
--- a/torchx/examples/apps/lightning_classy_vision/data.py
+++ b/torchx/examples/apps/lightning_classy_vision/data.py
@@ -148,7 +148,7 @@ def download_data(remote_path: str, tmpdir: str) -> str:
     return data_path
 
 
-def create_random_data(output_path: str, num_images: int = 5) -> None:
+def create_random_data(output_path: str, num_images: int = 250) -> None:
     """
     Fills the given path with randomly generated 64x64 images.
     This can be used for quick testing of the workflow of the model.
diff --git a/torchx/examples/apps/lightning_classy_vision/interpret.py b/torchx/examples/apps/lightning_classy_vision/interpret.py
index 7f709e2d2..84cd653e4 100755
--- a/torchx/examples/apps/lightning_classy_vision/interpret.py
+++ b/torchx/examples/apps/lightning_classy_vision/interpret.py
@@ -35,10 +35,10 @@
 from torchx.examples.apps.lightning_classy_vision.data import (
     TinyImageNetDataModule,
     download_data,
+    create_random_data,
 )
 from torchx.examples.apps.lightning_classy_vision.model import TinyImageNetModel
 
-
 # FIXME: captum must be imported after torch otherwise it causes python to crash
 if True:
     import numpy as np
@@ -57,8 +57,7 @@ def parse_args(argv: List[str]) -> argparse.Namespace:
     parser.add_argument(
         "--data_path",
         type=str,
-        help="path to load the training data from",
-        required=True,
+        help="path to load the training data from, if not provided, random dataset will be created",
     )
     parser.add_argument(
         "--output_path",
@@ -91,7 +90,12 @@ def main(argv: List[str]) -> None:
         model.load_from_checkpoint(checkpoint_path=args.load_path)
 
         # Download and setup the data module
-        data_path = download_data(args.data_path, tmpdir)
+        if not args.data_path:
+            data_path = os.path.join(tmpdir, "data")
+            os.makedirs(data_path)
+            create_random_data(data_path)
+        else:
+            data_path = download_data(args.data_path, tmpdir)
         data = TinyImageNetDataModule(
             data_dir=data_path,
             batch_size=1,
diff --git a/torchx/examples/apps/lightning_classy_vision/test/train_test.py b/torchx/examples/apps/lightning_classy_vision/test/train_test.py
new file mode 100644
index 000000000..ba2e510b3
--- /dev/null
+++ b/torchx/examples/apps/lightning_classy_vision/test/train_test.py
@@ -0,0 +1,29 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+import torchx.examples.apps.lightning_classy_vision.interpret as interpret
+
+
+class ModelTest(unittest.TestCase):
+    def test_basic(self) -> None:
+        model = TinyImageNetModel()
+        self.assertEqual(len(model.seq), 1)
+        out = model(torch.zeros((1, 64, 64)))
+        self.assertIsNotNone(out)
+
+    def test_layer_sizes(self) -> None:
+        model = TinyImageNetModel(
+            layer_sizes=[
+                10,
+                15,
+            ],
+        )
+        self.assertEqual(len(model.seq), 5)
+        out = model(torch.zeros((1, 64, 64)))
+        self.assertIsNotNone(out)
diff --git a/torchx/examples/apps/lightning_classy_vision/train.py b/torchx/examples/apps/lightning_classy_vision/train.py
index 0db67f097..309fe7adf 100755
--- a/torchx/examples/apps/lightning_classy_vision/train.py
+++ b/torchx/examples/apps/lightning_classy_vision/train.py
@@ -56,12 +56,7 @@ def parse_args(argv: List[str]) -> argparse.Namespace:
     parser.add_argument(
         "--batch_size", type=int, default=32, help="batch size to use for training"
     )
-    parser.add_argument("--num_samples", type=int, default=None, help="num_samples")
-    parser.add_argument(
-        "--test",
-        help="Sets to test mode, training on a much smaller set of randomly generated images",
-        action="store_true",
-    )
+    parser.add_argument("--num_samples", type=int, default=10, help="num_samples")
     parser.add_argument(
         "--data_path",
         type=str,
@@ -113,7 +108,7 @@ def main(argv: List[str]) -> None:
         data = TinyImageNetDataModule(
             data_dir=data_path,
             batch_size=args.batch_size,
-            num_samples=5 if args.test else args.num_samples,
+            num_samples=args.num_samples,
         )
 
         # Setup model checkpointing

From 8f798bc0e5400dd34cf898d4c3b08ae39089c3ee Mon Sep 17 00:00:00 2001
From: Aliaksandr Ivanou <aivanou@fb.com>
Date: Thu, 14 Oct 2021 11:40:34 -0700
Subject: [PATCH 02/14] Resolve lint errors

---
 .../apps/lightning_classy_vision/component.py | 60 +++++++++----------
 .../test/train_test.py                        | 29 ---------
 2 files changed, 30 insertions(+), 59 deletions(-)
 delete mode 100644 torchx/examples/apps/lightning_classy_vision/test/train_test.py

diff --git a/torchx/examples/apps/lightning_classy_vision/component.py b/torchx/examples/apps/lightning_classy_vision/component.py
index ba886d47d..4e069e066 100644
--- a/torchx/examples/apps/lightning_classy_vision/component.py
+++ b/torchx/examples/apps/lightning_classy_vision/component.py
@@ -52,18 +52,18 @@
 
 
 def trainer(
-        output_path: str,
-        image: str = TORCHX_IMAGE,
-        data_path: Optional[str] = None,
-        load_path: str = "",
-        log_path: str = "/tmp/logs",
-        resource: Optional[str] = None,
-        env: Optional[Dict[str, str]] = None,
-        skip_export: bool = False,
-        epochs: int = 1,
-        layers: Optional[List[int]] = None,
-        learning_rate: Optional[float] = None,
-        num_samples: int = 200,
+    output_path: str,
+    image: str = TORCHX_IMAGE,
+    data_path: Optional[str] = None,
+    load_path: str = "",
+    log_path: str = "/tmp/logs",
+    resource: Optional[str] = None,
+    env: Optional[Dict[str, str]] = None,
+    skip_export: bool = False,
+    epochs: int = 1,
+    layers: Optional[List[int]] = None,
+    learning_rate: Optional[float] = None,
+    num_samples: int = 200,
 ) -> torchx.AppDef:
     """Runs the example lightning_classy_vision app.
 
@@ -170,19 +170,19 @@ def trainer(
 
 
 def trainer_dist(
-        output_path: str,
-        image: str = TORCHX_IMAGE,
-        data_path: Optional[str] = None,
-        load_path: str = "",
-        log_path: str = "/tmp/logs",
-        resource: Optional[str] = None,
-        env: Optional[Dict[str, str]] = None,
-        skip_export: bool = False,
-        epochs: int = 1,
-        nnodes: int = 1,
-        nproc_per_node: int = 1,
-        rdzv_backend: str = "etcd",
-        rdzv_endpoint: str = "etcd-server:2379",
+    output_path: str,
+    image: str = TORCHX_IMAGE,
+    data_path: Optional[str] = None,
+    load_path: str = "",
+    log_path: str = "/tmp/logs",
+    resource: Optional[str] = None,
+    env: Optional[Dict[str, str]] = None,
+    skip_export: bool = False,
+    epochs: int = 1,
+    nnodes: int = 1,
+    nproc_per_node: int = 1,
+    rdzv_backend: str = "etcd",
+    rdzv_endpoint: str = "etcd-server:2379",
 ) -> torchx.AppDef:
     """Runs the example lightning_classy_vision app.
 
@@ -273,11 +273,11 @@ def trainer_dist(
 
 
 def interpret(
-        load_path: str,
-        output_path: str,
-        data_path: Optional[str] = None,
-        image: str = TORCHX_IMAGE,
-        resource: Optional[str] = None,
+    load_path: str,
+    output_path: str,
+    data_path: Optional[str] = None,
+    image: str = TORCHX_IMAGE,
+    resource: Optional[str] = None,
 ) -> torchx.AppDef:
     """Runs the model interpretability app on the model outputted by the training
     component.
diff --git a/torchx/examples/apps/lightning_classy_vision/test/train_test.py b/torchx/examples/apps/lightning_classy_vision/test/train_test.py
deleted file mode 100644
index ba2e510b3..000000000
--- a/torchx/examples/apps/lightning_classy_vision/test/train_test.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-import unittest
-
-import torch
-import torchx.examples.apps.lightning_classy_vision.interpret as interpret
-
-
-class ModelTest(unittest.TestCase):
-    def test_basic(self) -> None:
-        model = TinyImageNetModel()
-        self.assertEqual(len(model.seq), 1)
-        out = model(torch.zeros((1, 64, 64)))
-        self.assertIsNotNone(out)
-
-    def test_layer_sizes(self) -> None:
-        model = TinyImageNetModel(
-            layer_sizes=[
-                10,
-                15,
-            ],
-        )
-        self.assertEqual(len(model.seq), 5)
-        out = model(torch.zeros((1, 64, 64)))
-        self.assertIsNotNone(out)

From 873f5092310f86cf8f0f1a24e9e80aa8d15833fa Mon Sep 17 00:00:00 2001
From: Kiuk Chung <kiuk@fb.com>
Date: Thu, 14 Oct 2021 12:13:10 -0700
Subject: [PATCH 03/14] (torchx/runner) standardize class and param naming
 around runopts and runcfg (#252)

Summary:
Pull Request resolved: https://github.com/pytorch/torchx/pull/252

closes: https://github.com/pytorch/torchx/issues/250

I've done a few things on this diff:

1. Renamed `torchx.specs.api.Runopt` to `torchx.specs.runopt` (for consistency with `runopts`)
2. Renamed variables (where I could) `runcfg` to `cfg` (to be consistent with the scheduler and runner apis)
3. Renamed the config section to `[$profile.$sched.cfg]`  instead of `[$profile.scheduler_args.$sched]`
4. Changed the torchx run cli's `-a` (short for `--scheduler_args`) to `-cfg` for consistency with the rest of the system.

Reviewed By: aivanou

Differential Revision: D31656766

fbshipit-source-id: 8c009852d5807010ac4cd33902b294cff4bd0ec1
---
 torchx/cli/cmd_run.py             |  14 ++---
 torchx/runner/api.py              |   2 +-
 torchx/runner/config.py           |  52 ++++++++++------
 torchx/runner/test/config_test.py | 100 +++++++++++++++---------------
 torchx/specs/__init__.py          |  38 ++++++------
 torchx/specs/api.py               |  21 +++----
 6 files changed, 119 insertions(+), 108 deletions(-)

diff --git a/torchx/cli/cmd_run.py b/torchx/cli/cmd_run.py
index c72e10f61..1d1189db2 100644
--- a/torchx/cli/cmd_run.py
+++ b/torchx/cli/cmd_run.py
@@ -10,18 +10,18 @@
 import sys
 from dataclasses import asdict
 from pprint import pformat
-from typing import Dict, List, cast, Type, Optional
+from typing import Dict, List, Optional, Type, cast
 
 import torchx.specs as specs
 from pyre_extensions import none_throws
 from torchx.cli.cmd_base import SubCommand
 from torchx.runner import Runner, get_runner
-from torchx.schedulers import get_scheduler_factories, get_default_scheduler_name
+from torchx.schedulers import get_default_scheduler_name, get_scheduler_factories
 from torchx.specs.finder import (
+    ComponentNotFoundException,
+    ComponentValidationException,
     _Component,
     get_components,
-    ComponentValidationException,
-    ComponentNotFoundException,
 )
 from torchx.util.types import to_dict
 
@@ -41,13 +41,13 @@ def _convert_to_option_type(
         return option_type(value)
 
 
-def _parse_run_config(arg: str, scheduler_run_opts: specs.runopts) -> specs.RunConfig:
+def _parse_run_config(arg: str, scheduler_opts: specs.runopts) -> specs.RunConfig:
     conf = specs.RunConfig()
     if not arg:
         return conf
 
     for key, value in to_dict(arg).items():
-        option = scheduler_run_opts.get(key)
+        option = scheduler_opts.get(key)
         if option is None:
             raise ValueError(f"Unknown {key}, run `torchx runopts` for more info")
         option_type = option.opt_type
@@ -86,7 +86,7 @@ def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
             default=get_default_scheduler_name(),
         )
         subparser.add_argument(
-            "-a",
+            "-cfg",
             "--scheduler_args",
             type=str,
             help="Arguments to pass to the scheduler (Ex:`cluster=foo,user=bar`)."
diff --git a/torchx/runner/api.py b/torchx/runner/api.py
index 89fe009bb..4da92d365 100644
--- a/torchx/runner/api.py
+++ b/torchx/runner/api.py
@@ -263,7 +263,7 @@ def dryrun(
 
         cfg = cfg or RunConfig()
         # TODO enable profiles - https://github.com/pytorch/torchx/issues/248
-        config.apply(profile="default", scheduler=scheduler, runcfg=cfg)
+        config.apply(scheduler=scheduler, cfg=cfg, profile="default")
 
         sched = self._scheduler(scheduler)
         sched._validate(app, scheduler)
diff --git a/torchx/runner/config.py b/torchx/runner/config.py
index 40022e4d8..d68613555 100644
--- a/torchx/runner/config.py
+++ b/torchx/runner/config.py
@@ -12,6 +12,7 @@
 
 from torchx.schedulers import Scheduler, get_schedulers
 from torchx.specs import RunConfig, get_type_name
+from torchx.specs.api import runopt
 
 
 _NONE = "None"
@@ -47,22 +48,36 @@ def _get_scheduler(name: str) -> Scheduler:
     return sched
 
 
+def _fixme_placeholder(runopt: runopt, max_len: int = 60) -> str:
+    ph = f"#FIXME:({get_type_name(runopt.opt_type)}) {runopt.help}"
+    return ph if len(ph) <= max_len else f"{ph[:max_len]}..."
+
+
 def dump(
     f: TextIO, schedulers: Optional[List[str]] = None, required_only: bool = False
 ) -> None:
     """
-    Dumps a default INI-style config template containing the runopts for the
-    given scheduler names into ``f``. If no ``schedulers`` are specified
-    dumps all known registered schedulers.
+    Dumps a default INI-style config template containing the :py:class:`torchx.specs.runopts` for the
+    given scheduler names into the file-like object specified by ``f``.
+    If no ``schedulers`` are specified dumps all known registered schedulers.
 
     Optional runopts are pre-filled  with their default values.
-    Required runopts are set with a ``<FIXME_...>`` placeholder.
+    Required runopts are set with a ``#FIXME:(<type>) <help text>`` placeholder.
+    To only dump required runopts pass ``required_only=True``.
+
     Each scheduler's runopts are written in the section called
-    ``[default.scheduler_args.{scheduler_name}]`` (e.g. ``[default.scheduler_args.kubernetes]``)
+    ``[default.{scheduler_name}.cfg]``.
 
-    To only dump required runopts pass ``required_only=True``.
+    For example:
+
+    ::
 
-    Raises a ``ValueError`` if given a scheduler name that is not known
+     [default.kubernetes.cfg]
+     namespace = default
+     queue = #FIXME:(str) Volcano queue to schedule job in
+
+    Raises:
+        ``ValueError`` - if given a scheduler name that is not known
     """
 
     if schedulers:
@@ -74,12 +89,12 @@ def dump(
     for sched_name in scheds:
         sched = _get_scheduler(sched_name)
 
-        section = f"default.scheduler_args.{sched_name}"
+        section = f"default.{sched_name}.cfg"
         config.add_section(section)
 
         for opt_name, opt in sched.run_opts():
             if opt.is_required:
-                val = f"<FIXME_WITH_A_{get_type_name(opt.opt_type)}_VALUE>"
+                val = _fixme_placeholder(opt)
             else:  # not required runopts MUST have a default
                 if required_only:
                     continue
@@ -96,11 +111,10 @@ def dump(
                     val = f"{opt.default}"
 
             config.set(section, opt_name, val)
-
     config.write(f, space_around_delimiters=True)
 
 
-def apply(profile: str, scheduler: str, runcfg: RunConfig) -> None:
+def apply(scheduler: str, cfg: RunConfig, profile: str = "default") -> None:
     """
     Loads .torchxconfig files from predefined locations according
     to a load hierarchy and applies the loaded configs into the
@@ -121,10 +135,10 @@ def apply(profile: str, scheduler: str, runcfg: RunConfig) -> None:
         if configfile.exists():
             log.info(f"loading configs from {configfile}")
             with open(str(configfile), "r") as f:
-                load(profile, scheduler, f, runcfg)
+                load(scheduler, f, cfg, profile)
 
 
-def load(profile: str, scheduler: str, f: TextIO, runcfg: RunConfig) -> None:
+def load(scheduler: str, f: TextIO, cfg: RunConfig, profile: str = "default") -> None:
     """
     loads the section ``[{profile}.scheduler_args.{scheduler}]`` from the given
     configfile ``f`` (in .INI format) into the provided ``runcfg``, only adding
@@ -137,17 +151,17 @@ def load(profile: str, scheduler: str, f: TextIO, runcfg: RunConfig) -> None:
 
     runopts = _get_scheduler(scheduler).run_opts()
 
-    section = f"{profile}.scheduler_args.{scheduler}"
+    section = f"{profile}.{scheduler}.cfg"
     if config.has_section(section):
         for name, value in config.items(section):
-            if name in runcfg.cfgs:
+            if name in cfg.cfgs:
                 # DO NOT OVERRIDE existing configs
                 continue
 
             if value == _NONE:
                 # should map to None (not str 'None')
                 # this also handles empty or None lists
-                runcfg.set(name, None)
+                cfg.set(name, None)
             else:
                 runopt = runopts.get(name)
 
@@ -161,9 +175,9 @@ def load(profile: str, scheduler: str, f: TextIO, runcfg: RunConfig) -> None:
                     if runopt.opt_type is bool:
                         # need to handle bool specially since str -> bool is based on
                         # str emptiness not value (e.g. bool("False") == True)
-                        runcfg.set(name, config.getboolean(section, name))
+                        cfg.set(name, config.getboolean(section, name))
                     elif runopt.opt_type is List[str]:
-                        runcfg.set(name, value.split(";"))
+                        cfg.set(name, value.split(";"))
                     else:
                         # pyre-ignore[29]
-                        runcfg.set(name, runopt.opt_type(value))
+                        cfg.set(name, runopt.opt_type(value))
diff --git a/torchx/runner/test/config_test.py b/torchx/runner/test/config_test.py
index b936366f2..ad34ba0cb 100644
--- a/torchx/runner/test/config_test.py
+++ b/torchx/runner/test/config_test.py
@@ -102,30 +102,30 @@ def run_opts(self) -> runopts:
         return opts
 
 
-_CONFIG = """[default.scheduler_args.local_cwd]
+_CONFIG = """[default.local_cwd.cfg]
 log_dir = /home/bob/logs
 prepend_cwd = True
 
-[test.scheduler_args.local_cwd]
+[test.local_cwd.cfg]
 log_dir = None
 prepend_cwd = False
 
-[alpha.scheduler_args.local_cwd]
+[alpha.local_cwd.cfg]
 log_dir = /tmp/logs
 """
 
-_CONFIG_INVALID = """[default.scheduler_args.test]
+_CONFIG_INVALID = """[default.test.cfg]
 a_run_opt_that = does_not_exist
 s = option_that_exists
 """
 
-_TEAM_CONFIG = """[default.scheduler_args.test]
+_TEAM_CONFIG = """[default.test.cfg]
 s = team_default
 i = 50
 f = 1.2
 """
 
-_MY_CONFIG = """[default.scheduler_args.test]
+_MY_CONFIG = """[default.test.cfg]
 s = my_default
 i = 100
 """
@@ -158,32 +158,30 @@ def _write(self, filename: str, content: str) -> Path:
         return f
 
     def test_load(self) -> None:
-        runcfg = RunConfig()
-        load(
-            profile="default", scheduler="local_cwd", f=StringIO(_CONFIG), runcfg=runcfg
-        )
-        self.assertEqual("/home/bob/logs", runcfg.get("log_dir"))
-        self.assertEqual(True, runcfg.get("prepend_cwd"))
+        cfg = RunConfig()
+        load(profile="default", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg)
+        self.assertEqual("/home/bob/logs", cfg.get("log_dir"))
+        self.assertEqual(True, cfg.get("prepend_cwd"))
 
-        runcfg = RunConfig()
-        load(profile="test", scheduler="local_cwd", f=StringIO(_CONFIG), runcfg=runcfg)
-        self.assertEqual(None, runcfg.get("log_dir"))
-        self.assertEqual(False, runcfg.get("prepend_cwd"))
+        cfg = RunConfig()
+        load(profile="test", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg)
+        self.assertEqual(None, cfg.get("log_dir"))
+        self.assertEqual(False, cfg.get("prepend_cwd"))
 
-        runcfg = RunConfig()
-        load(profile="alpha", scheduler="local_cwd", f=StringIO(_CONFIG), runcfg=runcfg)
-        self.assertEqual("/tmp/logs", runcfg.get("log_dir"))
-        self.assertEqual(None, runcfg.get("prepend_cwd"))
+        cfg = RunConfig()
+        load(profile="alpha", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg)
+        self.assertEqual("/tmp/logs", cfg.get("log_dir"))
+        self.assertEqual(None, cfg.get("prepend_cwd"))
 
     def test_no_override_load(self) -> None:
-        runcfg = RunConfig()
-        runcfg.set("log_dir", "/foo/bar")
-        runcfg.set("debug", 1)
+        cfg = RunConfig()
+        cfg.set("log_dir", "/foo/bar")
+        cfg.set("debug", 1)
 
-        load(profile="test", scheduler="local_cwd", f=StringIO(_CONFIG), runcfg=runcfg)
-        self.assertEqual("/foo/bar", runcfg.get("log_dir"))
-        self.assertEqual(1, runcfg.get("debug"))
-        self.assertEqual(False, runcfg.get("prepend_cwd"))
+        load(profile="test", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg)
+        self.assertEqual("/foo/bar", cfg.get("log_dir"))
+        self.assertEqual(1, cfg.get("debug"))
+        self.assertEqual(False, cfg.get("prepend_cwd"))
 
     @patch(
         TORCHX_GET_SCHEDULERS,
@@ -192,14 +190,14 @@ def test_no_override_load(self) -> None:
     def test_apply(self, _) -> None:
         with patch(PATH_CWD, return_value=Path(self.test_dir)):
             with patch(PATH_HOME, return_value=Path(self.test_dir) / "home" / "bob"):
-                runcfg = RunConfig()
-                runcfg.set("s", "runtime_value")
+                cfg = RunConfig()
+                cfg.set("s", "runtime_value")
 
-                apply(profile="default", scheduler="test", runcfg=runcfg)
+                apply(profile="default", scheduler="test", cfg=cfg)
 
-                self.assertEqual("runtime_value", runcfg.get("s"))
-                self.assertEqual(100, runcfg.get("i"))
-                self.assertEqual(1.2, runcfg.get("f"))
+                self.assertEqual("runtime_value", cfg.get("s"))
+                self.assertEqual(100, cfg.get("i"))
+                self.assertEqual(1.2, cfg.get("f"))
 
     def test_dump_invalid_scheduler(self) -> None:
         with self.assertRaises(ValueError):
@@ -215,50 +213,50 @@ def test_dump_only_required(self, _) -> None:
         # test scheduler has no required options hence expect empty string
         dump(f=sfile, required_only=True)
 
-        runcfg = RunConfig()
+        cfg = RunConfig()
         sfile.seek(0)
-        load(profile="default", scheduler="test", f=sfile, runcfg=runcfg)
+        load(profile="default", scheduler="test", f=sfile, cfg=cfg)
 
-        self.assertFalse(runcfg.cfgs)
+        self.assertFalse(cfg.cfgs)
 
     @patch(
         TORCHX_GET_SCHEDULERS,
         return_value={"test": TestScheduler()},
     )
     def test_load_invalid_runopt(self, _) -> None:
-        runcfg = RunConfig()
+        cfg = RunConfig()
         load(
             profile="default",
             scheduler="test",
             f=StringIO(_CONFIG_INVALID),
-            runcfg=runcfg,
+            cfg=cfg,
         )
         # options in the config file but not in runopts
         # should be ignored (we shouldn't throw an error since
         # this makes things super hard to guarantee BC - stale config file will fail
         # to run, we don't want that)
 
-        self.assertEquals("option_that_exists", runcfg.get("s"))
+        self.assertEquals("option_that_exists", cfg.get("s"))
 
     def test_load_no_section(self) -> None:
-        runcfg = RunConfig()
+        cfg = RunConfig()
         load(
             profile="default",
             scheduler="local_cwd",
             f=StringIO(),
-            runcfg=runcfg,
+            cfg=cfg,
         )
         # is empty
-        self.assertFalse(runcfg.cfgs)
+        self.assertFalse(cfg.cfgs)
 
         load(
             profile="default",
             scheduler="local_cwd",
             f=StringIO("[default.scheduler_args.local_cwd]\n"),
-            runcfg=runcfg,
+            cfg=cfg,
         )
         # still empty
-        self.assertFalse(runcfg.cfgs)
+        self.assertFalse(cfg.cfgs)
 
     @patch(
         TORCHX_GET_SCHEDULERS,
@@ -270,12 +268,12 @@ def test_dump_and_load_all_runopt_types(self, _) -> None:
 
         sfile.seek(0)
 
-        runcfg = RunConfig()
-        load(profile="default", scheduler="test", f=sfile, runcfg=runcfg)
+        cfg = RunConfig()
+        load(profile="default", scheduler="test", f=sfile, cfg=cfg)
 
         # all runopts in the TestScheduler have defaults, just check against those
         for opt_name, opt in TestScheduler().run_opts():
-            self.assertEqual(runcfg.get(opt_name), opt.default)
+            self.assertEqual(cfg.get(opt_name), opt.default)
 
     def test_dump_and_load_all_registered_schedulers(self) -> None:
         # dump all the runopts for all registered schedulers
@@ -284,11 +282,11 @@ def test_dump_and_load_all_registered_schedulers(self) -> None:
 
         sfile = StringIO()
         dump(sfile)
-
+        print(sfile.getvalue())
         for sched_name, sched in get_schedulers(session_name="_").items():
             sfile.seek(0)  # reset the file pos
-            runcfg = RunConfig()
-            load(profile="default", scheduler=sched_name, f=sfile, runcfg=runcfg)
+            cfg = RunConfig()
+            load(profile="default", scheduler=sched_name, f=sfile, cfg=cfg)
 
             for opt_name, _ in sched.run_opts():
-                self.assertTrue(opt_name in runcfg.cfgs)
+                self.assertTrue(opt_name in cfg.cfgs)
diff --git a/torchx/specs/__init__.py b/torchx/specs/__init__.py
index f061eb4d8..f2e11b71c 100644
--- a/torchx/specs/__init__.py
+++ b/torchx/specs/__init__.py
@@ -11,38 +11,40 @@
 from torchx.util.entrypoints import load_group
 
 from .api import (  # noqa: F401 F403
-    SchedulerBackend,
-    Resource,
-    NULL_RESOURCE,
     ALL,
     MISSING,
     NONE,
-    macros,
-    RetryPolicy,
-    Role,
+    NULL_RESOURCE,
     AppDef,
+    AppDryRunInfo,
+    AppHandle,
     AppState,
-    is_terminal,
-    ReplicaStatus,
-    ReplicaState,
-    RoleStatus,
     AppStatus,
     ConfigValue,
-    RunConfig,
-    AppDryRunInfo,
-    get_type_name,
-    runopts,
     InvalidRunConfigException,
     MalformedAppHandleException,
-    UnknownSchedulerException,
-    AppHandle,
+    ReplicaState,
+    ReplicaStatus,
+    Resource,
+    RetryPolicy,
+    Role,
+    RoleStatus,
+    RunConfig,
+    SchedulerBackend,
     UnknownAppException,
+    UnknownSchedulerException,
+    from_function,
+    get_argparse_param_type,
+    get_type_name,
+    is_terminal,
+    macros,
     make_app_handle,
     parse_app_handle,
-    get_argparse_param_type,
-    from_function,
+    runopt,
+    runopts,
 )
 
+
 GiB: int = 1024
 
 
diff --git a/torchx/specs/api.py b/torchx/specs/api.py
index cb9bf354f..d068ae010 100644
--- a/torchx/specs/api.py
+++ b/torchx/specs/api.py
@@ -509,7 +509,7 @@ def get_type_name(tp: Type[ConfigValue]) -> str:
 
 
 @dataclass
-class Runopt:
+class runopt:
     """
     Represents the metadata about the specific run option
     """
@@ -554,9 +554,9 @@ class runopts:
     """
 
     def __init__(self) -> None:
-        self._opts: Dict[str, Runopt] = {}
+        self._opts: Dict[str, runopt] = {}
 
-    def __iter__(self) -> Iterator[Tuple[str, Runopt]]:
+    def __iter__(self) -> Iterator[Tuple[str, runopt]]:
         return self._opts.items().__iter__()
 
     @staticmethod
@@ -573,7 +573,7 @@ def is_type(obj: ConfigValue, tp: Type[ConfigValue]) -> bool:
             else:
                 return False
 
-    def get(self, name: str) -> Optional[Runopt]:
+    def get(self, name: str) -> Optional[runopt]:
         """
         Returns option if any was registered, or None otherwise
         """
@@ -603,7 +603,7 @@ def add(
                     f" Given: {default} ({type(default).__name__})"
                 )
 
-        self._opts[cfg_key] = Runopt(default, type_, required, help)
+        self._opts[cfg_key] = runopt(default, type_, required, help)
 
     def resolve(self, config: RunConfig) -> RunConfig:
         """
@@ -623,9 +623,8 @@ def resolve(self, config: RunConfig) -> RunConfig:
             # check required opt
             if runopt.is_required and val is None:
                 raise InvalidRunConfigException(
-                    f"Required run option: {cfg_key}, must be provided and not None",
+                    f"Required run option: {cfg_key}, must be provided and not `None`",
                     config,
-                    self,
                 )
 
             # check type (None matches all types)
@@ -634,7 +633,6 @@ def resolve(self, config: RunConfig) -> RunConfig:
                     f"Run option: {cfg_key}, must be of type: {get_type_name(runopt.opt_type)},"
                     f" but was: {val} ({type(val).__name__})",
                     config,
-                    self,
                 )
 
             # not required and not set, set to default
@@ -678,10 +676,9 @@ class InvalidRunConfigException(Exception):
     type mismatch.
     """
 
-    def __init__(
-        self, invalid_reason: str, run_config: RunConfig, runopts: "runopts"
-    ) -> None:
-        super().__init__(f"{invalid_reason}. Given: {run_config}, Expected: {runopts}")
+    def __init__(self, invalid_reason: str, cfg: RunConfig) -> None:
+        given = str(cfg) if cfg.cfgs else "<EMPTY>"
+        super().__init__(f"{invalid_reason}. Given: {given}")
 
 
 class MalformedAppHandleException(Exception):

From 95ea9f53e29e399522cd18f12e4bcf81781ea82e Mon Sep 17 00:00:00 2001
From: Kiuk Chung <kiuk@fb.com>
Date: Thu, 14 Oct 2021 16:05:58 -0700
Subject: [PATCH 04/14] (torchx/cli) fix misaligned log msgs for torchx run

Summary: closes: https://github.com/pytorch/torchx/issues/251

Reviewed By: d4l3k

Differential Revision: D31659813

fbshipit-source-id: 150ea152339adf84d19f1eb8b4a2e083901705ab
---
 torchx/cli/cmd_run.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/torchx/cli/cmd_run.py b/torchx/cli/cmd_run.py
index 1d1189db2..1a800e0a4 100644
--- a/torchx/cli/cmd_run.py
+++ b/torchx/cli/cmd_run.py
@@ -66,9 +66,9 @@ def _builtins(self) -> Dict[str, _Component]:
     def run(self, args: argparse.Namespace) -> None:
         builtin_components = self._builtins()
         num_builtins = len(builtin_components)
-        logger.info(f"Found {num_builtins} builtin configs:")
+        print(f"Found {num_builtins} builtin configs:")
         for i, component in enumerate(builtin_components.values()):
-            logger.info(f" {i + 1:2d}. {component.name}")
+            print(f" {i + 1:2d}. {component.name}")
 
 
 class CmdRun(SubCommand):
@@ -139,11 +139,12 @@ def _run(self, runner: Runner, args: argparse.Namespace) -> Optional[str]:
 
         if args.dryrun:
             app_dryrun_info = cast(specs.AppDryRunInfo, result)
-            logger.info("=== APPLICATION ===")
-            logger.info(pformat(asdict(app_dryrun_info._app), indent=2, width=80))
+            logger.info(
+                "\n=== APPLICATION ===\n"
+                f"{pformat(asdict(app_dryrun_info._app), indent=2, width=80)}"
+            )
 
-            logger.info("=== SCHEDULER REQUEST ===")
-            logger.info(app_dryrun_info)
+            logger.info("\n=== SCHEDULER REQUEST ===\n" f"{app_dryrun_info}")
             return
         else:
             app_handle = cast(specs.AppHandle, result)
@@ -153,7 +154,6 @@ def _run(self, runner: Runner, args: argparse.Namespace) -> Optional[str]:
             if args.scheduler.startswith("local"):
                 self._wait_and_exit(runner, app_handle)
             else:
-                logger.info("=== RUN RESULT ===")
                 logger.info(f"Launched app: {app_handle}")
                 status = runner.status(app_handle)
                 logger.info(status)

From 6f91834b597f379d04b725570c69c7c0f51ee819 Mon Sep 17 00:00:00 2001
From: Aliaksandr Ivanou <aivanou@fb.com>
Date: Thu, 14 Oct 2021 18:40:55 -0700
Subject: [PATCH 05/14] Addressed comments

---
 torchx/components/interpret.py                            | 2 +-
 torchx/examples/apps/lightning_classy_vision/component.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/torchx/components/interpret.py b/torchx/components/interpret.py
index 178da14e9..030185457 100644
--- a/torchx/components/interpret.py
+++ b/torchx/components/interpret.py
@@ -16,6 +16,6 @@
 See the
 :ref:`examples_apps/lightning_classy_vision/interpret:Model Interpretability App Example`
 and the corresponding
-:ref:`Interpret component definition<examples_apps/lightning_classy_vision/component:Model Interpret>`
+:ref:`Interpret component definition<examples_apps/lightning_classy_vision/component:Interpreting the Model>`
 for an example of how to use Captum with TorchX.
 """
diff --git a/torchx/examples/apps/lightning_classy_vision/component.py b/torchx/examples/apps/lightning_classy_vision/component.py
index 4e069e066..300f63e10 100644
--- a/torchx/examples/apps/lightning_classy_vision/component.py
+++ b/torchx/examples/apps/lightning_classy_vision/component.py
@@ -258,9 +258,9 @@ def trainer_dist(
 
 
 # %%
-# Model Interpret
-# #################
-# Defines interpret component
+# Interpreting the Model
+# #######################
+# Defines a component that interprets the model
 #
 # Train a single trainer example: :ref:`examples_apps/lightning_classy_vision/component:Single Trainer Component`
 # And use the following cmd to try out:

From 71958727d36c51fd65d9b0d790b47e3a97ff964a Mon Sep 17 00:00:00 2001
From: Tristan Rice <rice@fn.lc>
Date: Thu, 14 Oct 2021 19:24:42 -0700
Subject: [PATCH 06/14] CI: use OIDC (#256)

Summary:
This switches our integration tests to use the GitHub OpenID Connect credentials provider instead of using hard coded AWS session tokens. This will issue tokens that last for 1 hour so should be a lot more secure (and trackable) than before.

https://awsteele.com/blog/2021/09/15/aws-federation-comes-to-github-actions.html

Pull Request resolved: https://github.com/pytorch/torchx/pull/256

Test Plan:
CI

created PR from external repo to verify they can't generate tokens https://github.com/pytorch/torchx/pull/257

Reviewed By: kiukchung

Differential Revision: D31674489

Pulled By: d4l3k

fbshipit-source-id: 5936c64794816eb9fafe76899af44e2f865c64df
---
 .../components-integration-tests.yaml         | 29 +++++++++++------
 .github/workflows/kfp-integration-tests.yaml  | 29 +++++++++++------
 ...bernetes-dist-train-integration-tests.yaml | 31 ++++++++++++-------
 3 files changed, 58 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/components-integration-tests.yaml b/.github/workflows/components-integration-tests.yaml
index 006812636..8ee7b8e7d 100644
--- a/.github/workflows/components-integration-tests.yaml
+++ b/.github/workflows/components-integration-tests.yaml
@@ -9,6 +9,9 @@ on:
 jobs:
   components-launch:
     runs-on: ubuntu-18.04
+    permissions:
+      id-token: write
+      contents: read
     steps:
       - name: Setup Python
         uses: actions/setup-python@v2
@@ -17,22 +20,30 @@ jobs:
           architecture: x64
       - name: Checkout TorchX
         uses: actions/checkout@v2
-      - name: Configure Kube Config
+      - name: Configure AWS
         env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }}
+        run: |
+          if [ -n "$AWS_ROLE_ARN" ]; then
+            export AWS_WEB_IDENTITY_TOKEN_FILE=/tmp/awscreds
+            export AWS_DEFAULT_REGION=us-west-2
+
+            echo AWS_WEB_IDENTITY_TOKEN_FILE=$AWS_WEB_IDENTITY_TOKEN_FILE >> $GITHUB_ENV
+            echo AWS_ROLE_ARN=$AWS_ROLE_ARN >> $GITHUB_ENV
+            echo AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION >> $GITHUB_ENV
+
+            curl -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" "$ACTIONS_ID_TOKEN_REQUEST_URL" | jq -r '.value' > $AWS_WEB_IDENTITY_TOKEN_FILE
+          fi
+      - name: Configure Kube Config
         run: |
           set -eux
-          if [ -n "$AWS_ACCESS_KEY_ID" ]; then
+          if [ -n "$AWS_ROLE_ARN" ]; then
             aws eks update-kubeconfig --region=us-west-2 --name=${{ secrets.EKS_CLUSTER_NAME }}
           fi
       - name: Configure Docker
-        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
         run: |
           set -eux
-          if [ -n "$AWS_ACCESS_KEY_ID" ]; then
+          if [ -n "$AWS_ROLE_ARN" ]; then
             aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 495572122715.dkr.ecr.us-west-2.amazonaws.com
           fi
       - name: Install dependencies
@@ -42,8 +53,6 @@ jobs:
           pip install -e .[kubernetes]
       - name: Run Components Integration Tests
         env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           INTEGRATION_TEST_STORAGE: ${{ secrets.INTEGRATION_TEST_STORAGE }}
           CONTAINER_REPO: ${{ secrets.CONTAINER_REPO }}
         run: scripts/component_integration_tests.py
diff --git a/.github/workflows/kfp-integration-tests.yaml b/.github/workflows/kfp-integration-tests.yaml
index 53591852e..c1f386f7f 100644
--- a/.github/workflows/kfp-integration-tests.yaml
+++ b/.github/workflows/kfp-integration-tests.yaml
@@ -9,6 +9,9 @@ on:
 jobs:
   kfp-launch:
     runs-on: ubuntu-18.04
+    permissions:
+      id-token: write
+      contents: read
     steps:
       - name: Install kubectl
         # More info: https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/
@@ -18,13 +21,24 @@ jobs:
           mkdir -p ~/.local/bin/kubectl
           mv ./kubectl ~/.local/bin/kubectl
           export PATH=$PATH:~/.local/bin/kubectl
-      - name: Configure Kube Config
+      - name: Configure AWS
         env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }}
+        run: |
+          if [ -n "$AWS_ROLE_ARN" ]; then
+            export AWS_WEB_IDENTITY_TOKEN_FILE=/tmp/awscreds
+            export AWS_DEFAULT_REGION=us-west-2
+
+            echo AWS_WEB_IDENTITY_TOKEN_FILE=$AWS_WEB_IDENTITY_TOKEN_FILE >> $GITHUB_ENV
+            echo AWS_ROLE_ARN=$AWS_ROLE_ARN >> $GITHUB_ENV
+            echo AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION >> $GITHUB_ENV
+
+            curl -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" "$ACTIONS_ID_TOKEN_REQUEST_URL" | jq -r '.value' > $AWS_WEB_IDENTITY_TOKEN_FILE
+          fi
+      - name: Configure Kube Config
         run: |
           set -eux
-          if [ -n "$AWS_ACCESS_KEY_ID" ]; then
+          if [ -n "$AWS_ROLE_ARN" ]; then
             aws eks update-kubeconfig --region=us-west-2 --name=${{ secrets.EKS_CLUSTER_NAME }}
           fi
       - name: Setup Python
@@ -35,12 +49,9 @@ jobs:
       - name: Checkout TorchX
         uses: actions/checkout@v2
       - name: Configure Docker
-        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
         run: |
           set -eux
-          if [ -n "$AWS_ACCESS_KEY_ID" ]; then
+          if [ -n "$AWS_ROLE_ARN" ]; then
             aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 495572122715.dkr.ecr.us-west-2.amazonaws.com
           fi
       - name: Install dependencies
@@ -50,8 +61,6 @@ jobs:
           python setup.py install
       - name: Run KFP Integration Tests
         env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           KFP_NAMESPACE: ${{ secrets.KFP_NAMESPACE }}
           INTEGRATION_TEST_STORAGE: ${{ secrets.INTEGRATION_TEST_STORAGE }}
           CONTAINER_REPO: ${{ secrets.CONTAINER_REPO }}
diff --git a/.github/workflows/kubernetes-dist-train-integration-tests.yaml b/.github/workflows/kubernetes-dist-train-integration-tests.yaml
index 8088d5baa..87b100da8 100644
--- a/.github/workflows/kubernetes-dist-train-integration-tests.yaml
+++ b/.github/workflows/kubernetes-dist-train-integration-tests.yaml
@@ -9,6 +9,9 @@ on:
 jobs:
   kubernetes-launch:
     runs-on: ubuntu-18.04
+    permissions:
+      id-token: write
+      contents: read
     steps:
       - name: Setup Python
         uses: actions/setup-python@v2
@@ -17,22 +20,30 @@ jobs:
           architecture: x64
       - name: Checkout TorchX
         uses: actions/checkout@v2
-      - name: Configure Kube Config
+      - name: Configure AWS
         env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }}
+        run: |
+          if [ -n "$AWS_ROLE_ARN" ]; then
+            export AWS_WEB_IDENTITY_TOKEN_FILE=/tmp/awscreds
+            export AWS_DEFAULT_REGION=us-west-2
+
+            echo AWS_WEB_IDENTITY_TOKEN_FILE=$AWS_WEB_IDENTITY_TOKEN_FILE >> $GITHUB_ENV
+            echo AWS_ROLE_ARN=$AWS_ROLE_ARN >> $GITHUB_ENV
+            echo AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION >> $GITHUB_ENV
+
+            curl -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" "$ACTIONS_ID_TOKEN_REQUEST_URL" | jq -r '.value' > $AWS_WEB_IDENTITY_TOKEN_FILE
+          fi
+      - name: Configure Kube Config
         run: |
           set -eux
-          if [ -n "$AWS_ACCESS_KEY_ID" ]; then
+          if [ -n "$AWS_ROLE_ARN" ]; then
             aws eks update-kubeconfig --region=us-west-2 --name=${{ secrets.EKS_CLUSTER_NAME }}
           fi
       - name: Configure Docker
-        env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
         run: |
           set -eux
-          if [ -n "$AWS_ACCESS_KEY_ID" ]; then
+          if [ -n "$AWS_ROLE_ARN" ]; then
             aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 495572122715.dkr.ecr.us-west-2.amazonaws.com
           fi
       - name: Install dependencies
@@ -41,12 +52,10 @@ jobs:
           pip install -e .[kubernetes]
       - name: Run Kubernetes Integration Tests
         env:
-          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           INTEGRATION_TEST_STORAGE: ${{ secrets.INTEGRATION_TEST_STORAGE }}
           CONTAINER_REPO: ${{ secrets.CONTAINER_REPO }}
         run: |
-          if [ -z "$AWS_ACCESS_KEY_ID" ]; then
+          if [ -z "$AWS_ROLE_ARN" ]; then
             # only dryrun if no secrets
             ARGS="--dryrun"
           else

From 37311e47ef5f2c9ce956e224e9b568c5cf2912d2 Mon Sep 17 00:00:00 2001
From: Aliaksandr Ivanou <aivanou@fb.com>
Date: Thu, 14 Oct 2021 19:30:50 -0700
Subject: [PATCH 07/14] Make `output_path` optional

Summary: Since we removed distributed sum, we need to use this example to run fb internal tests. For internal tests, we don't need the `output_path`, which introduces roughly 200 MB of data on each run

Reviewed By: kiukchung

Differential Revision: D31661378

fbshipit-source-id: 098bf9f5be9302e7d8cced672ba9cf7eaf8b32e6
---
 .../apps/lightning_classy_vision/train.py     | 24 ++++++++++++-------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/torchx/examples/apps/lightning_classy_vision/train.py b/torchx/examples/apps/lightning_classy_vision/train.py
index 0db67f097..57bfade7c 100755
--- a/torchx/examples/apps/lightning_classy_vision/train.py
+++ b/torchx/examples/apps/lightning_classy_vision/train.py
@@ -21,7 +21,7 @@
 import os
 import sys
 import tempfile
-from typing import List
+from typing import List, Optional
 
 import pytorch_lightning as pl
 import torch
@@ -72,8 +72,7 @@ def parse_args(argv: List[str]) -> argparse.Namespace:
     parser.add_argument(
         "--output_path",
         type=str,
-        help="path to place checkpoints and model outputs",
-        required=True,
+        help="path to place checkpoints and model outputs, if not specified, checkpoints are not saved",
     )
     parser.add_argument(
         "--log_path",
@@ -94,6 +93,16 @@ def get_gpu_devices() -> int:
     return torch.cuda.device_count()
 
 
+def get_model_checkpoint(args: argparse.Namespace) -> Optional[ModelCheckpoint]:
+    if not args.output_path:
+        return None
+    return ModelCheckpoint(
+        monitor="train_loss",
+        dirpath=args.output_path,
+        save_last=True,
+    )
+
+
 def main(argv: List[str]) -> None:
     with tempfile.TemporaryDirectory() as tmpdir:
         args = parse_args(argv)
@@ -117,11 +126,10 @@ def main(argv: List[str]) -> None:
         )
 
         # Setup model checkpointing
-        checkpoint_callback = ModelCheckpoint(
-            monitor="train_loss",
-            dirpath=args.output_path,
-            save_last=True,
-        )
+        checkpoint_callback = get_model_checkpoint(args)
+        callbacks = []
+        if checkpoint_callback:
+            callbacks.append(checkpoint_callback)
         if args.load_path:
             print(f"loading checkpoint: {args.load_path}...")
             model.load_from_checkpoint(checkpoint_path=args.load_path)

From 19925a0beef6cb3783c623e3d2c8b533cd235f8c Mon Sep 17 00:00:00 2001
From: Aliaksandr Ivanou <aivanou@fb.com>
Date: Thu, 14 Oct 2021 19:59:36 -0700
Subject: [PATCH 08/14] Do not check `torchx.components.base` module for
 components (#258)

Summary:
Pull Request resolved: https://github.com/pytorch/torchx/pull/258

Do not check `torchx.components.base` module for components

Reviewed By: kiukchung

Differential Revision: D31664832

fbshipit-source-id: 3de72047810ff8c2478e036ce5626459d4c073af
---
 torchx/specs/finder.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/torchx/specs/finder.py b/torchx/specs/finder.py
index eb8ffd2b9..3dc2a74bd 100644
--- a/torchx/specs/finder.py
+++ b/torchx/specs/finder.py
@@ -119,9 +119,13 @@ def _get_components_from_dir(
         search_pattern = os.path.join(search_dir, "**", "*.py")
         component_defs = []
         for filepath in glob.glob(search_pattern, recursive=True):
-            module = self._try_load_module(
-                self._get_module_name(filepath, search_dir, base_module)
-            )
+            module_name = self._get_module_name(filepath, search_dir, base_module)
+            # TODO(aivanou): move `torchx.components.base` to `torchx.specs`, since
+            # there is nothing related to components in `torchx.components.base`
+            # see https://github.com/pytorch/torchx/issues/261
+            if module_name.startswith("torchx.components.base"):
+                continue
+            module = self._try_load_module(module_name)
             defs = self._get_components_from_module(base_module, module)
             component_defs += defs
         return component_defs

From 29f1e5e19c67c7495959856fd1cd367a33dc62ec Mon Sep 17 00:00:00 2001
From: Kiuk Chung <kiuk@fb.com>
Date: Thu, 14 Oct 2021 23:26:41 -0700
Subject: [PATCH 09/14] (torchx/config) remove profiles from .torchxconfig,
 remove hierarchical loading, and move config loading to cli from runner
 (#260)

Summary:
Pull Request resolved: https://github.com/pytorch/torchx/pull/260

1. Removes profiles from .torchxconfig (also removes .cfg suffix from section)
2. Removes hierarchical loading (only picks up .torchxconfig from CWD - project dir)
3. Removes config application from runner and moves it to CLI only

Reviewed By: d4l3k

Differential Revision: D31674537

fbshipit-source-id: 937c3375771316b2bf2f1d65a560d7311031d4fa
---
 torchx/cli/cmd_run.py             |  9 ++--
 torchx/runner/api.py              |  4 --
 torchx/runner/config.py           | 56 +++++++++++++++--------
 torchx/runner/test/config_test.py | 76 ++++++++++++++-----------------
 4 files changed, 78 insertions(+), 67 deletions(-)

diff --git a/torchx/cli/cmd_run.py b/torchx/cli/cmd_run.py
index 1a800e0a4..32e5b8f01 100644
--- a/torchx/cli/cmd_run.py
+++ b/torchx/cli/cmd_run.py
@@ -15,7 +15,7 @@
 import torchx.specs as specs
 from pyre_extensions import none_throws
 from torchx.cli.cmd_base import SubCommand
-from torchx.runner import Runner, get_runner
+from torchx.runner import Runner, config, get_runner
 from torchx.schedulers import get_default_scheduler_name, get_scheduler_factories
 from torchx.specs.finder import (
     ComponentNotFoundException,
@@ -53,6 +53,7 @@ def _parse_run_config(arg: str, scheduler_opts: specs.runopts) -> specs.RunConfi
         option_type = option.opt_type
         typed_value = _convert_to_option_type(value, option_type)
         conf.set(key, typed_value)
+
     return conf
 
 
@@ -114,7 +115,9 @@ def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
     def _run(self, runner: Runner, args: argparse.Namespace) -> Optional[str]:
         run_opts = get_runner().run_opts()
         scheduler_opts = run_opts[args.scheduler]
-        scheduler_args = _parse_run_config(args.scheduler_args, scheduler_opts)
+        cfg = _parse_run_config(args.scheduler_args, scheduler_opts)
+        config.apply(scheduler=args.scheduler, cfg=cfg)
+
         if len(args.conf_args) < 1:
             none_throws(self._subparser).error(
                 "the following arguments are required: conf_file, conf_args"
@@ -129,7 +132,7 @@ def _run(self, runner: Runner, args: argparse.Namespace) -> Optional[str]:
                 conf_file,
                 conf_args,
                 args.scheduler,
-                scheduler_args,
+                cfg,
                 dryrun=args.dryrun,
             )
         except (ComponentValidationException, ComponentNotFoundException) as e:
diff --git a/torchx/runner/api.py b/torchx/runner/api.py
index 4da92d365..8e196a048 100644
--- a/torchx/runner/api.py
+++ b/torchx/runner/api.py
@@ -13,7 +13,6 @@
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union
 
 from pyre_extensions import none_throws
-from torchx.runner import config
 from torchx.runner.events import log_event
 from torchx.schedulers import get_schedulers
 from torchx.schedulers.api import Scheduler
@@ -262,9 +261,6 @@ def dryrun(
                 )
 
         cfg = cfg or RunConfig()
-        # TODO enable profiles - https://github.com/pytorch/torchx/issues/248
-        config.apply(scheduler=scheduler, cfg=cfg, profile="default")
-
         sched = self._scheduler(scheduler)
         sched._validate(app, scheduler)
         dryrun_info = sched.submit_dryrun(app, cfg)
diff --git a/torchx/runner/config.py b/torchx/runner/config.py
index d68613555..b02a2af63 100644
--- a/torchx/runner/config.py
+++ b/torchx/runner/config.py
@@ -72,7 +72,7 @@ def dump(
 
     ::
 
-     [default.kubernetes.cfg]
+     [kubernetes]
      namespace = default
      queue = #FIXME (str)Volcano queue to schedule job in
 
@@ -89,7 +89,7 @@ def dump(
     for sched_name in scheds:
         sched = _get_scheduler(sched_name)
 
-        section = f"default.{sched_name}.cfg"
+        section = f"{sched_name}"
         config.add_section(section)
 
         for opt_name, opt in sched.run_opts():
@@ -114,33 +114,51 @@ def dump(
     config.write(f, space_around_delimiters=True)
 
 
-def apply(scheduler: str, cfg: RunConfig, profile: str = "default") -> None:
+def apply(scheduler: str, cfg: RunConfig, dirs: Optional[List[str]] = None) -> None:
     """
-    Loads .torchxconfig files from predefined locations according
-    to a load hierarchy and applies the loaded configs into the
-    given ``runcfg``. The load hierarchy is as follows (in order of precedence):
+    Loads a ``.torchxconfig`` INI file from each of the specified directories,
+    in order of precedence, and applies the run configs for the scheduler onto
+    the given ``cfg``.
 
-    #. ``runcfg`` given to this function
-    #. configs loaded from ``$HOME/.torchxconfig``
-    #. configs loaded from ``$CWD/.torchxconfig``
+    If no ``dirs`` is specified, then it looks for ``.torchxconfig`` in the
+    current working directory. If a specified directory does not have ``.torchxconfig``
+    then it is ignored.
 
-    Note that load hierarchy does NOT overwrite, but rather adds.
-    That is, the configs already present in ``runcfg`` are not
-    overridden during the load.
+    Note that the configs already present in the given ``cfg`` take precedence
+    over the ones in the config file and only new configs are added. The same holds
+    true for the configs loaded in list order.
+
+    For instance if ``cfg = {"foo": "bar"}`` and the config files are:
+
+    ::
+
+     # dir_1/.torchxconfig
+     [local_cwd]
+     foo = baz
+     hello = world
+
+     # dir_2/.torchxconfig
+     [local_cwd]
+     hello = bob
+
+
+    Then after the method call, ``cfg = {"foo": "bar", "hello": "world"}``.
     """
-    lookup_dirs = [Path.home(), Path.cwd()]
 
-    for d in lookup_dirs:
-        configfile = d / ".torchxconfig"
+    if not dirs:
+        dirs = [str(Path.cwd())]
+
+    for d in dirs:
+        configfile = Path(d) / ".torchxconfig"
         if configfile.exists():
             log.info(f"loading configs from {configfile}")
             with open(str(configfile), "r") as f:
-                load(scheduler, f, cfg, profile)
+                load(scheduler, f, cfg)
 
 
-def load(scheduler: str, f: TextIO, cfg: RunConfig, profile: str = "default") -> None:
+def load(scheduler: str, f: TextIO, cfg: RunConfig) -> None:
     """
-    loads the section ``[{profile}.scheduler_args.{scheduler}]`` from the given
+    loads the section ``[{scheduler}]`` from the given
     configfile ``f`` (in .INI format) into the provided ``runcfg``, only adding
     configs that are NOT currently in the given ``runcfg`` (e.g. does not
     override existing values in ``runcfg``). If no section is found, does nothing.
@@ -151,7 +169,7 @@ def load(scheduler: str, f: TextIO, cfg: RunConfig, profile: str = "default") ->
 
     runopts = _get_scheduler(scheduler).run_opts()
 
-    section = f"{profile}.{scheduler}.cfg"
+    section = f"{scheduler}"
     if config.has_section(section):
         for name, value in config.items(section):
             if name in cfg.cfgs:
diff --git a/torchx/runner/test/config_test.py b/torchx/runner/test/config_test.py
index ad34ba0cb..dc6de5782 100644
--- a/torchx/runner/test/config_test.py
+++ b/torchx/runner/test/config_test.py
@@ -102,35 +102,27 @@ def run_opts(self) -> runopts:
         return opts
 
 
-_CONFIG = """[default.local_cwd.cfg]
+_CONFIG = """[local_cwd]
 log_dir = /home/bob/logs
 prepend_cwd = True
-
-[test.local_cwd.cfg]
-log_dir = None
-prepend_cwd = False
-
-[alpha.local_cwd.cfg]
-log_dir = /tmp/logs
 """
 
-_CONFIG_INVALID = """[default.test.cfg]
+_CONFIG_INVALID = """[test]
 a_run_opt_that = does_not_exist
 s = option_that_exists
 """
 
-_TEAM_CONFIG = """[default.test.cfg]
+_TEAM_CONFIG = """[test]
 s = team_default
 i = 50
 f = 1.2
 """
 
-_MY_CONFIG = """[default.test.cfg]
+_MY_CONFIG = """[test]
 s = my_default
 i = 100
 """
 
-PATH_HOME = "torchx.runner.config.Path.home"
 PATH_CWD = "torchx.runner.config.Path.cwd"
 TORCHX_GET_SCHEDULERS = "torchx.runner.config.get_schedulers"
 
@@ -159,45 +151,50 @@ def _write(self, filename: str, content: str) -> Path:
 
     def test_load(self) -> None:
         cfg = RunConfig()
-        load(profile="default", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg)
+        load(scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg)
         self.assertEqual("/home/bob/logs", cfg.get("log_dir"))
         self.assertEqual(True, cfg.get("prepend_cwd"))
 
-        cfg = RunConfig()
-        load(profile="test", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg)
-        self.assertEqual(None, cfg.get("log_dir"))
-        self.assertEqual(False, cfg.get("prepend_cwd"))
-
-        cfg = RunConfig()
-        load(profile="alpha", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg)
-        self.assertEqual("/tmp/logs", cfg.get("log_dir"))
-        self.assertEqual(None, cfg.get("prepend_cwd"))
-
     def test_no_override_load(self) -> None:
         cfg = RunConfig()
         cfg.set("log_dir", "/foo/bar")
         cfg.set("debug", 1)
 
-        load(profile="test", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg)
+        load(scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg)
         self.assertEqual("/foo/bar", cfg.get("log_dir"))
         self.assertEqual(1, cfg.get("debug"))
-        self.assertEqual(False, cfg.get("prepend_cwd"))
+        self.assertEqual(True, cfg.get("prepend_cwd"))
 
     @patch(
         TORCHX_GET_SCHEDULERS,
         return_value={"test": TestScheduler()},
     )
-    def test_apply(self, _) -> None:
+    def test_apply_default(self, _) -> None:
         with patch(PATH_CWD, return_value=Path(self.test_dir)):
-            with patch(PATH_HOME, return_value=Path(self.test_dir) / "home" / "bob"):
-                cfg = RunConfig()
-                cfg.set("s", "runtime_value")
+            cfg = RunConfig()
+            cfg.set("s", "runtime_value")
+
+            apply(scheduler="test", cfg=cfg)
 
-                apply(profile="default", scheduler="test", cfg=cfg)
+            self.assertEqual("runtime_value", cfg.get("s"))
+            self.assertEqual(50, cfg.get("i"))
+            self.assertEqual(1.2, cfg.get("f"))
 
-                self.assertEqual("runtime_value", cfg.get("s"))
-                self.assertEqual(100, cfg.get("i"))
-                self.assertEqual(1.2, cfg.get("f"))
+    @patch(
+        TORCHX_GET_SCHEDULERS,
+        return_value={"test": TestScheduler()},
+    )
+    def test_apply_dirs(self, _) -> None:
+        cfg = RunConfig()
+        cfg.set("s", "runtime_value")
+        apply(
+            scheduler="test",
+            cfg=cfg,
+            dirs=[str(Path(self.test_dir) / "home" / "bob"), self.test_dir],
+        )
+        self.assertEqual("runtime_value", cfg.get("s"))
+        self.assertEqual(100, cfg.get("i"))
+        self.assertEqual(1.2, cfg.get("f"))
 
     def test_dump_invalid_scheduler(self) -> None:
         with self.assertRaises(ValueError):
@@ -215,7 +212,7 @@ def test_dump_only_required(self, _) -> None:
 
         cfg = RunConfig()
         sfile.seek(0)
-        load(profile="default", scheduler="test", f=sfile, cfg=cfg)
+        load(scheduler="test", f=sfile, cfg=cfg)
 
         self.assertFalse(cfg.cfgs)
 
@@ -226,7 +223,6 @@ def test_dump_only_required(self, _) -> None:
     def test_load_invalid_runopt(self, _) -> None:
         cfg = RunConfig()
         load(
-            profile="default",
             scheduler="test",
             f=StringIO(_CONFIG_INVALID),
             cfg=cfg,
@@ -241,7 +237,6 @@ def test_load_invalid_runopt(self, _) -> None:
     def test_load_no_section(self) -> None:
         cfg = RunConfig()
         load(
-            profile="default",
             scheduler="local_cwd",
             f=StringIO(),
             cfg=cfg,
@@ -250,9 +245,8 @@ def test_load_no_section(self) -> None:
         self.assertFalse(cfg.cfgs)
 
         load(
-            profile="default",
             scheduler="local_cwd",
-            f=StringIO("[default.scheduler_args.local_cwd]\n"),
+            f=StringIO("[scheduler_args.local_cwd]\n"),
             cfg=cfg,
         )
         # still empty
@@ -269,7 +263,7 @@ def test_dump_and_load_all_runopt_types(self, _) -> None:
         sfile.seek(0)
 
         cfg = RunConfig()
-        load(profile="default", scheduler="test", f=sfile, cfg=cfg)
+        load(scheduler="test", f=sfile, cfg=cfg)
 
         # all runopts in the TestScheduler have defaults, just check against those
         for opt_name, opt in TestScheduler().run_opts():
@@ -282,11 +276,11 @@ def test_dump_and_load_all_registered_schedulers(self) -> None:
 
         sfile = StringIO()
         dump(sfile)
-        print(sfile.getvalue())
+
         for sched_name, sched in get_schedulers(session_name="_").items():
             sfile.seek(0)  # reset the file pos
             cfg = RunConfig()
-            load(profile="default", scheduler=sched_name, f=sfile, cfg=cfg)
+            load(scheduler=sched_name, f=sfile, cfg=cfg)
 
             for opt_name, _ in sched.run_opts():
                 self.assertTrue(opt_name in cfg.cfgs)

From f00df91211ecbddedeff56ce87f39de488df2ea1 Mon Sep 17 00:00:00 2001
From: Tristan Rice <rice@fn.lc>
Date: Fri, 15 Oct 2021 13:28:26 -0700
Subject: [PATCH 10/14] ci/docpush: add missing dependency (#264)

Summary:
This adds the missing `pandoc` dependency to the docpush CI step.

Pull Request resolved: https://github.com/pytorch/torchx/pull/264

Test Plan:
Test with docpush manually enabled on the PR
https://github.com/pytorch/torchx/pull/264/checks?check_run_id=3908761230

Reviewed By: aivanou

Differential Revision: D31692193

Pulled By: d4l3k

fbshipit-source-id: 0fcb9b5667ec096d458d4e293c0cd1b34d402f7d
---
 .github/workflows/doc-build.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/doc-build.yaml b/.github/workflows/doc-build.yaml
index 122ae4d4a..b84bb1780 100644
--- a/.github/workflows/doc-build.yaml
+++ b/.github/workflows/doc-build.yaml
@@ -60,6 +60,10 @@ jobs:
           set -ex
           git config --global user.email "runner@github.com"
           git config --global user.name "TorchX CI Runner"
+      - name: Install Dependencies
+        run: |
+          set -eux
+          sudo apt-get install -y pandoc
       - name: Build
         run: |
           set -ex

From 6236614a4ef826eddec3534212ff150946e857e3 Mon Sep 17 00:00:00 2001
From: Tristan Rice <rice@fn.lc>
Date: Fri, 15 Oct 2021 13:32:31 -0700
Subject: [PATCH 11/14] ci/slurm: use ec2 instance connect + mssh instead of
 using SSH keys (#265)

Summary:
This switches the integration tests to use EC2 Instance Connect with an assumed role instead of embedding the Slurm SSH key in GitHub secrets.

Pull Request resolved: https://github.com/pytorch/torchx/pull/265

Test Plan:
```
$ env SLURM_INSTANCE_MASTER=ubuntu@i-01dd4b95724eb0b4b scripts/slurmint.sh
```

CI

Reviewed By: kiukchung, aivanou

Differential Revision: D31695261

Pulled By: d4l3k

fbshipit-source-id: 48a52e911e68bc9b18ed470a5f7e725ff58697b1
---
 .../workflows/slurm-integration-tests.yaml    | 26 ++++++++++++++-----
 scripts/slurmint.sh                           |  8 +++---
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/slurm-integration-tests.yaml b/.github/workflows/slurm-integration-tests.yaml
index cbdd61dd7..98c79b740 100644
--- a/.github/workflows/slurm-integration-tests.yaml
+++ b/.github/workflows/slurm-integration-tests.yaml
@@ -9,6 +9,9 @@ on:
 jobs:
   slurm:
     runs-on: ubuntu-18.04
+    permissions:
+      id-token: write
+      contents: read
     steps:
       - name: Setup Python
         uses: actions/setup-python@v2
@@ -17,21 +20,32 @@ jobs:
           architecture: x64
       - name: Checkout TorchX
         uses: actions/checkout@v2
+      - name: Configure AWS
+        env:
+          AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }}
+        run: |
+          if [ -n "$AWS_ROLE_ARN" ]; then
+            export AWS_WEB_IDENTITY_TOKEN_FILE=/tmp/awscreds
+            export AWS_DEFAULT_REGION=us-west-2
+
+            echo AWS_WEB_IDENTITY_TOKEN_FILE=$AWS_WEB_IDENTITY_TOKEN_FILE >> $GITHUB_ENV
+            echo AWS_ROLE_ARN=$AWS_ROLE_ARN >> $GITHUB_ENV
+            echo AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION >> $GITHUB_ENV
+
+            curl -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" "$ACTIONS_ID_TOKEN_REQUEST_URL" | jq -r '.value' > $AWS_WEB_IDENTITY_TOKEN_FILE
+          fi
       - name: Install Dependencies
         run:
           set -ex
 
-          pip install wheel
+          pip install wheel ec2instanceconnectcli
       - name: Run Slurm Integration Tests
         env:
-          SLURM_SSH: ${{ secrets.SLURM_SSH }}
-          SLURM_MASTER: ${{ secrets.SLURM_MASTER }}
+          SLURM_INSTANCE_MASTER: ${{ secrets.SLURM_INSTANCE_MASTER }}
           SLURM_KNOWN_HOST: ${{ secrets.SLURM_KNOWN_HOST }}
-          SLURM_IDENT: id_rsa
         run: |
           set -e
-          echo "$SLURM_SSH" > "$SLURM_IDENT"
-          chmod 600 "$SLURM_IDENT"
+
           mkdir -p ~/.ssh
           echo "$SLURM_KNOWN_HOST" >> ~/.ssh/known_hosts
 
diff --git a/scripts/slurmint.sh b/scripts/slurmint.sh
index 136ff1597..6a9bd68e7 100755
--- a/scripts/slurmint.sh
+++ b/scripts/slurmint.sh
@@ -14,8 +14,8 @@ python setup.py bdist_wheel
 
 WHEEL="$DIST/$(ls $DIST)"
 
-if [[ -z "${SLURM_MASTER}" ]]; then
-    echo "slurm master is not set, skipping test..."
+if [[ -z "${SLURM_INSTANCE_MASTER}" ]]; then
+    echo "SLURM_INSTANCE_MASTER is not set, skipping test..."
     exit 0
 fi
 
@@ -25,11 +25,11 @@ VENV="$DIR/venv"
 
 function run_cmd {
     # shellcheck disable=SC2048,SC2086
-    ssh -o ServerAliveInterval=60 "$SLURM_MASTER" -i "$SLURM_IDENT" $*
+    mssh -o ServerAliveInterval=60 "$SLURM_INSTANCE_MASTER" -- $*
 }
 
 function run_scp {
-    scp -i "$SLURM_IDENT" "$1" "$SLURM_MASTER:$2"
+    rsync -rav -e mssh "$1" "$SLURM_INSTANCE_MASTER:$2"
 }
 
 function cleanup {

From af8114704751fea2d09a1b27824fa6de6481fb75 Mon Sep 17 00:00:00 2001
From: Aliaksandr Ivanou <aivanou@fb.com>
Date: Thu, 14 Oct 2021 11:36:21 -0700
Subject: [PATCH 12/14] Add interpret docs to example component, remove `test`
 arg from cv trainer

---
 torchx/components/interpret.py                |   2 +-
 .../apps/lightning_classy_vision/component.py | 106 ++++++++++--------
 .../apps/lightning_classy_vision/data.py      |   2 +-
 .../apps/lightning_classy_vision/interpret.py |  12 +-
 .../test/train_test.py                        |  29 +++++
 .../apps/lightning_classy_vision/train.py     |   9 +-
 6 files changed, 101 insertions(+), 59 deletions(-)
 create mode 100644 torchx/examples/apps/lightning_classy_vision/test/train_test.py

diff --git a/torchx/components/interpret.py b/torchx/components/interpret.py
index 6b10c5e29..178da14e9 100644
--- a/torchx/components/interpret.py
+++ b/torchx/components/interpret.py
@@ -16,6 +16,6 @@
 See the
 :ref:`examples_apps/lightning_classy_vision/interpret:Model Interpretability App Example`
 and the corresponding
-:ref:`Interpret component definition<examples_apps/lightning_classy_vision/component:Trainer Component Examples>`
+:ref:`Interpret component definition<examples_apps/lightning_classy_vision/component:Model Interpret>`
 for an example of how to use Captum with TorchX.
 """
diff --git a/torchx/examples/apps/lightning_classy_vision/component.py b/torchx/examples/apps/lightning_classy_vision/component.py
index 820777ae9..ba886d47d 100644
--- a/torchx/examples/apps/lightning_classy_vision/component.py
+++ b/torchx/examples/apps/lightning_classy_vision/component.py
@@ -46,24 +46,24 @@
 #
 #    torchx run --scheduler local_cwd \
 #    ./torchx/examples/apps/lightning_classy_vision/component.py:trainer \
-#    --output_path /tmp
+#    --output_path /tmp/$USER
 #
 # Single trainer component code:
 
 
 def trainer(
-    output_path: str,
-    image: str = TORCHX_IMAGE,
-    data_path: Optional[str] = None,
-    load_path: str = "",
-    log_path: str = "/tmp/logs",
-    resource: Optional[str] = None,
-    env: Optional[Dict[str, str]] = None,
-    skip_export: bool = False,
-    epochs: int = 1,
-    layers: Optional[List[int]] = None,
-    learning_rate: Optional[float] = None,
-    num_samples: int = 200,
+        output_path: str,
+        image: str = TORCHX_IMAGE,
+        data_path: Optional[str] = None,
+        load_path: str = "",
+        log_path: str = "/tmp/logs",
+        resource: Optional[str] = None,
+        env: Optional[Dict[str, str]] = None,
+        skip_export: bool = False,
+        epochs: int = 1,
+        layers: Optional[List[int]] = None,
+        learning_rate: Optional[float] = None,
+        num_samples: int = 200,
 ) -> torchx.AppDef:
     """Runs the example lightning_classy_vision app.
 
@@ -170,19 +170,19 @@ def trainer(
 
 
 def trainer_dist(
-    output_path: str,
-    image: str = TORCHX_IMAGE,
-    data_path: Optional[str] = None,
-    load_path: str = "",
-    log_path: str = "/tmp/logs",
-    resource: Optional[str] = None,
-    env: Optional[Dict[str, str]] = None,
-    skip_export: bool = False,
-    epochs: int = 1,
-    nnodes: int = 1,
-    nproc_per_node: int = 1,
-    rdzv_backend: str = "etcd",
-    rdzv_endpoint: str = "etcd-server:2379",
+        output_path: str,
+        image: str = TORCHX_IMAGE,
+        data_path: Optional[str] = None,
+        load_path: str = "",
+        log_path: str = "/tmp/logs",
+        resource: Optional[str] = None,
+        env: Optional[Dict[str, str]] = None,
+        skip_export: bool = False,
+        epochs: int = 1,
+        nnodes: int = 1,
+        nproc_per_node: int = 1,
+        rdzv_backend: str = "etcd",
+        rdzv_endpoint: str = "etcd-server:2379",
 ) -> torchx.AppDef:
     """Runs the example lightning_classy_vision app.
 
@@ -258,44 +258,58 @@ def trainer_dist(
 
 
 # %%
-# Model Interpretability
-# #######################
-# TODO(aivanou): add documentation
+# Model Interpret
+# #################
+# Defines interpret component
+#
+# Train a single trainer example: :ref:`examples_apps/lightning_classy_vision/component:Single Trainer Component`
+# And use the following cmd to try out:
+#
+# .. code:: bash
+#
+#    torchx run --scheduler local_cwd \
+#    ./torchx/examples/apps/lightning_classy_vision/component.py:interpret \
+#    --output_path /tmp/$USER/interpret --load_path /tmp/$USER/last.ckpt
 
 
 def interpret(
-    image: str,
-    load_path: str,
-    data_path: str,
-    output_path: str,
-    resource: Optional[str] = None,
+        load_path: str,
+        output_path: str,
+        data_path: Optional[str] = None,
+        image: str = TORCHX_IMAGE,
+        resource: Optional[str] = None,
 ) -> torchx.AppDef:
     """Runs the model interpretability app on the model outputted by the training
     component.
 
     Args:
-        image: image to run (e.g. foobar:latest)
         load_path: path to load pretrained model from
-        data_path: path to the data to load
         output_path: output path for model checkpoints (e.g. file:///foo/bar)
+        data_path: path to the data to load
+        image: image to run (e.g. foobar:latest)
         resource: the resources to use
     """
+    args = [
+        "-m",
+        "torchx.examples.apps.lightning_classy_vision.interpret",
+        "--load_path",
+        load_path,
+        "--output_path",
+        output_path,
+    ]
+    if data_path:
+        args += [
+            "--data_path",
+            data_path,
+        ]
+
     return torchx.AppDef(
         name="cv-interpret",
         roles=[
             torchx.Role(
                 name="worker",
                 entrypoint="python",
-                args=[
-                    "-m",
-                    "torchx.examples.apps.lightning_classy_vision.interpret",
-                    "--load_path",
-                    load_path,
-                    "--data_path",
-                    data_path,
-                    "--output_path",
-                    output_path,
-                ],
+                args=args,
                 image=image,
                 resource=named_resources[resource]
                 if resource
diff --git a/torchx/examples/apps/lightning_classy_vision/data.py b/torchx/examples/apps/lightning_classy_vision/data.py
index 56ceb9e6a..68aeced43 100644
--- a/torchx/examples/apps/lightning_classy_vision/data.py
+++ b/torchx/examples/apps/lightning_classy_vision/data.py
@@ -148,7 +148,7 @@ def download_data(remote_path: str, tmpdir: str) -> str:
     return data_path
 
 
-def create_random_data(output_path: str, num_images: int = 5) -> None:
+def create_random_data(output_path: str, num_images: int = 250) -> None:
     """
     Fills the given path with randomly generated 64x64 images.
     This can be used for quick testing of the workflow of the model.
diff --git a/torchx/examples/apps/lightning_classy_vision/interpret.py b/torchx/examples/apps/lightning_classy_vision/interpret.py
index 7f709e2d2..84cd653e4 100755
--- a/torchx/examples/apps/lightning_classy_vision/interpret.py
+++ b/torchx/examples/apps/lightning_classy_vision/interpret.py
@@ -35,10 +35,10 @@
 from torchx.examples.apps.lightning_classy_vision.data import (
     TinyImageNetDataModule,
     download_data,
+    create_random_data,
 )
 from torchx.examples.apps.lightning_classy_vision.model import TinyImageNetModel
 
-
 # FIXME: captum must be imported after torch otherwise it causes python to crash
 if True:
     import numpy as np
@@ -57,8 +57,7 @@ def parse_args(argv: List[str]) -> argparse.Namespace:
     parser.add_argument(
         "--data_path",
         type=str,
-        help="path to load the training data from",
-        required=True,
+        help="path to load the training data from, if not provided, random dataset will be created",
     )
     parser.add_argument(
         "--output_path",
@@ -91,7 +90,12 @@ def main(argv: List[str]) -> None:
         model.load_from_checkpoint(checkpoint_path=args.load_path)
 
         # Download and setup the data module
-        data_path = download_data(args.data_path, tmpdir)
+        if not args.data_path:
+            data_path = os.path.join(tmpdir, "data")
+            os.makedirs(data_path)
+            create_random_data(data_path)
+        else:
+            data_path = download_data(args.data_path, tmpdir)
         data = TinyImageNetDataModule(
             data_dir=data_path,
             batch_size=1,
diff --git a/torchx/examples/apps/lightning_classy_vision/test/train_test.py b/torchx/examples/apps/lightning_classy_vision/test/train_test.py
new file mode 100644
index 000000000..ba2e510b3
--- /dev/null
+++ b/torchx/examples/apps/lightning_classy_vision/test/train_test.py
@@ -0,0 +1,29 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+import torchx.examples.apps.lightning_classy_vision.interpret as interpret
+
+
+class ModelTest(unittest.TestCase):
+    def test_basic(self) -> None:
+        model = TinyImageNetModel()
+        self.assertEqual(len(model.seq), 1)
+        out = model(torch.zeros((1, 64, 64)))
+        self.assertIsNotNone(out)
+
+    def test_layer_sizes(self) -> None:
+        model = TinyImageNetModel(
+            layer_sizes=[
+                10,
+                15,
+            ],
+        )
+        self.assertEqual(len(model.seq), 5)
+        out = model(torch.zeros((1, 64, 64)))
+        self.assertIsNotNone(out)
diff --git a/torchx/examples/apps/lightning_classy_vision/train.py b/torchx/examples/apps/lightning_classy_vision/train.py
index 57bfade7c..2d2a14d55 100755
--- a/torchx/examples/apps/lightning_classy_vision/train.py
+++ b/torchx/examples/apps/lightning_classy_vision/train.py
@@ -56,12 +56,7 @@ def parse_args(argv: List[str]) -> argparse.Namespace:
     parser.add_argument(
         "--batch_size", type=int, default=32, help="batch size to use for training"
     )
-    parser.add_argument("--num_samples", type=int, default=None, help="num_samples")
-    parser.add_argument(
-        "--test",
-        help="Sets to test mode, training on a much smaller set of randomly generated images",
-        action="store_true",
-    )
+    parser.add_argument("--num_samples", type=int, default=10, help="num_samples")
     parser.add_argument(
         "--data_path",
         type=str,
@@ -122,7 +117,7 @@ def main(argv: List[str]) -> None:
         data = TinyImageNetDataModule(
             data_dir=data_path,
             batch_size=args.batch_size,
-            num_samples=5 if args.test else args.num_samples,
+            num_samples=args.num_samples,
         )
 
         # Setup model checkpointing

From c18b7d68244adc0b7a91ea85efa0b31fcc858884 Mon Sep 17 00:00:00 2001
From: Aliaksandr Ivanou <aivanou@fb.com>
Date: Thu, 14 Oct 2021 11:40:34 -0700
Subject: [PATCH 13/14] Resolve lint errors

---
 .../apps/lightning_classy_vision/component.py | 60 +++++++++----------
 .../test/train_test.py                        | 29 ---------
 2 files changed, 30 insertions(+), 59 deletions(-)
 delete mode 100644 torchx/examples/apps/lightning_classy_vision/test/train_test.py

diff --git a/torchx/examples/apps/lightning_classy_vision/component.py b/torchx/examples/apps/lightning_classy_vision/component.py
index ba886d47d..4e069e066 100644
--- a/torchx/examples/apps/lightning_classy_vision/component.py
+++ b/torchx/examples/apps/lightning_classy_vision/component.py
@@ -52,18 +52,18 @@
 
 
 def trainer(
-        output_path: str,
-        image: str = TORCHX_IMAGE,
-        data_path: Optional[str] = None,
-        load_path: str = "",
-        log_path: str = "/tmp/logs",
-        resource: Optional[str] = None,
-        env: Optional[Dict[str, str]] = None,
-        skip_export: bool = False,
-        epochs: int = 1,
-        layers: Optional[List[int]] = None,
-        learning_rate: Optional[float] = None,
-        num_samples: int = 200,
+    output_path: str,
+    image: str = TORCHX_IMAGE,
+    data_path: Optional[str] = None,
+    load_path: str = "",
+    log_path: str = "/tmp/logs",
+    resource: Optional[str] = None,
+    env: Optional[Dict[str, str]] = None,
+    skip_export: bool = False,
+    epochs: int = 1,
+    layers: Optional[List[int]] = None,
+    learning_rate: Optional[float] = None,
+    num_samples: int = 200,
 ) -> torchx.AppDef:
     """Runs the example lightning_classy_vision app.
 
@@ -170,19 +170,19 @@ def trainer(
 
 
 def trainer_dist(
-        output_path: str,
-        image: str = TORCHX_IMAGE,
-        data_path: Optional[str] = None,
-        load_path: str = "",
-        log_path: str = "/tmp/logs",
-        resource: Optional[str] = None,
-        env: Optional[Dict[str, str]] = None,
-        skip_export: bool = False,
-        epochs: int = 1,
-        nnodes: int = 1,
-        nproc_per_node: int = 1,
-        rdzv_backend: str = "etcd",
-        rdzv_endpoint: str = "etcd-server:2379",
+    output_path: str,
+    image: str = TORCHX_IMAGE,
+    data_path: Optional[str] = None,
+    load_path: str = "",
+    log_path: str = "/tmp/logs",
+    resource: Optional[str] = None,
+    env: Optional[Dict[str, str]] = None,
+    skip_export: bool = False,
+    epochs: int = 1,
+    nnodes: int = 1,
+    nproc_per_node: int = 1,
+    rdzv_backend: str = "etcd",
+    rdzv_endpoint: str = "etcd-server:2379",
 ) -> torchx.AppDef:
     """Runs the example lightning_classy_vision app.
 
@@ -273,11 +273,11 @@ def trainer_dist(
 
 
 def interpret(
-        load_path: str,
-        output_path: str,
-        data_path: Optional[str] = None,
-        image: str = TORCHX_IMAGE,
-        resource: Optional[str] = None,
+    load_path: str,
+    output_path: str,
+    data_path: Optional[str] = None,
+    image: str = TORCHX_IMAGE,
+    resource: Optional[str] = None,
 ) -> torchx.AppDef:
     """Runs the model interpretability app on the model outputted by the training
     component.
diff --git a/torchx/examples/apps/lightning_classy_vision/test/train_test.py b/torchx/examples/apps/lightning_classy_vision/test/train_test.py
deleted file mode 100644
index ba2e510b3..000000000
--- a/torchx/examples/apps/lightning_classy_vision/test/train_test.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-import unittest
-
-import torch
-import torchx.examples.apps.lightning_classy_vision.interpret as interpret
-
-
-class ModelTest(unittest.TestCase):
-    def test_basic(self) -> None:
-        model = TinyImageNetModel()
-        self.assertEqual(len(model.seq), 1)
-        out = model(torch.zeros((1, 64, 64)))
-        self.assertIsNotNone(out)
-
-    def test_layer_sizes(self) -> None:
-        model = TinyImageNetModel(
-            layer_sizes=[
-                10,
-                15,
-            ],
-        )
-        self.assertEqual(len(model.seq), 5)
-        out = model(torch.zeros((1, 64, 64)))
-        self.assertIsNotNone(out)

From 794e0c93e7d73f28865f78e0f9522debc665d06f Mon Sep 17 00:00:00 2001
From: Aliaksandr Ivanou <aivanou@fb.com>
Date: Thu, 14 Oct 2021 18:40:55 -0700
Subject: [PATCH 14/14] Addressed comments

---
 torchx/components/interpret.py                            | 2 +-
 torchx/examples/apps/lightning_classy_vision/component.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/torchx/components/interpret.py b/torchx/components/interpret.py
index 178da14e9..030185457 100644
--- a/torchx/components/interpret.py
+++ b/torchx/components/interpret.py
@@ -16,6 +16,6 @@
 See the
 :ref:`examples_apps/lightning_classy_vision/interpret:Model Interpretability App Example`
 and the corresponding
-:ref:`Interpret component definition<examples_apps/lightning_classy_vision/component:Model Interpret>`
+:ref:`Interpret component definition<examples_apps/lightning_classy_vision/component:Interpreting the Model>`
 for an example of how to use Captum with TorchX.
 """
diff --git a/torchx/examples/apps/lightning_classy_vision/component.py b/torchx/examples/apps/lightning_classy_vision/component.py
index 4e069e066..300f63e10 100644
--- a/torchx/examples/apps/lightning_classy_vision/component.py
+++ b/torchx/examples/apps/lightning_classy_vision/component.py
@@ -258,9 +258,9 @@ def trainer_dist(
 
 
 # %%
-# Model Interpret
-# #################
-# Defines interpret component
+# Interpreting the Model
+# #######################
+# Defines a component that interprets the model
 #
 # Train a single trainer example: :ref:`examples_apps/lightning_classy_vision/component:Single Trainer Component`
 # And use the following cmd to try out: