From 9367151634325f42c2d9cf2e865f117399ad68b4 Mon Sep 17 00:00:00 2001 From: Aliaksandr Ivanou Date: Thu, 14 Oct 2021 11:36:21 -0700 Subject: [PATCH 01/14] Add interpret docs to example component, remove `test` arg from cv trainer --- torchx/components/interpret.py | 2 +- .../apps/lightning_classy_vision/component.py | 106 ++++++++++-------- .../apps/lightning_classy_vision/data.py | 2 +- .../apps/lightning_classy_vision/interpret.py | 12 +- .../test/train_test.py | 29 +++++ .../apps/lightning_classy_vision/train.py | 9 +- 6 files changed, 101 insertions(+), 59 deletions(-) create mode 100644 torchx/examples/apps/lightning_classy_vision/test/train_test.py diff --git a/torchx/components/interpret.py b/torchx/components/interpret.py index 6b10c5e29..178da14e9 100644 --- a/torchx/components/interpret.py +++ b/torchx/components/interpret.py @@ -16,6 +16,6 @@ See the :ref:`examples_apps/lightning_classy_vision/interpret:Model Interpretability App Example` and the corresponding -:ref:`Interpret component definition` +:ref:`Interpret component definition` for an example of how to use Captum with TorchX. 
""" diff --git a/torchx/examples/apps/lightning_classy_vision/component.py b/torchx/examples/apps/lightning_classy_vision/component.py index 820777ae9..ba886d47d 100644 --- a/torchx/examples/apps/lightning_classy_vision/component.py +++ b/torchx/examples/apps/lightning_classy_vision/component.py @@ -46,24 +46,24 @@ # # torchx run --scheduler local_cwd \ # ./torchx/examples/apps/lightning_classy_vision/component.py:trainer \ -# --output_path /tmp +# --output_path /tmp/$USER # # Single trainer component code: def trainer( - output_path: str, - image: str = TORCHX_IMAGE, - data_path: Optional[str] = None, - load_path: str = "", - log_path: str = "/tmp/logs", - resource: Optional[str] = None, - env: Optional[Dict[str, str]] = None, - skip_export: bool = False, - epochs: int = 1, - layers: Optional[List[int]] = None, - learning_rate: Optional[float] = None, - num_samples: int = 200, + output_path: str, + image: str = TORCHX_IMAGE, + data_path: Optional[str] = None, + load_path: str = "", + log_path: str = "/tmp/logs", + resource: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + skip_export: bool = False, + epochs: int = 1, + layers: Optional[List[int]] = None, + learning_rate: Optional[float] = None, + num_samples: int = 200, ) -> torchx.AppDef: """Runs the example lightning_classy_vision app. 
@@ -170,19 +170,19 @@ def trainer( def trainer_dist( - output_path: str, - image: str = TORCHX_IMAGE, - data_path: Optional[str] = None, - load_path: str = "", - log_path: str = "/tmp/logs", - resource: Optional[str] = None, - env: Optional[Dict[str, str]] = None, - skip_export: bool = False, - epochs: int = 1, - nnodes: int = 1, - nproc_per_node: int = 1, - rdzv_backend: str = "etcd", - rdzv_endpoint: str = "etcd-server:2379", + output_path: str, + image: str = TORCHX_IMAGE, + data_path: Optional[str] = None, + load_path: str = "", + log_path: str = "/tmp/logs", + resource: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + skip_export: bool = False, + epochs: int = 1, + nnodes: int = 1, + nproc_per_node: int = 1, + rdzv_backend: str = "etcd", + rdzv_endpoint: str = "etcd-server:2379", ) -> torchx.AppDef: """Runs the example lightning_classy_vision app. @@ -258,44 +258,58 @@ def trainer_dist( # %% -# Model Interpretability -# ####################### -# TODO(aivanou): add documentation +# Model Interpret +# ################# +# Defines interpret component +# +# Train a single trainer example: :ref:`examples_apps/lightning_classy_vision/component:Single Trainer Component` +# And use the following cmd to try out: +# +# .. code:: bash +# +# torchx run --scheduler local_cwd \ +# ./torchx/examples/apps/lightning_classy_vision/component.py:interpret \ +# --output_path /tmp/aivanou/interpret --load_path /tmp/$USER/last.ckpt def interpret( - image: str, - load_path: str, - data_path: str, - output_path: str, - resource: Optional[str] = None, + load_path: str, + output_path: str, + data_path: Optional[str] = None, + image: str = TORCHX_IMAGE, + resource: Optional[str] = None, ) -> torchx.AppDef: """Runs the model interpretability app on the model outputted by the training component. Args: - image: image to run (e.g. 
foobar:latest) load_path: path to load pretrained model from - data_path: path to the data to load output_path: output path for model checkpoints (e.g. file:///foo/bar) + data_path: path to the data to load + image: image to run (e.g. foobar:latest) resource: the resources to use """ + args = [ + "-m", + "torchx.examples.apps.lightning_classy_vision.interpret", + "--load_path", + load_path, + "--output_path", + output_path, + ] + if data_path: + args += [ + "--data_path", + data_path, + ] + return torchx.AppDef( name="cv-interpret", roles=[ torchx.Role( name="worker", entrypoint="python", - args=[ - "-m", - "torchx.examples.apps.lightning_classy_vision.interpret", - "--load_path", - load_path, - "--data_path", - data_path, - "--output_path", - output_path, - ], + args=args, image=image, resource=named_resources[resource] if resource diff --git a/torchx/examples/apps/lightning_classy_vision/data.py b/torchx/examples/apps/lightning_classy_vision/data.py index 56ceb9e6a..68aeced43 100644 --- a/torchx/examples/apps/lightning_classy_vision/data.py +++ b/torchx/examples/apps/lightning_classy_vision/data.py @@ -148,7 +148,7 @@ def download_data(remote_path: str, tmpdir: str) -> str: return data_path -def create_random_data(output_path: str, num_images: int = 5) -> None: +def create_random_data(output_path: str, num_images: int = 250) -> None: """ Fills the given path with randomly generated 64x64 images. This can be used for quick testing of the workflow of the model. 
diff --git a/torchx/examples/apps/lightning_classy_vision/interpret.py b/torchx/examples/apps/lightning_classy_vision/interpret.py index 7f709e2d2..84cd653e4 100755 --- a/torchx/examples/apps/lightning_classy_vision/interpret.py +++ b/torchx/examples/apps/lightning_classy_vision/interpret.py @@ -35,10 +35,10 @@ from torchx.examples.apps.lightning_classy_vision.data import ( TinyImageNetDataModule, download_data, + create_random_data, ) from torchx.examples.apps.lightning_classy_vision.model import TinyImageNetModel - # FIXME: captum must be imported after torch otherwise it causes python to crash if True: import numpy as np @@ -57,8 +57,7 @@ def parse_args(argv: List[str]) -> argparse.Namespace: parser.add_argument( "--data_path", type=str, - help="path to load the training data from", - required=True, + help="path to load the training data from, if not provided, random dataset will be created", ) parser.add_argument( "--output_path", @@ -91,7 +90,12 @@ def main(argv: List[str]) -> None: model.load_from_checkpoint(checkpoint_path=args.load_path) # Download and setup the data module - data_path = download_data(args.data_path, tmpdir) + if not args.data_path: + data_path = os.path.join(tmpdir, "data") + os.makedirs(data_path) + create_random_data(data_path) + else: + data_path = download_data(args.data_path, tmpdir) data = TinyImageNetDataModule( data_dir=data_path, batch_size=1, diff --git a/torchx/examples/apps/lightning_classy_vision/test/train_test.py b/torchx/examples/apps/lightning_classy_vision/test/train_test.py new file mode 100644 index 000000000..ba2e510b3 --- /dev/null +++ b/torchx/examples/apps/lightning_classy_vision/test/train_test.py @@ -0,0 +1,29 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import torch +import torchx.examples.apps.lightning_classy_vision.interpret as interpret + + +class ModelTest(unittest.TestCase): + def test_basic(self) -> None: + model = TinyImageNetModel() + self.assertEqual(len(model.seq), 1) + out = model(torch.zeros((1, 64, 64))) + self.assertIsNotNone(out) + + def test_layer_sizes(self) -> None: + model = TinyImageNetModel( + layer_sizes=[ + 10, + 15, + ], + ) + self.assertEqual(len(model.seq), 5) + out = model(torch.zeros((1, 64, 64))) + self.assertIsNotNone(out) diff --git a/torchx/examples/apps/lightning_classy_vision/train.py b/torchx/examples/apps/lightning_classy_vision/train.py index 0db67f097..309fe7adf 100755 --- a/torchx/examples/apps/lightning_classy_vision/train.py +++ b/torchx/examples/apps/lightning_classy_vision/train.py @@ -56,12 +56,7 @@ def parse_args(argv: List[str]) -> argparse.Namespace: parser.add_argument( "--batch_size", type=int, default=32, help="batch size to use for training" ) - parser.add_argument("--num_samples", type=int, default=None, help="num_samples") - parser.add_argument( - "--test", - help="Sets to test mode, training on a much smaller set of randomly generated images", - action="store_true", - ) + parser.add_argument("--num_samples", type=int, default=10, help="num_samples") parser.add_argument( "--data_path", type=str, @@ -113,7 +108,7 @@ def main(argv: List[str]) -> None: data = TinyImageNetDataModule( data_dir=data_path, batch_size=args.batch_size, - num_samples=5 if args.test else args.num_samples, + num_samples=args.num_samples, ) # Setup model checkpointing From 8f798bc0e5400dd34cf898d4c3b08ae39089c3ee Mon Sep 17 00:00:00 2001 From: Aliaksandr Ivanou Date: Thu, 14 Oct 2021 11:40:34 -0700 Subject: [PATCH 02/14] Resolve lint errors --- .../apps/lightning_classy_vision/component.py | 60 +++++++++---------- .../test/train_test.py | 29 --------- 2 files changed, 30 insertions(+), 59 deletions(-) delete mode 100644 
torchx/examples/apps/lightning_classy_vision/test/train_test.py diff --git a/torchx/examples/apps/lightning_classy_vision/component.py b/torchx/examples/apps/lightning_classy_vision/component.py index ba886d47d..4e069e066 100644 --- a/torchx/examples/apps/lightning_classy_vision/component.py +++ b/torchx/examples/apps/lightning_classy_vision/component.py @@ -52,18 +52,18 @@ def trainer( - output_path: str, - image: str = TORCHX_IMAGE, - data_path: Optional[str] = None, - load_path: str = "", - log_path: str = "/tmp/logs", - resource: Optional[str] = None, - env: Optional[Dict[str, str]] = None, - skip_export: bool = False, - epochs: int = 1, - layers: Optional[List[int]] = None, - learning_rate: Optional[float] = None, - num_samples: int = 200, + output_path: str, + image: str = TORCHX_IMAGE, + data_path: Optional[str] = None, + load_path: str = "", + log_path: str = "/tmp/logs", + resource: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + skip_export: bool = False, + epochs: int = 1, + layers: Optional[List[int]] = None, + learning_rate: Optional[float] = None, + num_samples: int = 200, ) -> torchx.AppDef: """Runs the example lightning_classy_vision app. 
@@ -170,19 +170,19 @@ def trainer( def trainer_dist( - output_path: str, - image: str = TORCHX_IMAGE, - data_path: Optional[str] = None, - load_path: str = "", - log_path: str = "/tmp/logs", - resource: Optional[str] = None, - env: Optional[Dict[str, str]] = None, - skip_export: bool = False, - epochs: int = 1, - nnodes: int = 1, - nproc_per_node: int = 1, - rdzv_backend: str = "etcd", - rdzv_endpoint: str = "etcd-server:2379", + output_path: str, + image: str = TORCHX_IMAGE, + data_path: Optional[str] = None, + load_path: str = "", + log_path: str = "/tmp/logs", + resource: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + skip_export: bool = False, + epochs: int = 1, + nnodes: int = 1, + nproc_per_node: int = 1, + rdzv_backend: str = "etcd", + rdzv_endpoint: str = "etcd-server:2379", ) -> torchx.AppDef: """Runs the example lightning_classy_vision app. @@ -273,11 +273,11 @@ def trainer_dist( def interpret( - load_path: str, - output_path: str, - data_path: Optional[str] = None, - image: str = TORCHX_IMAGE, - resource: Optional[str] = None, + load_path: str, + output_path: str, + data_path: Optional[str] = None, + image: str = TORCHX_IMAGE, + resource: Optional[str] = None, ) -> torchx.AppDef: """Runs the model interpretability app on the model outputted by the training component. diff --git a/torchx/examples/apps/lightning_classy_vision/test/train_test.py b/torchx/examples/apps/lightning_classy_vision/test/train_test.py deleted file mode 100644 index ba2e510b3..000000000 --- a/torchx/examples/apps/lightning_classy_vision/test/train_test.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import unittest - -import torch -import torchx.examples.apps.lightning_classy_vision.interpret as interpret - - -class ModelTest(unittest.TestCase): - def test_basic(self) -> None: - model = TinyImageNetModel() - self.assertEqual(len(model.seq), 1) - out = model(torch.zeros((1, 64, 64))) - self.assertIsNotNone(out) - - def test_layer_sizes(self) -> None: - model = TinyImageNetModel( - layer_sizes=[ - 10, - 15, - ], - ) - self.assertEqual(len(model.seq), 5) - out = model(torch.zeros((1, 64, 64))) - self.assertIsNotNone(out) From 873f5092310f86cf8f0f1a24e9e80aa8d15833fa Mon Sep 17 00:00:00 2001 From: Kiuk Chung Date: Thu, 14 Oct 2021 12:13:10 -0700 Subject: [PATCH 03/14] (torchx/runner) standardize class and param naming around runopts and runcfg (#252) Summary: Pull Request resolved: https://github.com/pytorch/torchx/pull/252 closes: https://github.com/pytorch/torchx/issues/250 I've done a few things on this diff: 1. Renamed `torchx.specs.api.Runopt` to `torchx.specs.runopt` (for consistency with `runopts`) 2. Renamed variables (where I could) `runcfg` to `cfg` (to be consistent with the scheduler and runner apis) 3. Renamed the config section to `[$profile.$sched.cfg]` instead of `[$profile.scheduler_args.$sched]` 4. Changed the torchx run cli's `-a` (short for `--scheduler_args`) to `-cfg` for consistency with the rest of the system. 
Reviewed By: aivanou Differential Revision: D31656766 fbshipit-source-id: 8c009852d5807010ac4cd33902b294cff4bd0ec1 --- torchx/cli/cmd_run.py | 14 ++--- torchx/runner/api.py | 2 +- torchx/runner/config.py | 52 ++++++++++------ torchx/runner/test/config_test.py | 100 +++++++++++++++--------------- torchx/specs/__init__.py | 38 ++++++------ torchx/specs/api.py | 21 +++---- 6 files changed, 119 insertions(+), 108 deletions(-) diff --git a/torchx/cli/cmd_run.py b/torchx/cli/cmd_run.py index c72e10f61..1d1189db2 100644 --- a/torchx/cli/cmd_run.py +++ b/torchx/cli/cmd_run.py @@ -10,18 +10,18 @@ import sys from dataclasses import asdict from pprint import pformat -from typing import Dict, List, cast, Type, Optional +from typing import Dict, List, Optional, Type, cast import torchx.specs as specs from pyre_extensions import none_throws from torchx.cli.cmd_base import SubCommand from torchx.runner import Runner, get_runner -from torchx.schedulers import get_scheduler_factories, get_default_scheduler_name +from torchx.schedulers import get_default_scheduler_name, get_scheduler_factories from torchx.specs.finder import ( + ComponentNotFoundException, + ComponentValidationException, _Component, get_components, - ComponentValidationException, - ComponentNotFoundException, ) from torchx.util.types import to_dict @@ -41,13 +41,13 @@ def _convert_to_option_type( return option_type(value) -def _parse_run_config(arg: str, scheduler_run_opts: specs.runopts) -> specs.RunConfig: +def _parse_run_config(arg: str, scheduler_opts: specs.runopts) -> specs.RunConfig: conf = specs.RunConfig() if not arg: return conf for key, value in to_dict(arg).items(): - option = scheduler_run_opts.get(key) + option = scheduler_opts.get(key) if option is None: raise ValueError(f"Unknown {key}, run `torchx runopts` for more info") option_type = option.opt_type @@ -86,7 +86,7 @@ def add_arguments(self, subparser: argparse.ArgumentParser) -> None: default=get_default_scheduler_name(), ) subparser.add_argument( 
- "-a", + "-cfg", "--scheduler_args", type=str, help="Arguments to pass to the scheduler (Ex:`cluster=foo,user=bar`)." diff --git a/torchx/runner/api.py b/torchx/runner/api.py index 89fe009bb..4da92d365 100644 --- a/torchx/runner/api.py +++ b/torchx/runner/api.py @@ -263,7 +263,7 @@ def dryrun( cfg = cfg or RunConfig() # TODO enable profiles - https://github.com/pytorch/torchx/issues/248 - config.apply(profile="default", scheduler=scheduler, runcfg=cfg) + config.apply(scheduler=scheduler, cfg=cfg, profile="default") sched = self._scheduler(scheduler) sched._validate(app, scheduler) diff --git a/torchx/runner/config.py b/torchx/runner/config.py index 40022e4d8..d68613555 100644 --- a/torchx/runner/config.py +++ b/torchx/runner/config.py @@ -12,6 +12,7 @@ from torchx.schedulers import Scheduler, get_schedulers from torchx.specs import RunConfig, get_type_name +from torchx.specs.api import runopt _NONE = "None" @@ -47,22 +48,36 @@ def _get_scheduler(name: str) -> Scheduler: return sched +def _fixme_placeholder(runopt: runopt, max_len: int = 60) -> str: + ph = f"#FIXME:({get_type_name(runopt.opt_type)}) {runopt.help}" + return ph if len(ph) <= max_len else f"{ph[:max_len]}..." + + def dump( f: TextIO, schedulers: Optional[List[str]] = None, required_only: bool = False ) -> None: """ - Dumps a default INI-style config template containing the runopts for the - given scheduler names into ``f``. If no ``schedulers`` are specified - dumps all known registered schedulers. + Dumps a default INI-style config template containing the :py:class:torchx.specs.runopts for the + given scheduler names into the file-like object specified by ``f``. + If no ``schedulers`` are specified dumps all known registered schedulers. Optional runopts are pre-filled with their default values. - Required runopts are set with a ```` placeholder. + Required runopts are set with a ``FIXME: ...`` placeholder. + To only dump required runopts pass ``required_only=True``. 
+ Each scheduler's runopts are written in the section called - ``[default.scheduler_args.{scheduler_name}]`` (e.g. ``[default.scheduler_args.kubernetes]``) + ``[default.{scheduler_name}.cfg]``. - To only dump required runopts pass ``required_only=True``. + For example: + + :: - Raises a ``ValueError`` if given a scheduler name that is not known + [default.kubernetes.cfg] + namespace = default + queue = #FIXME (str)Volcano queue to schedule job in + + Raises: + ``ValueError`` - if given a scheduler name that is not known """ if schedulers: @@ -74,12 +89,12 @@ def dump( for sched_name in scheds: sched = _get_scheduler(sched_name) - section = f"default.scheduler_args.{sched_name}" + section = f"default.{sched_name}.cfg" config.add_section(section) for opt_name, opt in sched.run_opts(): if opt.is_required: - val = f"" + val = _fixme_placeholder(opt) else: # not required runopts MUST have a default if required_only: continue @@ -96,11 +111,10 @@ def dump( val = f"{opt.default}" config.set(section, opt_name, val) - config.write(f, space_around_delimiters=True) -def apply(profile: str, scheduler: str, runcfg: RunConfig) -> None: +def apply(scheduler: str, cfg: RunConfig, profile: str = "default") -> None: """ Loads .torchxconfig files from predefined locations according to a load hierarchy and applies the loaded configs into the @@ -121,10 +135,10 @@ def apply(profile: str, scheduler: str, runcfg: RunConfig) -> None: if configfile.exists(): log.info(f"loading configs from {configfile}") with open(str(configfile), "r") as f: - load(profile, scheduler, f, runcfg) + load(scheduler, f, cfg, profile) -def load(profile: str, scheduler: str, f: TextIO, runcfg: RunConfig) -> None: +def load(scheduler: str, f: TextIO, cfg: RunConfig, profile: str = "default") -> None: """ loads the section ``[{profile}.scheduler_args.{scheduler}]`` from the given configfile ``f`` (in .INI format) into the provided ``runcfg``, only adding @@ -137,17 +151,17 @@ def load(profile: str, scheduler: str, 
f: TextIO, runcfg: RunConfig) -> None: runopts = _get_scheduler(scheduler).run_opts() - section = f"{profile}.scheduler_args.{scheduler}" + section = f"{profile}.{scheduler}.cfg" if config.has_section(section): for name, value in config.items(section): - if name in runcfg.cfgs: + if name in cfg.cfgs: # DO NOT OVERRIDE existing configs continue if value == _NONE: # should map to None (not str 'None') # this also handles empty or None lists - runcfg.set(name, None) + cfg.set(name, None) else: runopt = runopts.get(name) @@ -161,9 +175,9 @@ def load(profile: str, scheduler: str, f: TextIO, runcfg: RunConfig) -> None: if runopt.opt_type is bool: # need to handle bool specially since str -> bool is based on # str emptiness not value (e.g. bool("False") == True) - runcfg.set(name, config.getboolean(section, name)) + cfg.set(name, config.getboolean(section, name)) elif runopt.opt_type is List[str]: - runcfg.set(name, value.split(";")) + cfg.set(name, value.split(";")) else: # pyre-ignore[29] - runcfg.set(name, runopt.opt_type(value)) + cfg.set(name, runopt.opt_type(value)) diff --git a/torchx/runner/test/config_test.py b/torchx/runner/test/config_test.py index b936366f2..ad34ba0cb 100644 --- a/torchx/runner/test/config_test.py +++ b/torchx/runner/test/config_test.py @@ -102,30 +102,30 @@ def run_opts(self) -> runopts: return opts -_CONFIG = """[default.scheduler_args.local_cwd] +_CONFIG = """[default.local_cwd.cfg] log_dir = /home/bob/logs prepend_cwd = True -[test.scheduler_args.local_cwd] +[test.local_cwd.cfg] log_dir = None prepend_cwd = False -[alpha.scheduler_args.local_cwd] +[alpha.local_cwd.cfg] log_dir = /tmp/logs """ -_CONFIG_INVALID = """[default.scheduler_args.test] +_CONFIG_INVALID = """[default.test.cfg] a_run_opt_that = does_not_exist s = option_that_exists """ -_TEAM_CONFIG = """[default.scheduler_args.test] +_TEAM_CONFIG = """[default.test.cfg] s = team_default i = 50 f = 1.2 """ -_MY_CONFIG = """[default.scheduler_args.test] +_MY_CONFIG = 
"""[default.test.cfg] s = my_default i = 100 """ @@ -158,32 +158,30 @@ def _write(self, filename: str, content: str) -> Path: return f def test_load(self) -> None: - runcfg = RunConfig() - load( - profile="default", scheduler="local_cwd", f=StringIO(_CONFIG), runcfg=runcfg - ) - self.assertEqual("/home/bob/logs", runcfg.get("log_dir")) - self.assertEqual(True, runcfg.get("prepend_cwd")) + cfg = RunConfig() + load(profile="default", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg) + self.assertEqual("/home/bob/logs", cfg.get("log_dir")) + self.assertEqual(True, cfg.get("prepend_cwd")) - runcfg = RunConfig() - load(profile="test", scheduler="local_cwd", f=StringIO(_CONFIG), runcfg=runcfg) - self.assertEqual(None, runcfg.get("log_dir")) - self.assertEqual(False, runcfg.get("prepend_cwd")) + cfg = RunConfig() + load(profile="test", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg) + self.assertEqual(None, cfg.get("log_dir")) + self.assertEqual(False, cfg.get("prepend_cwd")) - runcfg = RunConfig() - load(profile="alpha", scheduler="local_cwd", f=StringIO(_CONFIG), runcfg=runcfg) - self.assertEqual("/tmp/logs", runcfg.get("log_dir")) - self.assertEqual(None, runcfg.get("prepend_cwd")) + cfg = RunConfig() + load(profile="alpha", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg) + self.assertEqual("/tmp/logs", cfg.get("log_dir")) + self.assertEqual(None, cfg.get("prepend_cwd")) def test_no_override_load(self) -> None: - runcfg = RunConfig() - runcfg.set("log_dir", "/foo/bar") - runcfg.set("debug", 1) + cfg = RunConfig() + cfg.set("log_dir", "/foo/bar") + cfg.set("debug", 1) - load(profile="test", scheduler="local_cwd", f=StringIO(_CONFIG), runcfg=runcfg) - self.assertEqual("/foo/bar", runcfg.get("log_dir")) - self.assertEqual(1, runcfg.get("debug")) - self.assertEqual(False, runcfg.get("prepend_cwd")) + load(profile="test", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg) + self.assertEqual("/foo/bar", cfg.get("log_dir")) + self.assertEqual(1, 
cfg.get("debug")) + self.assertEqual(False, cfg.get("prepend_cwd")) @patch( TORCHX_GET_SCHEDULERS, @@ -192,14 +190,14 @@ def test_no_override_load(self) -> None: def test_apply(self, _) -> None: with patch(PATH_CWD, return_value=Path(self.test_dir)): with patch(PATH_HOME, return_value=Path(self.test_dir) / "home" / "bob"): - runcfg = RunConfig() - runcfg.set("s", "runtime_value") + cfg = RunConfig() + cfg.set("s", "runtime_value") - apply(profile="default", scheduler="test", runcfg=runcfg) + apply(profile="default", scheduler="test", cfg=cfg) - self.assertEqual("runtime_value", runcfg.get("s")) - self.assertEqual(100, runcfg.get("i")) - self.assertEqual(1.2, runcfg.get("f")) + self.assertEqual("runtime_value", cfg.get("s")) + self.assertEqual(100, cfg.get("i")) + self.assertEqual(1.2, cfg.get("f")) def test_dump_invalid_scheduler(self) -> None: with self.assertRaises(ValueError): @@ -215,50 +213,50 @@ def test_dump_only_required(self, _) -> None: # test scheduler has no required options hence expect empty string dump(f=sfile, required_only=True) - runcfg = RunConfig() + cfg = RunConfig() sfile.seek(0) - load(profile="default", scheduler="test", f=sfile, runcfg=runcfg) + load(profile="default", scheduler="test", f=sfile, cfg=cfg) - self.assertFalse(runcfg.cfgs) + self.assertFalse(cfg.cfgs) @patch( TORCHX_GET_SCHEDULERS, return_value={"test": TestScheduler()}, ) def test_load_invalid_runopt(self, _) -> None: - runcfg = RunConfig() + cfg = RunConfig() load( profile="default", scheduler="test", f=StringIO(_CONFIG_INVALID), - runcfg=runcfg, + cfg=cfg, ) # options in the config file but not in runopts # should be ignored (we shouldn't throw an error since # this makes things super hard to guarantee BC - stale config file will fail # to run, we don't want that) - self.assertEquals("option_that_exists", runcfg.get("s")) + self.assertEquals("option_that_exists", cfg.get("s")) def test_load_no_section(self) -> None: - runcfg = RunConfig() + cfg = RunConfig() load( 
profile="default", scheduler="local_cwd", f=StringIO(), - runcfg=runcfg, + cfg=cfg, ) # is empty - self.assertFalse(runcfg.cfgs) + self.assertFalse(cfg.cfgs) load( profile="default", scheduler="local_cwd", f=StringIO("[default.scheduler_args.local_cwd]\n"), - runcfg=runcfg, + cfg=cfg, ) # still empty - self.assertFalse(runcfg.cfgs) + self.assertFalse(cfg.cfgs) @patch( TORCHX_GET_SCHEDULERS, @@ -270,12 +268,12 @@ def test_dump_and_load_all_runopt_types(self, _) -> None: sfile.seek(0) - runcfg = RunConfig() - load(profile="default", scheduler="test", f=sfile, runcfg=runcfg) + cfg = RunConfig() + load(profile="default", scheduler="test", f=sfile, cfg=cfg) # all runopts in the TestScheduler have defaults, just check against those for opt_name, opt in TestScheduler().run_opts(): - self.assertEqual(runcfg.get(opt_name), opt.default) + self.assertEqual(cfg.get(opt_name), opt.default) def test_dump_and_load_all_registered_schedulers(self) -> None: # dump all the runopts for all registered schedulers @@ -284,11 +282,11 @@ def test_dump_and_load_all_registered_schedulers(self) -> None: sfile = StringIO() dump(sfile) - + print(sfile.getvalue()) for sched_name, sched in get_schedulers(session_name="_").items(): sfile.seek(0) # reset the file pos - runcfg = RunConfig() - load(profile="default", scheduler=sched_name, f=sfile, runcfg=runcfg) + cfg = RunConfig() + load(profile="default", scheduler=sched_name, f=sfile, cfg=cfg) for opt_name, _ in sched.run_opts(): - self.assertTrue(opt_name in runcfg.cfgs) + self.assertTrue(opt_name in cfg.cfgs) diff --git a/torchx/specs/__init__.py b/torchx/specs/__init__.py index f061eb4d8..f2e11b71c 100644 --- a/torchx/specs/__init__.py +++ b/torchx/specs/__init__.py @@ -11,38 +11,40 @@ from torchx.util.entrypoints import load_group from .api import ( # noqa: F401 F403 - SchedulerBackend, - Resource, - NULL_RESOURCE, ALL, MISSING, NONE, - macros, - RetryPolicy, - Role, + NULL_RESOURCE, AppDef, + AppDryRunInfo, + AppHandle, AppState, - 
is_terminal, - ReplicaStatus, - ReplicaState, - RoleStatus, AppStatus, ConfigValue, - RunConfig, - AppDryRunInfo, - get_type_name, - runopts, InvalidRunConfigException, MalformedAppHandleException, - UnknownSchedulerException, - AppHandle, + ReplicaState, + ReplicaStatus, + Resource, + RetryPolicy, + Role, + RoleStatus, + RunConfig, + SchedulerBackend, UnknownAppException, + UnknownSchedulerException, + from_function, + get_argparse_param_type, + get_type_name, + is_terminal, + macros, make_app_handle, parse_app_handle, - get_argparse_param_type, - from_function, + runopt, + runopts, ) + GiB: int = 1024 diff --git a/torchx/specs/api.py b/torchx/specs/api.py index cb9bf354f..d068ae010 100644 --- a/torchx/specs/api.py +++ b/torchx/specs/api.py @@ -509,7 +509,7 @@ def get_type_name(tp: Type[ConfigValue]) -> str: @dataclass -class Runopt: +class runopt: """ Represents the metadata about the specific run option """ @@ -554,9 +554,9 @@ class runopts: """ def __init__(self) -> None: - self._opts: Dict[str, Runopt] = {} + self._opts: Dict[str, runopt] = {} - def __iter__(self) -> Iterator[Tuple[str, Runopt]]: + def __iter__(self) -> Iterator[Tuple[str, runopt]]: return self._opts.items().__iter__() @staticmethod @@ -573,7 +573,7 @@ def is_type(obj: ConfigValue, tp: Type[ConfigValue]) -> bool: else: return False - def get(self, name: str) -> Optional[Runopt]: + def get(self, name: str) -> Optional[runopt]: """ Returns option if any was registered, or None otherwise """ @@ -603,7 +603,7 @@ def add( f" Given: {default} ({type(default).__name__})" ) - self._opts[cfg_key] = Runopt(default, type_, required, help) + self._opts[cfg_key] = runopt(default, type_, required, help) def resolve(self, config: RunConfig) -> RunConfig: """ @@ -623,9 +623,8 @@ def resolve(self, config: RunConfig) -> RunConfig: # check required opt if runopt.is_required and val is None: raise InvalidRunConfigException( - f"Required run option: {cfg_key}, must be provided and not None", + f"Required run 
option: {cfg_key}, must be provided and not `None`", config, - self, ) # check type (None matches all types) @@ -634,7 +633,6 @@ def resolve(self, config: RunConfig) -> RunConfig: f"Run option: {cfg_key}, must be of type: {get_type_name(runopt.opt_type)}," f" but was: {val} ({type(val).__name__})", config, - self, ) # not required and not set, set to default @@ -678,10 +676,9 @@ class InvalidRunConfigException(Exception): type mismatch. """ - def __init__( - self, invalid_reason: str, run_config: RunConfig, runopts: "runopts" - ) -> None: - super().__init__(f"{invalid_reason}. Given: {run_config}, Expected: {runopts}") + def __init__(self, invalid_reason: str, cfg: RunConfig) -> None: + given = str(cfg) if cfg.cfgs else "" + super().__init__(f"{invalid_reason}. Given: {given}") class MalformedAppHandleException(Exception): From 95ea9f53e29e399522cd18f12e4bcf81781ea82e Mon Sep 17 00:00:00 2001 From: Kiuk Chung Date: Thu, 14 Oct 2021 16:05:58 -0700 Subject: [PATCH 04/14] (torchx/cli) fix misaligned log msgs for torchx run Summary: closes: https://github.com/pytorch/torchx/issues/251 Reviewed By: d4l3k Differential Revision: D31659813 fbshipit-source-id: 150ea152339adf84d19f1eb8b4a2e083901705ab --- torchx/cli/cmd_run.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/torchx/cli/cmd_run.py b/torchx/cli/cmd_run.py index 1d1189db2..1a800e0a4 100644 --- a/torchx/cli/cmd_run.py +++ b/torchx/cli/cmd_run.py @@ -66,9 +66,9 @@ def _builtins(self) -> Dict[str, _Component]: def run(self, args: argparse.Namespace) -> None: builtin_components = self._builtins() num_builtins = len(builtin_components) - logger.info(f"Found {num_builtins} builtin configs:") + print(f"Found {num_builtins} builtin configs:") for i, component in enumerate(builtin_components.values()): - logger.info(f" {i + 1:2d}. {component.name}") + print(f" {i + 1:2d}. 
{component.name}") class CmdRun(SubCommand): @@ -139,11 +139,12 @@ def _run(self, runner: Runner, args: argparse.Namespace) -> Optional[str]: if args.dryrun: app_dryrun_info = cast(specs.AppDryRunInfo, result) - logger.info("=== APPLICATION ===") - logger.info(pformat(asdict(app_dryrun_info._app), indent=2, width=80)) + logger.info( + "\n=== APPLICATION ===\n" + f"{pformat(asdict(app_dryrun_info._app), indent=2, width=80)}" + ) - logger.info("=== SCHEDULER REQUEST ===") - logger.info(app_dryrun_info) + logger.info("\n=== SCHEDULER REQUEST ===\n" f"{app_dryrun_info}") return else: app_handle = cast(specs.AppHandle, result) @@ -153,7 +154,6 @@ def _run(self, runner: Runner, args: argparse.Namespace) -> Optional[str]: if args.scheduler.startswith("local"): self._wait_and_exit(runner, app_handle) else: - logger.info("=== RUN RESULT ===") logger.info(f"Launched app: {app_handle}") status = runner.status(app_handle) logger.info(status) From 6f91834b597f379d04b725570c69c7c0f51ee819 Mon Sep 17 00:00:00 2001 From: Aliaksandr Ivanou Date: Thu, 14 Oct 2021 18:40:55 -0700 Subject: [PATCH 05/14] Addressed comments --- torchx/components/interpret.py | 2 +- torchx/examples/apps/lightning_classy_vision/component.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/torchx/components/interpret.py b/torchx/components/interpret.py index 178da14e9..030185457 100644 --- a/torchx/components/interpret.py +++ b/torchx/components/interpret.py @@ -16,6 +16,6 @@ See the :ref:`examples_apps/lightning_classy_vision/interpret:Model Interpretability App Example` and the corresponding -:ref:`Interpret component definition` +:ref:`Interpret component definition` for an example of how to use Captum with TorchX. 
""" diff --git a/torchx/examples/apps/lightning_classy_vision/component.py b/torchx/examples/apps/lightning_classy_vision/component.py index 4e069e066..300f63e10 100644 --- a/torchx/examples/apps/lightning_classy_vision/component.py +++ b/torchx/examples/apps/lightning_classy_vision/component.py @@ -258,9 +258,9 @@ def trainer_dist( # %% -# Model Interpret -# ################# -# Defines interpret component +# Interpreting the Model +# ####################### +# Defines a component that interprets the model # # Train a single trainer example: :ref:`examples_apps/lightning_classy_vision/component:Single Trainer Component` # And use the following cmd to try out: From 71958727d36c51fd65d9b0d790b47e3a97ff964a Mon Sep 17 00:00:00 2001 From: Tristan Rice Date: Thu, 14 Oct 2021 19:24:42 -0700 Subject: [PATCH 06/14] CI: use OIDC (#256) Summary: This switches our integration tests to use the GitHub OpenID Connect credentials provider instead of using hard coded AWS session tokens. This will issue tokens that last for 1 hour so should be a lot more secure (and trackable) than before. 
https://awsteele.com/blog/2021/09/15/aws-federation-comes-to-github-actions.html Pull Request resolved: https://github.com/pytorch/torchx/pull/256 Test Plan: CI created PR from external repo to verify they can't generate tokens https://github.com/pytorch/torchx/pull/257 Reviewed By: kiukchung Differential Revision: D31674489 Pulled By: d4l3k fbshipit-source-id: 5936c64794816eb9fafe76899af44e2f865c64df --- .../components-integration-tests.yaml | 29 +++++++++++------ .github/workflows/kfp-integration-tests.yaml | 29 +++++++++++------ ...bernetes-dist-train-integration-tests.yaml | 31 ++++++++++++------- 3 files changed, 58 insertions(+), 31 deletions(-) diff --git a/.github/workflows/components-integration-tests.yaml b/.github/workflows/components-integration-tests.yaml index 006812636..8ee7b8e7d 100644 --- a/.github/workflows/components-integration-tests.yaml +++ b/.github/workflows/components-integration-tests.yaml @@ -9,6 +9,9 @@ on: jobs: components-launch: runs-on: ubuntu-18.04 + permissions: + id-token: write + contents: read steps: - name: Setup Python uses: actions/setup-python@v2 @@ -17,22 +20,30 @@ jobs: architecture: x64 - name: Checkout TorchX uses: actions/checkout@v2 - - name: Configure Kube Config + - name: Configure AWS env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} + run: | + if [ -n "$AWS_ROLE_ARN" ]; then + export AWS_WEB_IDENTITY_TOKEN_FILE=/tmp/awscreds + export AWS_DEFAULT_REGION=us-west-2 + + echo AWS_WEB_IDENTITY_TOKEN_FILE=$AWS_WEB_IDENTITY_TOKEN_FILE >> $GITHUB_ENV + echo AWS_ROLE_ARN=$AWS_ROLE_ARN >> $GITHUB_ENV + echo AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION >> $GITHUB_ENV + + curl -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" "$ACTIONS_ID_TOKEN_REQUEST_URL" | jq -r '.value' > $AWS_WEB_IDENTITY_TOKEN_FILE + fi + - name: Configure Kube Config run: | set -eux - if [ -n "$AWS_ACCESS_KEY_ID" ]; then + if [ -n 
"$AWS_ROLE_ARN" ]; then aws eks update-kubeconfig --region=us-west-2 --name=${{ secrets.EKS_CLUSTER_NAME }} fi - name: Configure Docker - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} run: | set -eux - if [ -n "$AWS_ACCESS_KEY_ID" ]; then + if [ -n "$AWS_ROLE_ARN" ]; then aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 495572122715.dkr.ecr.us-west-2.amazonaws.com fi - name: Install dependencies @@ -42,8 +53,6 @@ jobs: pip install -e .[kubernetes] - name: Run Components Integration Tests env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} INTEGRATION_TEST_STORAGE: ${{ secrets.INTEGRATION_TEST_STORAGE }} CONTAINER_REPO: ${{ secrets.CONTAINER_REPO }} run: scripts/component_integration_tests.py diff --git a/.github/workflows/kfp-integration-tests.yaml b/.github/workflows/kfp-integration-tests.yaml index 53591852e..c1f386f7f 100644 --- a/.github/workflows/kfp-integration-tests.yaml +++ b/.github/workflows/kfp-integration-tests.yaml @@ -9,6 +9,9 @@ on: jobs: kfp-launch: runs-on: ubuntu-18.04 + permissions: + id-token: write + contents: read steps: - name: Install kubectl # More info: https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/ @@ -18,13 +21,24 @@ jobs: mkdir -p ~/.local/bin/kubectl mv ./kubectl ~/.local/bin/kubectl export PATH=$PATH:~/.local/bin/kubectl - - name: Configure Kube Config + - name: Configure AWS env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} + run: | + if [ -n "$AWS_ROLE_ARN" ]; then + export AWS_WEB_IDENTITY_TOKEN_FILE=/tmp/awscreds + export AWS_DEFAULT_REGION=us-west-2 + + echo AWS_WEB_IDENTITY_TOKEN_FILE=$AWS_WEB_IDENTITY_TOKEN_FILE >> $GITHUB_ENV + echo AWS_ROLE_ARN=$AWS_ROLE_ARN >> $GITHUB_ENV + echo 
AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION >> $GITHUB_ENV + + curl -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" "$ACTIONS_ID_TOKEN_REQUEST_URL" | jq -r '.value' > $AWS_WEB_IDENTITY_TOKEN_FILE + fi + - name: Configure Kube Config run: | set -eux - if [ -n "$AWS_ACCESS_KEY_ID" ]; then + if [ -n "$AWS_ROLE_ARN" ]; then aws eks update-kubeconfig --region=us-west-2 --name=${{ secrets.EKS_CLUSTER_NAME }} fi - name: Setup Python @@ -35,12 +49,9 @@ jobs: - name: Checkout TorchX uses: actions/checkout@v2 - name: Configure Docker - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} run: | set -eux - if [ -n "$AWS_ACCESS_KEY_ID" ]; then + if [ -n "$AWS_ROLE_ARN" ]; then aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 495572122715.dkr.ecr.us-west-2.amazonaws.com fi - name: Install dependencies @@ -50,8 +61,6 @@ jobs: python setup.py install - name: Run KFP Integration Tests env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} KFP_NAMESPACE: ${{ secrets.KFP_NAMESPACE }} INTEGRATION_TEST_STORAGE: ${{ secrets.INTEGRATION_TEST_STORAGE }} CONTAINER_REPO: ${{ secrets.CONTAINER_REPO }} diff --git a/.github/workflows/kubernetes-dist-train-integration-tests.yaml b/.github/workflows/kubernetes-dist-train-integration-tests.yaml index 8088d5baa..87b100da8 100644 --- a/.github/workflows/kubernetes-dist-train-integration-tests.yaml +++ b/.github/workflows/kubernetes-dist-train-integration-tests.yaml @@ -9,6 +9,9 @@ on: jobs: kubernetes-launch: runs-on: ubuntu-18.04 + permissions: + id-token: write + contents: read steps: - name: Setup Python uses: actions/setup-python@v2 @@ -17,22 +20,30 @@ jobs: architecture: x64 - name: Checkout TorchX uses: actions/checkout@v2 - - name: Configure Kube Config + - name: Configure AWS env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: 
${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} + run: | + if [ -n "$AWS_ROLE_ARN" ]; then + export AWS_WEB_IDENTITY_TOKEN_FILE=/tmp/awscreds + export AWS_DEFAULT_REGION=us-west-2 + + echo AWS_WEB_IDENTITY_TOKEN_FILE=$AWS_WEB_IDENTITY_TOKEN_FILE >> $GITHUB_ENV + echo AWS_ROLE_ARN=$AWS_ROLE_ARN >> $GITHUB_ENV + echo AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION >> $GITHUB_ENV + + curl -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" "$ACTIONS_ID_TOKEN_REQUEST_URL" | jq -r '.value' > $AWS_WEB_IDENTITY_TOKEN_FILE + fi + - name: Configure Kube Config run: | set -eux - if [ -n "$AWS_ACCESS_KEY_ID" ]; then + if [ -n "$AWS_ROLE_ARN" ]; then aws eks update-kubeconfig --region=us-west-2 --name=${{ secrets.EKS_CLUSTER_NAME }} fi - name: Configure Docker - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} run: | set -eux - if [ -n "$AWS_ACCESS_KEY_ID" ]; then + if [ -n "$AWS_ROLE_ARN" ]; then aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 495572122715.dkr.ecr.us-west-2.amazonaws.com fi - name: Install dependencies @@ -41,12 +52,10 @@ jobs: pip install -e .[kubernetes] - name: Run Kubernetes Integration Tests env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} INTEGRATION_TEST_STORAGE: ${{ secrets.INTEGRATION_TEST_STORAGE }} CONTAINER_REPO: ${{ secrets.CONTAINER_REPO }} run: | - if [ -z "$AWS_ACCESS_KEY_ID" ]; then + if [ -z "$AWS_ROLE_ARN" ]; then # only dryrun if no secrets ARGS="--dryrun" else From 37311e47ef5f2c9ce956e224e9b568c5cf2912d2 Mon Sep 17 00:00:00 2001 From: Aliaksandr Ivanou Date: Thu, 14 Oct 2021 19:30:50 -0700 Subject: [PATCH 07/14] Make `output_path` optional Summary: Since we removed distributed sum, we need to use this example to run fb internal tests. 
For internal tests, we don't need the `output_path`, which adds roughly 200 MB of data on each run Reviewed By: kiukchung Differential Revision: D31661378 fbshipit-source-id: 098bf9f5be9302e7d8cced672ba9cf7eaf8b32e6 --- .../apps/lightning_classy_vision/train.py | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/torchx/examples/apps/lightning_classy_vision/train.py b/torchx/examples/apps/lightning_classy_vision/train.py index 0db67f097..57bfade7c 100755 --- a/torchx/examples/apps/lightning_classy_vision/train.py +++ b/torchx/examples/apps/lightning_classy_vision/train.py @@ -21,7 +21,7 @@ import os import sys import tempfile -from typing import List +from typing import List, Optional import pytorch_lightning as pl import torch @@ -72,8 +72,7 @@ def parse_args(argv: List[str]) -> argparse.Namespace: parser.add_argument( "--output_path", type=str, - help="path to place checkpoints and model outputs", - required=True, + help="path to place checkpoints and model outputs, if not specified, checkpoints are not saved", ) parser.add_argument( "--log_path", @@ -94,6 +93,16 @@ def get_gpu_devices() -> int: return torch.cuda.device_count() +def get_model_checkpoint(args: argparse.Namespace) -> Optional[ModelCheckpoint]: + if not args.output_path: + return None + return ModelCheckpoint( + monitor="train_loss", + dirpath=args.output_path, + save_last=True, + ) + + def main(argv: List[str]) -> None: with tempfile.TemporaryDirectory() as tmpdir: args = parse_args(argv) @@ -117,11 +126,10 @@ def main(argv: List[str]) -> None: ) # Setup model checkpointing - checkpoint_callback = ModelCheckpoint( - monitor="train_loss", - dirpath=args.output_path, - save_last=True, - ) + checkpoint_callback = get_model_checkpoint(args) + callbacks = [] + if checkpoint_callback: + callbacks.append(checkpoint_callback) if args.load_path: print(f"loading checkpoint: {args.load_path}...") model.load_from_checkpoint(checkpoint_path=args.load_path) From
19925a0beef6cb3783c623e3d2c8b533cd235f8c Mon Sep 17 00:00:00 2001 From: Aliaksandr Ivanou Date: Thu, 14 Oct 2021 19:59:36 -0700 Subject: [PATCH 08/14] Do not check `torchx.components.base` module for components (#258) Summary: Pull Request resolved: https://github.com/pytorch/torchx/pull/258 Do not check `torchx.components.base` module for components Reviewed By: kiukchung Differential Revision: D31664832 fbshipit-source-id: 3de72047810ff8c2478e036ce5626459d4c073af --- torchx/specs/finder.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/torchx/specs/finder.py b/torchx/specs/finder.py index eb8ffd2b9..3dc2a74bd 100644 --- a/torchx/specs/finder.py +++ b/torchx/specs/finder.py @@ -119,9 +119,13 @@ def _get_components_from_dir( search_pattern = os.path.join(search_dir, "**", "*.py") component_defs = [] for filepath in glob.glob(search_pattern, recursive=True): - module = self._try_load_module( - self._get_module_name(filepath, search_dir, base_module) - ) + module_name = self._get_module_name(filepath, search_dir, base_module) + # TODO(aivanou): move `torchx.components.base` to `torchx.specs`, since + # there is nothing related to components in `torchx.components.base` + # see https://github.com/pytorch/torchx/issues/261 + if module_name.startswith("torchx.components.base"): + continue + module = self._try_load_module(module_name) defs = self._get_components_from_module(base_module, module) component_defs += defs return component_defs From 29f1e5e19c67c7495959856fd1cd367a33dc62ec Mon Sep 17 00:00:00 2001 From: Kiuk Chung Date: Thu, 14 Oct 2021 23:26:41 -0700 Subject: [PATCH 09/14] (torchx/config) remove profiles from .torchxconfig, remove hierarchical loading, and move config loading to cli from runner (#260) Summary: Pull Request resolved: https://github.com/pytorch/torchx/pull/260 1. Removes profiles from .torchxconfig (also removes .cfg suffix from section) 2. 
Removes hierarchical loading (only picks up .torchxconfig from CWD - project dir) 3. Removes config application from runner and moves it to CLI only Reviewed By: d4l3k Differential Revision: D31674537 fbshipit-source-id: 937c3375771316b2bf2f1d65a560d7311031d4fa --- torchx/cli/cmd_run.py | 9 ++-- torchx/runner/api.py | 4 -- torchx/runner/config.py | 56 +++++++++++++++-------- torchx/runner/test/config_test.py | 76 ++++++++++++++----------------- 4 files changed, 78 insertions(+), 67 deletions(-) diff --git a/torchx/cli/cmd_run.py b/torchx/cli/cmd_run.py index 1a800e0a4..32e5b8f01 100644 --- a/torchx/cli/cmd_run.py +++ b/torchx/cli/cmd_run.py @@ -15,7 +15,7 @@ import torchx.specs as specs from pyre_extensions import none_throws from torchx.cli.cmd_base import SubCommand -from torchx.runner import Runner, get_runner +from torchx.runner import Runner, config, get_runner from torchx.schedulers import get_default_scheduler_name, get_scheduler_factories from torchx.specs.finder import ( ComponentNotFoundException, @@ -53,6 +53,7 @@ def _parse_run_config(arg: str, scheduler_opts: specs.runopts) -> specs.RunConfi option_type = option.opt_type typed_value = _convert_to_option_type(value, option_type) conf.set(key, typed_value) + return conf @@ -114,7 +115,9 @@ def add_arguments(self, subparser: argparse.ArgumentParser) -> None: def _run(self, runner: Runner, args: argparse.Namespace) -> Optional[str]: run_opts = get_runner().run_opts() scheduler_opts = run_opts[args.scheduler] - scheduler_args = _parse_run_config(args.scheduler_args, scheduler_opts) + cfg = _parse_run_config(args.scheduler_args, scheduler_opts) + config.apply(scheduler=args.scheduler, cfg=cfg) + if len(args.conf_args) < 1: none_throws(self._subparser).error( "the following arguments are required: conf_file, conf_args" @@ -129,7 +132,7 @@ def _run(self, runner: Runner, args: argparse.Namespace) -> Optional[str]: conf_file, conf_args, args.scheduler, - scheduler_args, + cfg, dryrun=args.dryrun, ) except 
(ComponentValidationException, ComponentNotFoundException) as e: diff --git a/torchx/runner/api.py b/torchx/runner/api.py index 4da92d365..8e196a048 100644 --- a/torchx/runner/api.py +++ b/torchx/runner/api.py @@ -13,7 +13,6 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union from pyre_extensions import none_throws -from torchx.runner import config from torchx.runner.events import log_event from torchx.schedulers import get_schedulers from torchx.schedulers.api import Scheduler @@ -262,9 +261,6 @@ def dryrun( ) cfg = cfg or RunConfig() - # TODO enable profiles - https://github.com/pytorch/torchx/issues/248 - config.apply(scheduler=scheduler, cfg=cfg, profile="default") - sched = self._scheduler(scheduler) sched._validate(app, scheduler) dryrun_info = sched.submit_dryrun(app, cfg) diff --git a/torchx/runner/config.py b/torchx/runner/config.py index d68613555..b02a2af63 100644 --- a/torchx/runner/config.py +++ b/torchx/runner/config.py @@ -72,7 +72,7 @@ def dump( :: - [default.kubernetes.cfg] + [kubernetes] namespace = default queue = #FIXME (str)Volcano queue to schedule job in @@ -89,7 +89,7 @@ def dump( for sched_name in scheds: sched = _get_scheduler(sched_name) - section = f"default.{sched_name}.cfg" + section = f"{sched_name}" config.add_section(section) for opt_name, opt in sched.run_opts(): @@ -114,33 +114,51 @@ def dump( config.write(f, space_around_delimiters=True) -def apply(scheduler: str, cfg: RunConfig, profile: str = "default") -> None: +def apply(scheduler: str, cfg: RunConfig, dirs: Optional[List[str]] = None) -> None: """ - Loads .torchxconfig files from predefined locations according - to a load hierarchy and applies the loaded configs into the - given ``runcfg``. The load hierarchy is as follows (in order of precedence): + Loads a ``.torchxconfig`` INI file from the specified directories in + preceding order and applies the run configs for the scheduler onto + the given ``cfg``. - #. ``runcfg`` given to this function - #. 
configs loaded from ``$HOME/.torchxconfig`` - #. configs loaded from ``$CWD/.torchxconfig`` + If no ``dirs`` is specified, then it looks for ``.torchxconfig`` in the + current working directory. If a specified directory does not have ``.torchxconfig`` + then it is ignored. - Note that load hierarchy does NOT overwrite, but rather adds. - That is, the configs already present in ``runcfg`` are not - overridden during the load. + Note that the configs already present in the given ``cfg`` take precedence + over the ones in the config file and only new configs are added. The same holds + true for the configs loaded in list order. + + For instance if ``cfg = {"foo": "bar"}`` and the config file is: + + :: + + # dir_1/.torchxconfig + [local_cwd] + foo = baz + hello = world + + # dir_2/.torchxconfig + [local_cwd] + hello = bob + + + Then after the method call, ``cfg = {"foo": "bar", "hello": "world"}``. """ - lookup_dirs = [Path.home(), Path.cwd()] - for d in lookup_dirs: - configfile = d / ".torchxconfig" + if not dirs: + dirs = [str(Path.cwd())] + + for d in dirs: + configfile = Path(d) / ".torchxconfig" if configfile.exists(): log.info(f"loading configs from {configfile}") with open(str(configfile), "r") as f: - load(scheduler, f, cfg, profile) + load(scheduler, f, cfg) -def load(scheduler: str, f: TextIO, cfg: RunConfig, profile: str = "default") -> None: +def load(scheduler: str, f: TextIO, cfg: RunConfig) -> None: """ - loads the section ``[{profile}.scheduler_args.{scheduler}]`` from the given + loads the section ``[{scheduler}]`` from the given configfile ``f`` (in .INI format) into the provided ``runcfg``, only adding configs that are NOT currently in the given ``runcfg`` (e.g. does not override existing values in ``runcfg``). If no section is found, does nothing. 
@@ -151,7 +169,7 @@ def load(scheduler: str, f: TextIO, cfg: RunConfig, profile: str = "default") -> runopts = _get_scheduler(scheduler).run_opts() - section = f"{profile}.{scheduler}.cfg" + section = f"{scheduler}" if config.has_section(section): for name, value in config.items(section): if name in cfg.cfgs: diff --git a/torchx/runner/test/config_test.py b/torchx/runner/test/config_test.py index ad34ba0cb..dc6de5782 100644 --- a/torchx/runner/test/config_test.py +++ b/torchx/runner/test/config_test.py @@ -102,35 +102,27 @@ def run_opts(self) -> runopts: return opts -_CONFIG = """[default.local_cwd.cfg] +_CONFIG = """[local_cwd] log_dir = /home/bob/logs prepend_cwd = True - -[test.local_cwd.cfg] -log_dir = None -prepend_cwd = False - -[alpha.local_cwd.cfg] -log_dir = /tmp/logs """ -_CONFIG_INVALID = """[default.test.cfg] +_CONFIG_INVALID = """[test] a_run_opt_that = does_not_exist s = option_that_exists """ -_TEAM_CONFIG = """[default.test.cfg] +_TEAM_CONFIG = """[test] s = team_default i = 50 f = 1.2 """ -_MY_CONFIG = """[default.test.cfg] +_MY_CONFIG = """[test] s = my_default i = 100 """ -PATH_HOME = "torchx.runner.config.Path.home" PATH_CWD = "torchx.runner.config.Path.cwd" TORCHX_GET_SCHEDULERS = "torchx.runner.config.get_schedulers" @@ -159,45 +151,50 @@ def _write(self, filename: str, content: str) -> Path: def test_load(self) -> None: cfg = RunConfig() - load(profile="default", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg) + load(scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg) self.assertEqual("/home/bob/logs", cfg.get("log_dir")) self.assertEqual(True, cfg.get("prepend_cwd")) - cfg = RunConfig() - load(profile="test", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg) - self.assertEqual(None, cfg.get("log_dir")) - self.assertEqual(False, cfg.get("prepend_cwd")) - - cfg = RunConfig() - load(profile="alpha", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg) - self.assertEqual("/tmp/logs", cfg.get("log_dir")) - self.assertEqual(None, 
cfg.get("prepend_cwd")) - def test_no_override_load(self) -> None: cfg = RunConfig() cfg.set("log_dir", "/foo/bar") cfg.set("debug", 1) - load(profile="test", scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg) + load(scheduler="local_cwd", f=StringIO(_CONFIG), cfg=cfg) self.assertEqual("/foo/bar", cfg.get("log_dir")) self.assertEqual(1, cfg.get("debug")) - self.assertEqual(False, cfg.get("prepend_cwd")) + self.assertEqual(True, cfg.get("prepend_cwd")) @patch( TORCHX_GET_SCHEDULERS, return_value={"test": TestScheduler()}, ) - def test_apply(self, _) -> None: + def test_apply_default(self, _) -> None: with patch(PATH_CWD, return_value=Path(self.test_dir)): - with patch(PATH_HOME, return_value=Path(self.test_dir) / "home" / "bob"): - cfg = RunConfig() - cfg.set("s", "runtime_value") + cfg = RunConfig() + cfg.set("s", "runtime_value") + + apply(scheduler="test", cfg=cfg) - apply(profile="default", scheduler="test", cfg=cfg) + self.assertEqual("runtime_value", cfg.get("s")) + self.assertEqual(50, cfg.get("i")) + self.assertEqual(1.2, cfg.get("f")) - self.assertEqual("runtime_value", cfg.get("s")) - self.assertEqual(100, cfg.get("i")) - self.assertEqual(1.2, cfg.get("f")) + @patch( + TORCHX_GET_SCHEDULERS, + return_value={"test": TestScheduler()}, + ) + def test_apply_dirs(self, _) -> None: + cfg = RunConfig() + cfg.set("s", "runtime_value") + apply( + scheduler="test", + cfg=cfg, + dirs=[str(Path(self.test_dir) / "home" / "bob"), self.test_dir], + ) + self.assertEqual("runtime_value", cfg.get("s")) + self.assertEqual(100, cfg.get("i")) + self.assertEqual(1.2, cfg.get("f")) def test_dump_invalid_scheduler(self) -> None: with self.assertRaises(ValueError): @@ -215,7 +212,7 @@ def test_dump_only_required(self, _) -> None: cfg = RunConfig() sfile.seek(0) - load(profile="default", scheduler="test", f=sfile, cfg=cfg) + load(scheduler="test", f=sfile, cfg=cfg) self.assertFalse(cfg.cfgs) @@ -226,7 +223,6 @@ def test_dump_only_required(self, _) -> None: def 
test_load_invalid_runopt(self, _) -> None: cfg = RunConfig() load( - profile="default", scheduler="test", f=StringIO(_CONFIG_INVALID), cfg=cfg, @@ -241,7 +237,6 @@ def test_load_invalid_runopt(self, _) -> None: def test_load_no_section(self) -> None: cfg = RunConfig() load( - profile="default", scheduler="local_cwd", f=StringIO(), cfg=cfg, @@ -250,9 +245,8 @@ def test_load_no_section(self) -> None: self.assertFalse(cfg.cfgs) load( - profile="default", scheduler="local_cwd", - f=StringIO("[default.scheduler_args.local_cwd]\n"), + f=StringIO("[scheduler_args.local_cwd]\n"), cfg=cfg, ) # still empty @@ -269,7 +263,7 @@ def test_dump_and_load_all_runopt_types(self, _) -> None: sfile.seek(0) cfg = RunConfig() - load(profile="default", scheduler="test", f=sfile, cfg=cfg) + load(scheduler="test", f=sfile, cfg=cfg) # all runopts in the TestScheduler have defaults, just check against those for opt_name, opt in TestScheduler().run_opts(): @@ -282,11 +276,11 @@ def test_dump_and_load_all_registered_schedulers(self) -> None: sfile = StringIO() dump(sfile) - print(sfile.getvalue()) + for sched_name, sched in get_schedulers(session_name="_").items(): sfile.seek(0) # reset the file pos cfg = RunConfig() - load(profile="default", scheduler=sched_name, f=sfile, cfg=cfg) + load(scheduler=sched_name, f=sfile, cfg=cfg) for opt_name, _ in sched.run_opts(): self.assertTrue(opt_name in cfg.cfgs) From f00df91211ecbddedeff56ce87f39de488df2ea1 Mon Sep 17 00:00:00 2001 From: Tristan Rice Date: Fri, 15 Oct 2021 13:28:26 -0700 Subject: [PATCH 10/14] ci/docpush: add missing dependency (#264) Summary: This adds the missing `pandoc` dependency to the docpush CI step. 
Pull Request resolved: https://github.com/pytorch/torchx/pull/264 Test Plan: Test with docpush manually enabled on the PR https://github.com/pytorch/torchx/pull/264/checks?check_run_id=3908761230 Reviewed By: aivanou Differential Revision: D31692193 Pulled By: d4l3k fbshipit-source-id: 0fcb9b5667ec096d458d4e293c0cd1b34d402f7d --- .github/workflows/doc-build.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/doc-build.yaml b/.github/workflows/doc-build.yaml index 122ae4d4a..b84bb1780 100644 --- a/.github/workflows/doc-build.yaml +++ b/.github/workflows/doc-build.yaml @@ -60,6 +60,10 @@ jobs: set -ex git config --global user.email "runner@github.com" git config --global user.name "TorchX CI Runner" + - name: Install Dependencies + run: | + set -eux + sudo apt-get install -y pandoc - name: Build run: | set -ex From 6236614a4ef826eddec3534212ff150946e857e3 Mon Sep 17 00:00:00 2001 From: Tristan Rice Date: Fri, 15 Oct 2021 13:32:31 -0700 Subject: [PATCH 11/14] ci/slurm: use ec2 instance connect + mssh instead of using SSH keys (#265) Summary: This switches the integration tests to use ec2 instance connect w/ an assumed role instead of embedding the slurm ssh key in GitHub secrets. 
Pull Request resolved: https://github.com/pytorch/torchx/pull/265 Test Plan: ``` $ env SLURM_INSTANCE_MASTER=ubuntu@i-01dd4b95724eb0b4b scripts/slurmint.sh ``` CI Reviewed By: kiukchung, aivanou Differential Revision: D31695261 Pulled By: d4l3k fbshipit-source-id: 48a52e911e68bc9b18ed470a5f7e725ff58697b1 --- .../workflows/slurm-integration-tests.yaml | 26 ++++++++++++++----- scripts/slurmint.sh | 8 +++--- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/.github/workflows/slurm-integration-tests.yaml b/.github/workflows/slurm-integration-tests.yaml index cbdd61dd7..98c79b740 100644 --- a/.github/workflows/slurm-integration-tests.yaml +++ b/.github/workflows/slurm-integration-tests.yaml @@ -9,6 +9,9 @@ on: jobs: slurm: runs-on: ubuntu-18.04 + permissions: + id-token: write + contents: read steps: - name: Setup Python uses: actions/setup-python@v2 @@ -17,21 +20,32 @@ jobs: architecture: x64 - name: Checkout TorchX uses: actions/checkout@v2 + - name: Configure AWS + env: + AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }} + run: | + if [ -n "$AWS_ROLE_ARN" ]; then + export AWS_WEB_IDENTITY_TOKEN_FILE=/tmp/awscreds + export AWS_DEFAULT_REGION=us-west-2 + + echo AWS_WEB_IDENTITY_TOKEN_FILE=$AWS_WEB_IDENTITY_TOKEN_FILE >> $GITHUB_ENV + echo AWS_ROLE_ARN=$AWS_ROLE_ARN >> $GITHUB_ENV + echo AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION >> $GITHUB_ENV + + curl -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" "$ACTIONS_ID_TOKEN_REQUEST_URL" | jq -r '.value' > $AWS_WEB_IDENTITY_TOKEN_FILE + fi - name: Install Dependencies run: set -ex - pip install wheel + pip install wheel ec2instanceconnectcli - name: Run Slurm Integration Tests env: - SLURM_SSH: ${{ secrets.SLURM_SSH }} - SLURM_MASTER: ${{ secrets.SLURM_MASTER }} + SLURM_INSTANCE_MASTER: ${{ secrets.SLURM_INSTANCE_MASTER }} SLURM_KNOWN_HOST: ${{ secrets.SLURM_KNOWN_HOST }} - SLURM_IDENT: id_rsa run: | set -e - echo "$SLURM_SSH" > "$SLURM_IDENT" - chmod 600 "$SLURM_IDENT" + mkdir -p ~/.ssh echo "$SLURM_KNOWN_HOST" 
>> ~/.ssh/known_hosts diff --git a/scripts/slurmint.sh b/scripts/slurmint.sh index 136ff1597..6a9bd68e7 100755 --- a/scripts/slurmint.sh +++ b/scripts/slurmint.sh @@ -14,8 +14,8 @@ python setup.py bdist_wheel WHEEL="$DIST/$(ls $DIST)" -if [[ -z "${SLURM_MASTER}" ]]; then - echo "slurm master is not set, skipping test..." +if [[ -z "${SLURM_INSTANCE_MASTER}" ]]; then + echo "SLURM_INSTANCE_MASTER is not set, skipping test..." exit 0 fi @@ -25,11 +25,11 @@ VENV="$DIR/venv" function run_cmd { # shellcheck disable=SC2048,SC2086 - ssh -o ServerAliveInterval=60 "$SLURM_MASTER" -i "$SLURM_IDENT" $* + mssh -o ServerAliveInterval=60 "$SLURM_INSTANCE_MASTER" -- $* } function run_scp { - scp -i "$SLURM_IDENT" "$1" "$SLURM_MASTER:$2" + rsync -rav -e mssh "$1" "$SLURM_INSTANCE_MASTER:$2" } function cleanup { From af8114704751fea2d09a1b27824fa6de6481fb75 Mon Sep 17 00:00:00 2001 From: Aliaksandr Ivanou Date: Thu, 14 Oct 2021 11:36:21 -0700 Subject: [PATCH 12/14] Add interpret docs to example component, remove `test` arg from cv trainer --- torchx/components/interpret.py | 2 +- .../apps/lightning_classy_vision/component.py | 106 ++++++++++-------- .../apps/lightning_classy_vision/data.py | 2 +- .../apps/lightning_classy_vision/interpret.py | 12 +- .../test/train_test.py | 29 +++++ .../apps/lightning_classy_vision/train.py | 9 +- 6 files changed, 101 insertions(+), 59 deletions(-) create mode 100644 torchx/examples/apps/lightning_classy_vision/test/train_test.py diff --git a/torchx/components/interpret.py b/torchx/components/interpret.py index 6b10c5e29..178da14e9 100644 --- a/torchx/components/interpret.py +++ b/torchx/components/interpret.py @@ -16,6 +16,6 @@ See the :ref:`examples_apps/lightning_classy_vision/interpret:Model Interpretability App Example` and the corresponding -:ref:`Interpret component definition` +:ref:`Interpret component definition` for an example of how to use Captum with TorchX. 
""" diff --git a/torchx/examples/apps/lightning_classy_vision/component.py b/torchx/examples/apps/lightning_classy_vision/component.py index 820777ae9..ba886d47d 100644 --- a/torchx/examples/apps/lightning_classy_vision/component.py +++ b/torchx/examples/apps/lightning_classy_vision/component.py @@ -46,24 +46,24 @@ # # torchx run --scheduler local_cwd \ # ./torchx/examples/apps/lightning_classy_vision/component.py:trainer \ -# --output_path /tmp +# --output_path /tmp/$USER # # Single trainer component code: def trainer( - output_path: str, - image: str = TORCHX_IMAGE, - data_path: Optional[str] = None, - load_path: str = "", - log_path: str = "/tmp/logs", - resource: Optional[str] = None, - env: Optional[Dict[str, str]] = None, - skip_export: bool = False, - epochs: int = 1, - layers: Optional[List[int]] = None, - learning_rate: Optional[float] = None, - num_samples: int = 200, + output_path: str, + image: str = TORCHX_IMAGE, + data_path: Optional[str] = None, + load_path: str = "", + log_path: str = "/tmp/logs", + resource: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + skip_export: bool = False, + epochs: int = 1, + layers: Optional[List[int]] = None, + learning_rate: Optional[float] = None, + num_samples: int = 200, ) -> torchx.AppDef: """Runs the example lightning_classy_vision app. 
@@ -170,19 +170,19 @@ def trainer( def trainer_dist( - output_path: str, - image: str = TORCHX_IMAGE, - data_path: Optional[str] = None, - load_path: str = "", - log_path: str = "/tmp/logs", - resource: Optional[str] = None, - env: Optional[Dict[str, str]] = None, - skip_export: bool = False, - epochs: int = 1, - nnodes: int = 1, - nproc_per_node: int = 1, - rdzv_backend: str = "etcd", - rdzv_endpoint: str = "etcd-server:2379", + output_path: str, + image: str = TORCHX_IMAGE, + data_path: Optional[str] = None, + load_path: str = "", + log_path: str = "/tmp/logs", + resource: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + skip_export: bool = False, + epochs: int = 1, + nnodes: int = 1, + nproc_per_node: int = 1, + rdzv_backend: str = "etcd", + rdzv_endpoint: str = "etcd-server:2379", ) -> torchx.AppDef: """Runs the example lightning_classy_vision app. @@ -258,44 +258,58 @@ def trainer_dist( # %% -# Model Interpretability -# ####################### -# TODO(aivanou): add documentation +# Model Interpret +# ################# +# Defines interpret component +# +# Train a single trainer example: :ref:`examples_apps/lightning_classy_vision/component:Single Trainer Component` +# And use the following cmd to try out: +# +# .. code:: bash +# +# torchx run --scheduler local_cwd \ +# ./torchx/examples/apps/lightning_classy_vision/component.py:interpret \ +# --output_path /tmp/$USER/interpret --load_path /tmp/$USER/last.ckpt def interpret( - image: str, - load_path: str, - data_path: str, - output_path: str, - resource: Optional[str] = None, + load_path: str, + output_path: str, + data_path: Optional[str] = None, + image: str = TORCHX_IMAGE, + resource: Optional[str] = None, ) -> torchx.AppDef: """Runs the model interpretability app on the model outputted by the training component. Args: - image: image to run (e.g.
foobar:latest) load_path: path to load pretrained model from - data_path: path to the data to load output_path: output path for model checkpoints (e.g. file:///foo/bar) + data_path: path to the data to load + image: image to run (e.g. foobar:latest) resource: the resources to use """ + args = [ + "-m", + "torchx.examples.apps.lightning_classy_vision.interpret", + "--load_path", + load_path, + "--output_path", + output_path, + ] + if data_path: + args += [ + "--data_path", + data_path, + ] + return torchx.AppDef( name="cv-interpret", roles=[ torchx.Role( name="worker", entrypoint="python", - args=[ - "-m", - "torchx.examples.apps.lightning_classy_vision.interpret", - "--load_path", - load_path, - "--data_path", - data_path, - "--output_path", - output_path, - ], + args=args, image=image, resource=named_resources[resource] if resource diff --git a/torchx/examples/apps/lightning_classy_vision/data.py b/torchx/examples/apps/lightning_classy_vision/data.py index 56ceb9e6a..68aeced43 100644 --- a/torchx/examples/apps/lightning_classy_vision/data.py +++ b/torchx/examples/apps/lightning_classy_vision/data.py @@ -148,7 +148,7 @@ def download_data(remote_path: str, tmpdir: str) -> str: return data_path -def create_random_data(output_path: str, num_images: int = 5) -> None: +def create_random_data(output_path: str, num_images: int = 250) -> None: """ Fills the given path with randomly generated 64x64 images. This can be used for quick testing of the workflow of the model. 
diff --git a/torchx/examples/apps/lightning_classy_vision/interpret.py b/torchx/examples/apps/lightning_classy_vision/interpret.py index 7f709e2d2..84cd653e4 100755 --- a/torchx/examples/apps/lightning_classy_vision/interpret.py +++ b/torchx/examples/apps/lightning_classy_vision/interpret.py @@ -35,10 +35,10 @@ from torchx.examples.apps.lightning_classy_vision.data import ( TinyImageNetDataModule, download_data, + create_random_data, ) from torchx.examples.apps.lightning_classy_vision.model import TinyImageNetModel - # FIXME: captum must be imported after torch otherwise it causes python to crash if True: import numpy as np @@ -57,8 +57,7 @@ def parse_args(argv: List[str]) -> argparse.Namespace: parser.add_argument( "--data_path", type=str, - help="path to load the training data from", - required=True, + help="path to load the training data from, if not provided, random dataset will be created", ) parser.add_argument( "--output_path", @@ -91,7 +90,12 @@ def main(argv: List[str]) -> None: model.load_from_checkpoint(checkpoint_path=args.load_path) # Download and setup the data module - data_path = download_data(args.data_path, tmpdir) + if not args.data_path: + data_path = os.path.join(tmpdir, "data") + os.makedirs(data_path) + create_random_data(data_path) + else: + data_path = download_data(args.data_path, tmpdir) data = TinyImageNetDataModule( data_dir=data_path, batch_size=1, diff --git a/torchx/examples/apps/lightning_classy_vision/test/train_test.py b/torchx/examples/apps/lightning_classy_vision/test/train_test.py new file mode 100644 index 000000000..ba2e510b3 --- /dev/null +++ b/torchx/examples/apps/lightning_classy_vision/test/train_test.py @@ -0,0 +1,29 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import torch +import torchx.examples.apps.lightning_classy_vision.interpret as interpret + + +class ModelTest(unittest.TestCase): + def test_basic(self) -> None: + model = TinyImageNetModel() + self.assertEqual(len(model.seq), 1) + out = model(torch.zeros((1, 64, 64))) + self.assertIsNotNone(out) + + def test_layer_sizes(self) -> None: + model = TinyImageNetModel( + layer_sizes=[ + 10, + 15, + ], + ) + self.assertEqual(len(model.seq), 5) + out = model(torch.zeros((1, 64, 64))) + self.assertIsNotNone(out) diff --git a/torchx/examples/apps/lightning_classy_vision/train.py b/torchx/examples/apps/lightning_classy_vision/train.py index 57bfade7c..2d2a14d55 100755 --- a/torchx/examples/apps/lightning_classy_vision/train.py +++ b/torchx/examples/apps/lightning_classy_vision/train.py @@ -56,12 +56,7 @@ def parse_args(argv: List[str]) -> argparse.Namespace: parser.add_argument( "--batch_size", type=int, default=32, help="batch size to use for training" ) - parser.add_argument("--num_samples", type=int, default=None, help="num_samples") - parser.add_argument( - "--test", - help="Sets to test mode, training on a much smaller set of randomly generated images", - action="store_true", - ) + parser.add_argument("--num_samples", type=int, default=10, help="num_samples") parser.add_argument( "--data_path", type=str, @@ -122,7 +117,7 @@ def main(argv: List[str]) -> None: data = TinyImageNetDataModule( data_dir=data_path, batch_size=args.batch_size, - num_samples=5 if args.test else args.num_samples, + num_samples=args.num_samples, ) # Setup model checkpointing From c18b7d68244adc0b7a91ea85efa0b31fcc858884 Mon Sep 17 00:00:00 2001 From: Aliaksandr Ivanou Date: Thu, 14 Oct 2021 11:40:34 -0700 Subject: [PATCH 13/14] Resolve lint errors --- .../apps/lightning_classy_vision/component.py | 60 +++++++++---------- .../test/train_test.py | 29 --------- 2 files changed, 30 insertions(+), 59 deletions(-) delete mode 100644 
torchx/examples/apps/lightning_classy_vision/test/train_test.py diff --git a/torchx/examples/apps/lightning_classy_vision/component.py b/torchx/examples/apps/lightning_classy_vision/component.py index ba886d47d..4e069e066 100644 --- a/torchx/examples/apps/lightning_classy_vision/component.py +++ b/torchx/examples/apps/lightning_classy_vision/component.py @@ -52,18 +52,18 @@ def trainer( - output_path: str, - image: str = TORCHX_IMAGE, - data_path: Optional[str] = None, - load_path: str = "", - log_path: str = "/tmp/logs", - resource: Optional[str] = None, - env: Optional[Dict[str, str]] = None, - skip_export: bool = False, - epochs: int = 1, - layers: Optional[List[int]] = None, - learning_rate: Optional[float] = None, - num_samples: int = 200, + output_path: str, + image: str = TORCHX_IMAGE, + data_path: Optional[str] = None, + load_path: str = "", + log_path: str = "/tmp/logs", + resource: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + skip_export: bool = False, + epochs: int = 1, + layers: Optional[List[int]] = None, + learning_rate: Optional[float] = None, + num_samples: int = 200, ) -> torchx.AppDef: """Runs the example lightning_classy_vision app. 
@@ -170,19 +170,19 @@ def trainer( def trainer_dist( - output_path: str, - image: str = TORCHX_IMAGE, - data_path: Optional[str] = None, - load_path: str = "", - log_path: str = "/tmp/logs", - resource: Optional[str] = None, - env: Optional[Dict[str, str]] = None, - skip_export: bool = False, - epochs: int = 1, - nnodes: int = 1, - nproc_per_node: int = 1, - rdzv_backend: str = "etcd", - rdzv_endpoint: str = "etcd-server:2379", + output_path: str, + image: str = TORCHX_IMAGE, + data_path: Optional[str] = None, + load_path: str = "", + log_path: str = "/tmp/logs", + resource: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + skip_export: bool = False, + epochs: int = 1, + nnodes: int = 1, + nproc_per_node: int = 1, + rdzv_backend: str = "etcd", + rdzv_endpoint: str = "etcd-server:2379", ) -> torchx.AppDef: """Runs the example lightning_classy_vision app. @@ -273,11 +273,11 @@ def trainer_dist( def interpret( - load_path: str, - output_path: str, - data_path: Optional[str] = None, - image: str = TORCHX_IMAGE, - resource: Optional[str] = None, + load_path: str, + output_path: str, + data_path: Optional[str] = None, + image: str = TORCHX_IMAGE, + resource: Optional[str] = None, ) -> torchx.AppDef: """Runs the model interpretability app on the model outputted by the training component. diff --git a/torchx/examples/apps/lightning_classy_vision/test/train_test.py b/torchx/examples/apps/lightning_classy_vision/test/train_test.py deleted file mode 100644 index ba2e510b3..000000000 --- a/torchx/examples/apps/lightning_classy_vision/test/train_test.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -import unittest - -import torch -import torchx.examples.apps.lightning_classy_vision.interpret as interpret - - -class ModelTest(unittest.TestCase): - def test_basic(self) -> None: - model = TinyImageNetModel() - self.assertEqual(len(model.seq), 1) - out = model(torch.zeros((1, 64, 64))) - self.assertIsNotNone(out) - - def test_layer_sizes(self) -> None: - model = TinyImageNetModel( - layer_sizes=[ - 10, - 15, - ], - ) - self.assertEqual(len(model.seq), 5) - out = model(torch.zeros((1, 64, 64))) - self.assertIsNotNone(out) From 794e0c93e7d73f28865f78e0f9522debc665d06f Mon Sep 17 00:00:00 2001 From: Aliaksandr Ivanou Date: Thu, 14 Oct 2021 18:40:55 -0700 Subject: [PATCH 14/14] Addressed comments --- torchx/components/interpret.py | 2 +- torchx/examples/apps/lightning_classy_vision/component.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/torchx/components/interpret.py b/torchx/components/interpret.py index 178da14e9..030185457 100644 --- a/torchx/components/interpret.py +++ b/torchx/components/interpret.py @@ -16,6 +16,6 @@ See the :ref:`examples_apps/lightning_classy_vision/interpret:Model Interpretability App Example` and the corresponding -:ref:`Interpret component definition` +:ref:`Interpret component definition` for an example of how to use Captum with TorchX. """ diff --git a/torchx/examples/apps/lightning_classy_vision/component.py b/torchx/examples/apps/lightning_classy_vision/component.py index 4e069e066..300f63e10 100644 --- a/torchx/examples/apps/lightning_classy_vision/component.py +++ b/torchx/examples/apps/lightning_classy_vision/component.py @@ -258,9 +258,9 @@ def trainer_dist( # %% -# Model Interpret -# ################# -# Defines interpret component +# Interpreting the Model +# ####################### +# Defines a component that interprets the model # # Train a single trainer example: :ref:`examples_apps/lightning_classy_vision/component:Single Trainer Component` # And use the following cmd to try out: