From 3c2cdcc1129afb6081514a4b9297ae3072e96512 Mon Sep 17 00:00:00 2001 From: Kiuk Chung Date: Fri, 15 Oct 2021 12:33:39 -0700 Subject: [PATCH] (torchx/docs) add deprecation docstring to torch_dist_role and create_torch_dist_role, added docs for configfile under experimental, fixed a few docstring errors, rearranged component toctree to be grouped more logically Summary: 1. add deprecation docstring to torch_dist_role and create_torch_dist_role - we cannot add a `warnings.warn(DeprecationWarning)` because unfortunately `dist.ddp` uses `torch_dist_role` (hence adding it via `warnings` will print that warning each time `dist.ddp` is loaded). 2. added docs for configfile under experimental 3. fixed a few docstring errors 4. rearranged component toctree to be grouped more logically Differential Revision: D31697032 fbshipit-source-id: 9e7d8f0a34525ef80d63eb6c2289c82929832e37 --- docs/Makefile | 9 +- docs/source/beta.rst | 2 - docs/source/experimental/runner.config.rst | 12 +++ docs/source/index.rst | 9 +- torchx/components/__init__.py | 97 ++++++++++++++----- torchx/components/base/__init__.py | 10 +- torchx/components/base/roles.py | 6 ++ torchx/runner/api.py | 8 +- torchx/runner/config.py | 103 +++++++++++++++++++-- torchx/schedulers/api.py | 2 +- torchx/schedulers/local_scheduler.py | 4 +- 11 files changed, 211 insertions(+), 51 deletions(-) delete mode 100644 docs/source/beta.rst create mode 100644 docs/source/experimental/runner.config.rst diff --git a/docs/Makefile b/docs/Makefile index 606129977..863dd103c 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -5,7 +5,7 @@ # # You can set these variables from the command line. 
-SPHINXOPTS = -W +SPHINXOPTS = SPHINXBUILD = sphinx-build SPHINXPROJ = torchx SOURCEDIR = source @@ -21,7 +21,7 @@ clean: rm -rf "$(BUILDDIR)" rm -rf "$(SOURCEDIR)/examples_apps" "$(SOURCEDIR)/examples_pipelines" -.PHONY: help Makefile clean livehtml papermill +.PHONY: help Makefile clean # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). @@ -29,8 +29,3 @@ clean: @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)/$(VERSION)" $(SPHINXOPTS) $(O) # optional live version -livehtml: - sphinx-autobuild --watch ../torchx --watch ../examples --re-ignore ".*(examples_.*|.new|examples/(Dockerfile|.*.py))" "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -papermill: html - ./papermill.sh diff --git a/docs/source/beta.rst b/docs/source/beta.rst deleted file mode 100644 index 963bf6ff2..000000000 --- a/docs/source/beta.rst +++ /dev/null @@ -1,2 +0,0 @@ -Coming Soon -============= \ No newline at end of file diff --git a/docs/source/experimental/runner.config.rst b/docs/source/experimental/runner.config.rst new file mode 100644 index 000000000..76671b821 --- /dev/null +++ b/docs/source/experimental/runner.config.rst @@ -0,0 +1,12 @@ +(beta) .torchxconfig file +----------------------------- + +.. automodule:: torchx.runner.config +.. currentmodule:: torchx.runner.config + +Config API Functions +~~~~~~~~~~~~~~~~~~~~~~ + +.. autofunction:: apply +.. autofunction:: load +.. autofunction:: dump diff --git a/docs/source/index.rst b/docs/source/index.rst index 1bc60d14b..3eeac107e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -81,12 +81,11 @@ Components Library components/overview components/train - components/serve + components/distributed components/interpret components/metrics components/hpo - components/base - components/distributed + components/serve components/utils Runtime Library @@ -123,9 +122,9 @@ Experimental --------------- .. 
toctree:: :maxdepth: 1 - :caption: Beta Features + :caption: Experimental Features - beta + experimental/runner.config diff --git a/torchx/components/__init__.py b/torchx/components/__init__.py index 04b3a6a33..5269778cd 100644 --- a/torchx/components/__init__.py +++ b/torchx/components/__init__.py @@ -47,18 +47,21 @@ authoring your own component is as simple as writing a python function with the following rules: -1. The component function must return an ``specs.AppDef`` and the return type must be annotated -2. All arguments of the component must be type annotated and the type must be one of +1. The component function must return an ``specs.AppDef`` and the return type must be specified +2. All arguments of the component must be PEP 484 type annotated and the type must be one of #. Primitives: ``int``, ``float``, ``str``, ``bool`` #. Optional primitives: ``Optional[int]``, ``Optional[float]``, ``Optional[str]`` #. Maps of primitives: ``Dict[Primitive_key, Primitive_value]`` #. Lists of primitives: ``List[Primitive_values]`` #. Optional collections: ``Optional[List]``, ``Optional[Dict]`` #. VAR_ARG: ``*arg`` (useful when passing through arguments to the entrypoint script) -3. The function should have well defined docstring in - `google format `_. - This docstring is used by the torchx cli to autogenerate a ``--help`` string which is useful - when sharing components with others. +3. (optional) A docstring in `google format `_ + (in particular see ``function_with_pep484_type_annotations``). This docstring is purely informative + in that torchx cli uses it to autogenerate an informative ``--help`` message, which is + useful when sharing components with others. If the component does not have a docstring + the ``--help`` option will still work, but the parameters will have a canned description (see below). + Note that when running components programmatically via :py:mod:`torchx.runner`, the docstring + is not picked up by torchx at all. 
Below is an example component that launches DDP scripts, it is a simplified version of the :py:func:`torchx.components.dist.ddp` builtin. @@ -76,21 +79,6 @@ def ddp( nnodes: int = 1, nproc_per_node: int = 1, ) -> specs.AppDef: - \""" - DDP simplified. - - Args: - image: name of the docker image containing the script + deps - script: path of the script in the image - script_args: arguments to the script - host: machine type (one from named resources) - nnodes: number of nodes to launch - nproc_per_node: number of scripts to launch per node - - Returns: - specs.AppDef: ddp AppDef - \""" - return specs.AppDef( name=os.path.basename(script), roles=[ @@ -115,6 +103,73 @@ def ddp( ] ) +Assuming the component above is saved in ``example.py``, we can run ``--help`` +on it as: + +.. code-block:: shell-session + + $ torchx ./example.py:ddp --help + usage: torchx run ...torchx_params... ddp [-h] --image IMAGE --script SCRIPT [--host HOST] + [--nnodes NNODES] [--nproc_per_node NPROC_PER_NODE] + ... + + AppDef: ddp. TIP: improve this help string by adding a docstring ...... + + positional arguments: + script_args (required) + + optional arguments: + -h, --help show this help message and exit + --image IMAGE (required) + --script SCRIPT (required) + --host HOST (default: aws_p3.2xlarge) + --nnodes NNODES (default: 1) + --nproc_per_node NPROC_PER_NODE + (default: 1) + +If we include a docstring as such: + +.. code-block:: python + + def ddp(...) -> specs.AppDef: + \""" + DDP Simplified. + + Args: + image: name of the docker image containing the script + deps + script: path of the script in the image + script_args: arguments to the script + host: machine type (one from named resources) + nnodes: number of nodes to launch + nproc_per_node: number of scripts to launch per node + + \""" + + # ... component body same as above ... 
+ pass + +Then the ``--help`` message would reflect the function and parameter descriptions +in the docstring as such: + +:: + + usage: torchx run ...torchx_params... ddp [-h] --image IMAGE --script SCRIPT [--host HOST] + [--nnodes NNODES] [--nproc_per_node NPROC_PER_NODE] + ... + + App spec: DDP simplified. + + positional arguments: + script_args arguments to the script + + optional arguments: + -h, --help show this help message and exit + --image IMAGE name of the docker image containing the script + deps + --script SCRIPT path of the script in the image + --host HOST machine type (one from named resources) + --nnodes NNODES number of nodes to launch + --nproc_per_node NPROC_PER_NODE + number of scripts to launch per node Validating diff --git a/torchx/components/base/__init__.py b/torchx/components/base/__init__.py index 94e3e107d..ca681f0bf 100644 --- a/torchx/components/base/__init__.py +++ b/torchx/components/base/__init__.py @@ -40,6 +40,12 @@ def torch_dist_role( **launch_kwargs: Any, ) -> Role: """ + .. warning:: This method is deprecated and will be removed in future versions. + Instead use :py:func:`torchx.components.dist.ddp` as a builtin, + or prefer to use `torch.distributed.run `_ + directly by setting your AppDef's ``entrypoint = python`` and + ``args = ["-m", "torch.distributed.run", ...]``. + A ``Role`` for which the user provided ``entrypoint`` is executed with the torchelastic agent (in the container). Note that the torchelastic agent invokes multiple copies of ``entrypoint``. @@ -54,8 +60,8 @@ def torch_dist_role( :: - # nproc_per_node correspond to the ``torch.distributed.launch`` arguments. More - # info about available arguments: https://pytorch.org/docs/stable/distributed.html#launch-utility + # nproc_per_node correspond to the ``torch.distributed.run`` arguments. 
More + # info about available arguments: https://pytorch.org/docs/stable/elastic/run.html trainer = torch_dist_role("trainer",container, entrypoint="trainer.py",.., nproc_per_node=4) diff --git a/torchx/components/base/roles.py b/torchx/components/base/roles.py index 36c48f3f1..8923f0d80 100644 --- a/torchx/components/base/roles.py +++ b/torchx/components/base/roles.py @@ -28,6 +28,12 @@ def create_torch_dist_role( **launch_kwargs: Any, ) -> Role: """ + .. warning:: This method is deprecated and will be removed in future versions. + Instead use :py:func:`torchx.components.dist.ddp` as a builtin, + or prefer to use `torch.distributed.run `_ + directly by setting your AppDef's ``entrypoint = python`` and + ``args = ["-m", "torch.distributed.run", ...]``. + A ``Role`` for which the user provided ``entrypoint`` is executed with the torchelastic agent (in the container). Note that the torchelastic agent invokes multiple copies of ``entrypoint``. diff --git a/torchx/runner/api.py b/torchx/runner/api.py index 8e196a048..4bca8b960 100644 --- a/torchx/runner/api.py +++ b/torchx/runner/api.py @@ -145,8 +145,8 @@ def run_component( if it dryrun specified. Raises: - `ComponentValidationException`: if component is invalid. - `ComponentNotFoundException`: if the ``component_path`` is failed to resolve. + ComponentValidationException: if component is invalid. + ComponentNotFoundException: if the ``component_path`` is failed to resolve. """ component_def = get_component(component_name) app = from_function(component_def.fn, app_args) @@ -505,9 +505,9 @@ def _scheduler_app_id( is the same as this session. 
Raises: - ValueError - if ``check_session=True`` and the session in the app handle + ValueError: if ``check_session=True`` and the session in the app handle does not match this session's name - KeyError - if no such scheduler backend exists + KeyError: if no such scheduler backend exists """ scheduler_backend, _, app_id = parse_app_handle(app_handle) diff --git a/torchx/runner/config.py b/torchx/runner/config.py index b02a2af63..e192f174c 100644 --- a/torchx/runner/config.py +++ b/torchx/runner/config.py @@ -5,6 +5,95 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +""" +You can store the scheduler :py:class:`torchx.specs.RunConfig` for your project +by storing them in the ``.torchxconfig`` file. Currently this file is only read +and honored when running the component from the CLI. + +CLI Usage +~~~~~~~~~~~ + +#. ``cd`` into the directory where you want the ``.torchxconfig`` file to be dropped. + The CLI only picks up ``.torchxconfig`` files from the current-working-directory (CWD) + so choose a directory where you typically run ``torchx`` from. Typically this + is the root of your project directory. + +#. Generate the config file by running + + .. code-block:: shell-session + + $ torchx configure -s + + # -- or for all registered schedulers -- + $ torchx configure + +#. If you specified ``-s local_cwd,kubernetes``, you should see a ``.torchxconfig`` + file as shown below: + + .. code-block:: shell-session + + $ cat .torchxconfig + [local_cwd] + + [kubernetes] + queue = #FIXME:(str) Volcano queue to schedule job in + +#. ``.torchxconfig`` is in INI format and the section names map to the scheduler names. + Each section contains the run configs for the scheduler as ``$key = $value`` pairs. + You may find that certain schedulers have empty sections, this means that + the scheduler defines sensible defaults for all its run configs hence no run configs + are required at runtime. 
If you'd like to override the default you can add them. + **TIP:** To see all the run options for a scheduler use ``torchx runopts ``. + +#. The sections with ``FIXME`` placeholders are run configs that are required + by the scheduler. Replace these with the values that apply to you. + +#. **IMPORTANT:** If you are happy with the scheduler provided defaults for a particular + run config, you **should not** redundantly specify them in ``.torchxconfig`` with the + same default value. This is because the scheduler may decide to change the default + value at a later date which would leave you with a stale default. + +#. Now you can run your component without having to specify the scheduler run configs + each time. Just make sure the directory you are running ``torchx`` cli from actually + has ``.torchxconfig``! + + .. code-block:: shell-session + + $ ls .torchxconfig + .torchxconfig + + $ torchx run -s local_cwd ./my_component.py:train + +Programmatic Usage +~~~~~~~~~~~~~~~~~~~ + +Unlike the cli, ``.torchxconfig`` file **is not** picked up automatically +from ``CWD`` if you are programmatically running your component with :py:class:`torchx.runner.Runner`. +You'll have to manually specify the directory containing ``.torchxconfig``. + +Below is an example + +.. doctest:: [runner_config_example] + + from torchx.runner import get_runner + from torchx.runner.config import apply + import torchx.specs as specs + + def my_component(a: int) -> specs.AppDef: + # <... component body omitted for brevity ...> + pass + + scheduler = "local_cwd" + cfg = specs.RunConfig() + cfg.set("log_dir", "/these/take/outmost/precedence") + + apply(scheduler, cfg, dirs=["/home/bob"]) # looks for /home/bob/.torchxconfig + get_runner().run(my_component(1), scheduler, cfg) + +You may also specify multiple directories (in preceding order) which is useful when +you want to keep personal config overrides on top of a project defined default. 
+ +""" import configparser as configparser import logging from pathlib import Path @@ -66,7 +155,7 @@ def dump( To only dump required runopts pass ``required_only=True``. Each scheduler's runopts are written in the section called - ``[default.{scheduler_name}.cfg]``. + ``[{scheduler_name}]``. For example: @@ -77,7 +166,7 @@ def dump( queue = #FIXME (str)Volcano queue to schedule job in Raises: - ``ValueError`` - if given a scheduler name that is not known + ValueError: if given a scheduler name that is not known """ if schedulers: @@ -128,7 +217,7 @@ def apply(scheduler: str, cfg: RunConfig, dirs: Optional[List[str]] = None) -> N over the ones in the config file and only new configs are added. The same holds true for the configs loaded in list order. - For instance if ``cfg = {"foo": "bar"}`` and the config file is: + For instance if ``cfg={"foo":"bar"}`` and the config file is: :: @@ -137,12 +226,12 @@ def apply(scheduler: str, cfg: RunConfig, dirs: Optional[List[str]] = None) -> N foo = baz hello = world - # dir_2/.torchxconfig - [local_cwd] - hello = bob + # dir_2/.torchxconfig + [local_cwd] + hello = bob - Then after the method call, ``cfg = {"foo": "bar", "hello": "world"}``. + Then after the method call, ``cfg={"foo":"bar","hello":"world"}``. 
""" if not dirs: diff --git a/torchx/schedulers/api.py b/torchx/schedulers/api.py index c09a921a6..c6756ff3d 100644 --- a/torchx/schedulers/api.py +++ b/torchx/schedulers/api.py @@ -248,7 +248,7 @@ def log_iter( An ``Iterator`` over log lines of the specified role replica Raises: - NotImplementedError - if the scheduler does not support log iteration + NotImplementedError: if the scheduler does not support log iteration """ raise NotImplementedError( f"{self.__class__.__qualname__} does not support application log iteration" diff --git a/torchx/schedulers/local_scheduler.py b/torchx/schedulers/local_scheduler.py index 0f474d94c..100492e25 100644 --- a/torchx/schedulers/local_scheduler.py +++ b/torchx/schedulers/local_scheduler.py @@ -161,8 +161,8 @@ def __init__(self, cfg: RunConfig) -> None: def fetch(self, image: str) -> str: """ Raises: - ValueError - if the image name is not an absolute dir - and if it does not exist or is not a directory + ValueError: if the image name is not an absolute dir and if it + does not exist or is not a directory """ if not os.path.isdir(image):