9 changes: 2 additions & 7 deletions docs/Makefile
@@ -5,7 +5,7 @@
#

# You can set these variables from the command line.
SPHINXOPTS = -W
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = torchx
SOURCEDIR = source
@@ -21,16 +21,11 @@ clean:
rm -rf "$(BUILDDIR)"
rm -rf "$(SOURCEDIR)/examples_apps" "$(SOURCEDIR)/examples_pipelines"

.PHONY: help Makefile clean livehtml papermill
.PHONY: help Makefile clean

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)/$(VERSION)" $(SPHINXOPTS) $(O)

# optional live version
livehtml:
sphinx-autobuild --watch ../torchx --watch ../examples --re-ignore ".*(examples_.*|.new|examples/(Dockerfile|.*.py))" "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

papermill: html
./papermill.sh
2 changes: 0 additions & 2 deletions docs/source/beta.rst

This file was deleted.

12 changes: 12 additions & 0 deletions docs/source/experimental/runner.config.rst
@@ -0,0 +1,12 @@
(beta) .torchxconfig file
-----------------------------

.. automodule:: torchx.runner.config
.. currentmodule:: torchx.runner.config

Config API Functions
~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: apply
.. autofunction:: load
.. autofunction:: dump
9 changes: 4 additions & 5 deletions docs/source/index.rst
Expand Up @@ -81,12 +81,11 @@ Components Library

components/overview
components/train
components/serve
components/distributed
components/interpret
components/metrics
components/hpo
components/base
components/distributed
components/serve
components/utils

Runtime Library
@@ -123,9 +122,9 @@
---------------
.. toctree::
:maxdepth: 1
:caption: Beta Features
:caption: Experimental Features

beta
experimental/runner.config



97 changes: 76 additions & 21 deletions torchx/components/__init__.py
@@ -47,18 +47,21 @@
authoring your own component is as simple as writing a python function with the following
rules:

1. The component function must return an ``specs.AppDef`` and the return type must be annotated
2. All arguments of the component must be type annotated and the type must be one of
1. The component function must return a ``specs.AppDef`` and the return type must be specified
2. All arguments of the component must be PEP 484 type annotated and the type must be one of
#. Primitives: ``int``, ``float``, ``str``, ``bool``
#. Optional primitives: ``Optional[int]``, ``Optional[float]``, ``Optional[str]``
#. Maps of primitives: ``Dict[Primitive_key, Primitive_value]``
#. Lists of primitives: ``List[Primitive_values]``
#. Optional collections: ``Optional[List]``, ``Optional[Dict]``
#. VAR_ARG: ``*arg`` (useful when passing through arguments to the entrypoint script)
3. The function should have well defined docstring in
`google format <https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html>`_.
This docstring is used by the torchx cli to autogenerate a ``--help`` string which is useful
when sharing components with others.
3. (optional) A docstring in `google format <https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html>`_
(in particular see ``function_with_pep484_type_annotations``). This docstring is purely informational:
the torchx cli uses it to autogenerate a helpful ``--help`` message, which is
useful when sharing components with others. If the component does not have a docstring
the ``--help`` option will still work, but the parameters will have a canned description (see below).
Note that when running components programmatically via :py:mod:`torchx.runner`, the docstring
is not picked up by torchx at all.

Below is an example component that launches DDP scripts; it is a simplified version of
the :py:func:`torchx.components.dist.ddp` builtin.
@@ -76,21 +79,6 @@ def ddp(
nnodes: int = 1,
nproc_per_node: int = 1,
) -> specs.AppDef:
\"""
DDP simplified.

Args:
image: name of the docker image containing the script + deps
script: path of the script in the image
script_args: arguments to the script
host: machine type (one from named resources)
nnodes: number of nodes to launch
nproc_per_node: number of scripts to launch per node

Returns:
specs.AppDef: ddp AppDef
\"""

return specs.AppDef(
name=os.path.basename(script),
roles=[
@@ -115,6 +103,73 @@ def ddp(
]
)

Assuming the component above is saved in ``example.py``, we can run ``--help``
on it as:

.. code-block:: shell-session

$ torchx ./example.py:ddp --help
usage: torchx run ...torchx_params... ddp [-h] --image IMAGE --script SCRIPT [--host HOST]
[--nnodes NNODES] [--nproc_per_node NPROC_PER_NODE]
...

AppDef: ddp. TIP: improve this help string by adding a docstring ...<omitted for brevity>...

positional arguments:
script_args (required)

optional arguments:
-h, --help show this help message and exit
--image IMAGE (required)
--script SCRIPT (required)
--host HOST (default: aws_p3.2xlarge)
--nnodes NNODES (default: 1)
--nproc_per_node NPROC_PER_NODE
(default: 1)

If we include a docstring as follows:

.. code-block:: python

def ddp(...) -> specs.AppDef:
\"""
DDP simplified.

Args:
image: name of the docker image containing the script + deps
script: path of the script in the image
script_args: arguments to the script
host: machine type (one from named resources)
nnodes: number of nodes to launch
nproc_per_node: number of scripts to launch per node

\"""

# ... component body same as above ...
pass

Then the ``--help`` message would reflect the function and parameter descriptions
from the docstring:

::

usage: torchx run ...torchx_params... ddp [-h] --image IMAGE --script SCRIPT [--host HOST]
[--nnodes NNODES] [--nproc_per_node NPROC_PER_NODE]
...

App spec: DDP simplified.

positional arguments:
script_args arguments to the script

optional arguments:
-h, --help show this help message and exit
--image IMAGE name of the docker image containing the script + deps
--script SCRIPT path of the script in the image
--host HOST machine type (one from named resources)
--nnodes NNODES number of nodes to launch
--nproc_per_node NPROC_PER_NODE
number of scripts to launch per node


Validating
10 changes: 8 additions & 2 deletions torchx/components/base/__init__.py
@@ -40,6 +40,12 @@ def torch_dist_role(
**launch_kwargs: Any,
) -> Role:
"""
.. warning:: This method is deprecated and will be removed in future versions.
Instead use :py:func:`torchx.components.dist.ddp` as a builtin,
or prefer to use `torch.distributed.run <https://pytorch.org/docs/stable/elastic/run.html>`_
directly by setting your AppDef's ``entrypoint = python`` and
``args = ["-m", "torch.distributed.run", ...]``.

A ``Role`` for which the user provided ``entrypoint`` is executed with the
torchelastic agent (in the container). Note that the torchelastic agent
invokes multiple copies of ``entrypoint``.
@@ -54,8 +60,8 @@

::

# nproc_per_node correspond to the ``torch.distributed.launch`` arguments. More
# info about available arguments: https://pytorch.org/docs/stable/distributed.html#launch-utility
# nproc_per_node corresponds to the ``torch.distributed.run`` arguments. More
# info about available arguments: https://pytorch.org/docs/stable/elastic/run.html
trainer = torch_dist_role("trainer",container, entrypoint="trainer.py",.., nproc_per_node=4)
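The deprecation note above recommends invoking ``torch.distributed.run`` directly. A minimal sketch of composing the ``entrypoint``/``args`` pair for such a role (``torch_run_args`` is an illustrative helper, not part of the torchx API):

```python
def torch_run_args(script, nnodes=1, nproc_per_node=1, *script_args):
    # Compose the entrypoint and argv for an AppDef role that invokes
    # torch.distributed.run directly, as the deprecation warning suggests.
    entrypoint = "python"
    args = [
        "-m", "torch.distributed.run",
        "--nnodes", str(nnodes),
        "--nproc_per_node", str(nproc_per_node),
        script,
        *script_args,
    ]
    return entrypoint, args
```

The returned pair would be assigned to the role's ``entrypoint`` and ``args`` fields on the ``AppDef``.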


6 changes: 6 additions & 0 deletions torchx/components/base/roles.py
@@ -28,6 +28,12 @@ def create_torch_dist_role(
**launch_kwargs: Any,
) -> Role:
"""
.. warning:: This method is deprecated and will be removed in future versions.
Instead use :py:func:`torchx.components.dist.ddp` as a builtin,
or prefer to use `torch.distributed.run <https://pytorch.org/docs/stable/elastic/run.html>`_
directly by setting your AppDef's ``entrypoint = python`` and
``args = ["-m", "torch.distributed.run", ...]``.

A ``Role`` for which the user provided ``entrypoint`` is executed with the
torchelastic agent (in the container). Note that the torchelastic agent
invokes multiple copies of ``entrypoint``.
8 changes: 4 additions & 4 deletions torchx/runner/api.py
@@ -145,8 +145,8 @@ def run_component(
if dryrun is specified.

Raises:
`ComponentValidationException`: if component is invalid.
`ComponentNotFoundException`: if the ``component_path`` is failed to resolve.
ComponentValidationException: if component is invalid.
ComponentNotFoundException: if the ``component_path`` fails to resolve.
"""
component_def = get_component(component_name)
app = from_function(component_def.fn, app_args)
@@ -505,9 +505,9 @@ def _scheduler_app_id(
is the same as this session.

Raises:
ValueError - if ``check_session=True`` and the session in the app handle
ValueError: if ``check_session=True`` and the session in the app handle
does not match this session's name
KeyError - if no such scheduler backend exists
KeyError: if no such scheduler backend exists
"""

scheduler_backend, _, app_id = parse_app_handle(app_handle)
103 changes: 96 additions & 7 deletions torchx/runner/config.py
@@ -5,6 +5,95 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
You can store the scheduler :py:class:`torchx.specs.RunConfig` for your project
in a ``.torchxconfig`` file. Currently this file is only read
and honored when running the component from the CLI.

CLI Usage
~~~~~~~~~~~

#. ``cd`` into the directory where you want the ``.torchxconfig`` file to be dropped.
The CLI only picks up ``.torchxconfig`` files from the current working directory (CWD),
so choose a directory where you typically run ``torchx`` from. This is usually
the root of your project directory.

#. Generate the config file by running

.. code-block:: shell-session

$ torchx configure -s <comma,delimited,scheduler,names>

# -- or for all registered schedulers --
$ torchx configure

#. If you specified ``-s local_cwd,kubernetes``, you should see a ``.torchxconfig``
file as shown below:

.. code-block:: shell-session

$ cat .torchxconfig
[local_cwd]

[kubernetes]
queue = #FIXME:(str) Volcano queue to schedule job in

#. ``.torchxconfig`` is in INI format and the section names map to the scheduler names.
Each section contains the run configs for the scheduler as ``$key = $value`` pairs.
You may find that certain schedulers have empty sections; this means that
the scheduler defines sensible defaults for all its run configs, hence no run configs
are required at runtime. If you'd like to override a default, you can add it here.
**TIP:** To see all the run options for a scheduler use ``torchx runopts <scheduler_name>``.

#. The entries with ``FIXME`` placeholders are run configs that are required
by the scheduler. Replace these with the values that apply to you.

#. **IMPORTANT:** If you are happy with the scheduler-provided default for a particular
run config, you **should not** redundantly specify it in ``.torchxconfig`` with the
same default value. This is because the scheduler may change the default
value at a later date, which would leave you with a stale default.

#. Now you can run your component without having to specify the scheduler run configs
each time. Just make sure the directory you are running the ``torchx`` CLI from actually
has a ``.torchxconfig``!

.. code-block:: shell-session

$ ls .torchxconfig
.torchxconfig

$ torchx run -s local_cwd ./my_component.py:train
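Since ``.torchxconfig`` is plain INI, its per-scheduler sections can be read with the standard library. A minimal sketch, assuming the file layout shown above (``load_scheduler_cfg`` is a hypothetical helper, not the torchx API):

```python
import configparser

def load_scheduler_cfg(ini_text: str, scheduler: str) -> dict:
    # One INI section per scheduler; an empty (or missing) section means
    # the scheduler's own defaults are sufficient.
    parser = configparser.ConfigParser()
    parser.read_string(ini_text)
    return dict(parser[scheduler]) if parser.has_section(scheduler) else {}

sample = """
[local_cwd]

[kubernetes]
queue = default
"""
print(load_scheduler_cfg(sample, "kubernetes"))  # {'queue': 'default'}
```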

Programmatic Usage
~~~~~~~~~~~~~~~~~~~

Unlike the CLI, the ``.torchxconfig`` file **is not** picked up automatically
from the CWD if you are programmatically running your component with :py:class:`torchx.runner.Runner`.
You'll have to manually specify the directory containing ``.torchxconfig``.

Below is an example

.. doctest:: [runner_config_example]

from torchx.runner import get_runner
from torchx.runner.config import apply
import torchx.specs as specs

def my_component(a: int) -> specs.AppDef:
# <... component body omitted for brevity ...>
pass

scheduler = "local_cwd"
cfg = specs.RunConfig()
cfg.set("log_dir", "/these/take/outmost/precedence")

apply(scheduler, cfg, dirs=["/home/bob"]) # looks for /home/bob/.torchxconfig
get_runner().run(my_component(1), scheduler, cfg)

You may also specify multiple directories (in order of precedence), which is useful when
you want to keep personal config overrides on top of a project-defined default.

"""
import configparser as configparser
import logging
from pathlib import Path
@@ -66,7 +155,7 @@ def dump(
To only dump required runopts pass ``required_only=True``.

Each scheduler's runopts are written in the section called
``[default.{scheduler_name}.cfg]``.
``[{scheduler_name}]``.

For example:

@@ -77,7 +166,7 @@
queue = #FIXME (str)Volcano queue to schedule job in

Raises:
``ValueError`` - if given a scheduler name that is not known
ValueError: if given a scheduler name that is not known
"""

if schedulers:
@@ -128,7 +217,7 @@ def apply(scheduler: str, cfg: RunConfig, dirs: Optional[List[str]] = None) -> N
over the ones in the config file and only new configs are added. The same holds
true for the configs loaded in list order.

For instance if ``cfg = {"foo": "bar"}`` and the config file is:
For instance if ``cfg={"foo":"bar"}`` and the config file is:

::

Expand All @@ -137,12 +226,12 @@ def apply(scheduler: str, cfg: RunConfig, dirs: Optional[List[str]] = None) -> N
foo = baz
hello = world

# dir_2/.torchxconfig
[local_cwd]
hello = bob
# dir_2/.torchxconfig
[local_cwd]
hello = bob


Then after the method call, ``cfg = {"foo": "bar", "hello": "world"}``.
Then after the method call, ``cfg={"foo":"bar","hello":"world"}``.
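The precedence rule described above can be sketched as a merge in which keys already present always win (``merge_with_precedence`` is an illustrative model, not the actual implementation):

```python
def merge_with_precedence(cfg, *file_cfgs):
    # Keys already present in cfg take precedence; among config files,
    # earlier ones win over later ones.
    merged = dict(cfg)
    for file_cfg in file_cfgs:
        for key, value in file_cfg.items():
            merged.setdefault(key, value)
    return merged

# Mirrors the example above: dir_1 first, then dir_2.
print(merge_with_precedence(
    {"foo": "bar"},
    {"foo": "baz", "hello": "world"},   # dir_1/.torchxconfig
    {"hello": "bob"},                   # dir_2/.torchxconfig
))  # {'foo': 'bar', 'hello': 'world'}
```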
"""

if not dirs: