29 changes: 19 additions & 10 deletions .github/workflows/components-integration-tests.yaml
@@ -9,6 +9,9 @@ on:
jobs:
components-launch:
runs-on: ubuntu-18.04
permissions:
id-token: write
contents: read
steps:
- name: Setup Python
uses: actions/setup-python@v2
@@ -17,22 +20,30 @@ jobs:
architecture: x64
- name: Checkout TorchX
uses: actions/checkout@v2
- name: Configure Kube Config
- name: Configure AWS
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }}
run: |
if [ -n "$AWS_ROLE_ARN" ]; then
export AWS_WEB_IDENTITY_TOKEN_FILE=/tmp/awscreds
export AWS_DEFAULT_REGION=us-west-2

echo AWS_WEB_IDENTITY_TOKEN_FILE=$AWS_WEB_IDENTITY_TOKEN_FILE >> $GITHUB_ENV
echo AWS_ROLE_ARN=$AWS_ROLE_ARN >> $GITHUB_ENV
echo AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION >> $GITHUB_ENV

curl -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" "$ACTIONS_ID_TOKEN_REQUEST_URL" | jq -r '.value' > $AWS_WEB_IDENTITY_TOKEN_FILE
fi
- name: Configure Kube Config
run: |
set -eux
if [ -n "$AWS_ACCESS_KEY_ID" ]; then
if [ -n "$AWS_ROLE_ARN" ]; then
aws eks update-kubeconfig --region=us-west-2 --name=${{ secrets.EKS_CLUSTER_NAME }}
fi
- name: Configure Docker
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
run: |
set -eux
if [ -n "$AWS_ACCESS_KEY_ID" ]; then
if [ -n "$AWS_ROLE_ARN" ]; then
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 495572122715.dkr.ecr.us-west-2.amazonaws.com
fi
- name: Install dependencies
@@ -42,8 +53,6 @@ jobs:
pip install -e .[kubernetes]
- name: Run Components Integration Tests
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
INTEGRATION_TEST_STORAGE: ${{ secrets.INTEGRATION_TEST_STORAGE }}
CONTAINER_REPO: ${{ secrets.CONTAINER_REPO }}
run: scripts/component_integration_tests.py
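Note: this workflow (and the three below) drops the long-lived `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY` secrets in favor of GitHub's OIDC integration. `permissions: id-token: write` lets the job request a signed identity token from `$ACTIONS_ID_TOKEN_REQUEST_URL`; the token is written to `/tmp/awscreds`, and appending the variables to `$GITHUB_ENV` makes them visible to all later steps. With `AWS_ROLE_ARN` and `AWS_WEB_IDENTITY_TOKEN_FILE` set, the AWS CLI exchanges the token for temporary credentials on every call. A minimal Python sketch of that exchange (the CLI/SDK does this automatically; the session name below is illustrative):

```python
# Sketch of the STS exchange the AWS CLI performs when AWS_ROLE_ARN and
# AWS_WEB_IDENTITY_TOKEN_FILE are set; the session name is a placeholder.
import os
import boto3

def assume_role_from_web_identity() -> dict:
    with open(os.environ["AWS_WEB_IDENTITY_TOKEN_FILE"]) as f:
        token = f.read().strip()
    sts = boto3.client("sts", region_name=os.environ.get("AWS_DEFAULT_REGION", "us-west-2"))
    resp = sts.assume_role_with_web_identity(
        RoleArn=os.environ["AWS_ROLE_ARN"],
        RoleSessionName="github-actions-ci",
        WebIdentityToken=token,
    )
    # Temporary credentials: AccessKeyId, SecretAccessKey, SessionToken, Expiration
    return resp["Credentials"]
```

This is also why the later `Configure Kube Config` and `Configure Docker` steps now gate on `AWS_ROLE_ARN` instead of `AWS_ACCESS_KEY_ID`, and why the per-step `env:` blocks holding key secrets can be deleted.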
4 changes: 4 additions & 0 deletions .github/workflows/doc-build.yaml
@@ -60,6 +60,10 @@ jobs:
set -ex
git config --global user.email "runner@github.com"
git config --global user.name "TorchX CI Runner"
- name: Install Dependencies
run: |
set -eux
sudo apt-get install -y pandoc
- name: Build
run: |
set -ex
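Note: pandoc is presumably needed because the docs build renders example notebooks through a tool such as nbsphinx, which shells out to pandoc; without the apt package the Sphinx build fails at notebook-conversion time.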
29 changes: 19 additions & 10 deletions .github/workflows/kfp-integration-tests.yaml
@@ -9,6 +9,9 @@ on:
jobs:
kfp-launch:
runs-on: ubuntu-18.04
permissions:
id-token: write
contents: read
steps:
- name: Install kubectl
# More info: https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/
@@ -18,13 +21,24 @@ jobs:
mkdir -p ~/.local/bin/kubectl
mv ./kubectl ~/.local/bin/kubectl
export PATH=$PATH:~/.local/bin/kubectl
- name: Configure Kube Config
- name: Configure AWS
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }}
run: |
if [ -n "$AWS_ROLE_ARN" ]; then
export AWS_WEB_IDENTITY_TOKEN_FILE=/tmp/awscreds
export AWS_DEFAULT_REGION=us-west-2

echo AWS_WEB_IDENTITY_TOKEN_FILE=$AWS_WEB_IDENTITY_TOKEN_FILE >> $GITHUB_ENV
echo AWS_ROLE_ARN=$AWS_ROLE_ARN >> $GITHUB_ENV
echo AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION >> $GITHUB_ENV

curl -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" "$ACTIONS_ID_TOKEN_REQUEST_URL" | jq -r '.value' > $AWS_WEB_IDENTITY_TOKEN_FILE
fi
- name: Configure Kube Config
run: |
set -eux
if [ -n "$AWS_ACCESS_KEY_ID" ]; then
if [ -n "$AWS_ROLE_ARN" ]; then
aws eks update-kubeconfig --region=us-west-2 --name=${{ secrets.EKS_CLUSTER_NAME }}
fi
- name: Setup Python
@@ -35,12 +49,9 @@ jobs:
- name: Checkout TorchX
uses: actions/checkout@v2
- name: Configure Docker
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
run: |
set -eux
if [ -n "$AWS_ACCESS_KEY_ID" ]; then
if [ -n "$AWS_ROLE_ARN" ]; then
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 495572122715.dkr.ecr.us-west-2.amazonaws.com
fi
- name: Install dependencies
@@ -50,8 +61,6 @@ jobs:
python setup.py install
- name: Run KFP Integration Tests
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
KFP_NAMESPACE: ${{ secrets.KFP_NAMESPACE }}
INTEGRATION_TEST_STORAGE: ${{ secrets.INTEGRATION_TEST_STORAGE }}
CONTAINER_REPO: ${{ secrets.CONTAINER_REPO }}
31 changes: 20 additions & 11 deletions .github/workflows/kubernetes-dist-train-integration-tests.yaml
@@ -9,6 +9,9 @@ on:
jobs:
kubernetes-launch:
runs-on: ubuntu-18.04
permissions:
id-token: write
contents: read
steps:
- name: Setup Python
uses: actions/setup-python@v2
@@ -17,22 +20,30 @@ jobs:
architecture: x64
- name: Checkout TorchX
uses: actions/checkout@v2
- name: Configure Kube Config
- name: Configure AWS
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }}
run: |
if [ -n "$AWS_ROLE_ARN" ]; then
export AWS_WEB_IDENTITY_TOKEN_FILE=/tmp/awscreds
export AWS_DEFAULT_REGION=us-west-2

echo AWS_WEB_IDENTITY_TOKEN_FILE=$AWS_WEB_IDENTITY_TOKEN_FILE >> $GITHUB_ENV
echo AWS_ROLE_ARN=$AWS_ROLE_ARN >> $GITHUB_ENV
echo AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION >> $GITHUB_ENV

curl -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" "$ACTIONS_ID_TOKEN_REQUEST_URL" | jq -r '.value' > $AWS_WEB_IDENTITY_TOKEN_FILE
fi
- name: Configure Kube Config
run: |
set -eux
if [ -n "$AWS_ACCESS_KEY_ID" ]; then
if [ -n "$AWS_ROLE_ARN" ]; then
aws eks update-kubeconfig --region=us-west-2 --name=${{ secrets.EKS_CLUSTER_NAME }}
fi
- name: Configure Docker
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
run: |
set -eux
if [ -n "$AWS_ACCESS_KEY_ID" ]; then
if [ -n "$AWS_ROLE_ARN" ]; then
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 495572122715.dkr.ecr.us-west-2.amazonaws.com
fi
- name: Install dependencies
@@ -41,12 +52,10 @@ jobs:
pip install -e .[kubernetes]
- name: Run Kubernetes Integration Tests
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
INTEGRATION_TEST_STORAGE: ${{ secrets.INTEGRATION_TEST_STORAGE }}
CONTAINER_REPO: ${{ secrets.CONTAINER_REPO }}
run: |
if [ -z "$AWS_ACCESS_KEY_ID" ]; then
if [ -z "$AWS_ROLE_ARN" ]; then
# only dryrun if no secrets
ARGS="--dryrun"
else
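Note: the `--dryrun` fallback keeps this workflow green on forks and external PRs, where repository secrets (and therefore `AWS_ROLE_ARN`) are unavailable: the components are still materialized and validated, but nothing is submitted to the cluster.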
26 changes: 20 additions & 6 deletions .github/workflows/slurm-integration-tests.yaml
@@ -9,6 +9,9 @@ on:
jobs:
slurm:
runs-on: ubuntu-18.04
permissions:
id-token: write
contents: read
steps:
- name: Setup Python
uses: actions/setup-python@v2
@@ -17,21 +20,32 @@ jobs:
architecture: x64
- name: Checkout TorchX
uses: actions/checkout@v2
- name: Configure AWS
env:
AWS_ROLE_ARN: ${{ secrets.AWS_ROLE_ARN }}
run: |
if [ -n "$AWS_ROLE_ARN" ]; then
export AWS_WEB_IDENTITY_TOKEN_FILE=/tmp/awscreds
export AWS_DEFAULT_REGION=us-west-2

echo AWS_WEB_IDENTITY_TOKEN_FILE=$AWS_WEB_IDENTITY_TOKEN_FILE >> $GITHUB_ENV
echo AWS_ROLE_ARN=$AWS_ROLE_ARN >> $GITHUB_ENV
echo AWS_DEFAULT_REGION=$AWS_DEFAULT_REGION >> $GITHUB_ENV

curl -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" "$ACTIONS_ID_TOKEN_REQUEST_URL" | jq -r '.value' > $AWS_WEB_IDENTITY_TOKEN_FILE
fi
- name: Install Dependencies
run:
set -ex

pip install wheel
pip install wheel ec2instanceconnectcli
- name: Run Slurm Integration Tests
env:
SLURM_SSH: ${{ secrets.SLURM_SSH }}
SLURM_MASTER: ${{ secrets.SLURM_MASTER }}
SLURM_INSTANCE_MASTER: ${{ secrets.SLURM_INSTANCE_MASTER }}
SLURM_KNOWN_HOST: ${{ secrets.SLURM_KNOWN_HOST }}
SLURM_IDENT: id_rsa
run: |
set -e
echo "$SLURM_SSH" > "$SLURM_IDENT"
chmod 600 "$SLURM_IDENT"

mkdir -p ~/.ssh
echo "$SLURM_KNOWN_HOST" >> ~/.ssh/known_hosts

8 changes: 4 additions & 4 deletions scripts/slurmint.sh
@@ -14,8 +14,8 @@ python setup.py bdist_wheel

WHEEL="$DIST/$(ls $DIST)"

if [[ -z "${SLURM_MASTER}" ]]; then
echo "slurm master is not set, skipping test..."
if [[ -z "${SLURM_INSTANCE_MASTER}" ]]; then
echo "SLURM_INSTANCE_MASTER is not set, skipping test..."
exit 0
fi

@@ -25,11 +25,11 @@ VENV="$DIR/venv"

function run_cmd {
# shellcheck disable=SC2048,SC2086
ssh -o ServerAliveInterval=60 "$SLURM_MASTER" -i "$SLURM_IDENT" $*
mssh -o ServerAliveInterval=60 "$SLURM_INSTANCE_MASTER" -- $*
}

function run_scp {
scp -i "$SLURM_IDENT" "$1" "$SLURM_MASTER:$2"
rsync -rav -e mssh "$1" "$SLURM_INSTANCE_MASTER:$2"
}

function cleanup {
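Note: `mssh` (from the `ec2instanceconnectcli` package installed in the workflow above) replaces plain `ssh`: it pushes a one-time public key through the EC2 Instance Connect API and then opens a standard SSH session, which is why the `SLURM_SSH` private-key secret and the `scp -i` identity handling can be deleted. A rough Python equivalent of the new `run_cmd` helper (the instance ID is a placeholder):

```python
# Rough sketch of run_cmd, assuming the mssh binary from ec2instanceconnectcli
# is on PATH; the instance ID below is illustrative.
import subprocess

def run_cmd(instance: str, *args: str) -> None:
    # mssh pushes a short-lived SSH key via the EC2 Instance Connect API,
    # then opens a normal ssh session to the instance.
    subprocess.run(
        ["mssh", "-o", "ServerAliveInterval=60", instance, "--", *args],
        check=True,
    )

run_cmd("i-0123456789abcdef0", "sinfo")
```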
37 changes: 20 additions & 17 deletions torchx/cli/cmd_run.py
@@ -10,18 +10,18 @@
import sys
from dataclasses import asdict
from pprint import pformat
from typing import Dict, List, cast, Type, Optional
from typing import Dict, List, Optional, Type, cast

import torchx.specs as specs
from pyre_extensions import none_throws
from torchx.cli.cmd_base import SubCommand
from torchx.runner import Runner, get_runner
from torchx.schedulers import get_scheduler_factories, get_default_scheduler_name
from torchx.runner import Runner, config, get_runner
from torchx.schedulers import get_default_scheduler_name, get_scheduler_factories
from torchx.specs.finder import (
ComponentNotFoundException,
ComponentValidationException,
_Component,
get_components,
ComponentValidationException,
ComponentNotFoundException,
)
from torchx.util.types import to_dict

@@ -41,18 +41,19 @@ def _convert_to_option_type(
return option_type(value)


def _parse_run_config(arg: str, scheduler_run_opts: specs.runopts) -> specs.RunConfig:
def _parse_run_config(arg: str, scheduler_opts: specs.runopts) -> specs.RunConfig:
conf = specs.RunConfig()
if not arg:
return conf

for key, value in to_dict(arg).items():
option = scheduler_run_opts.get(key)
option = scheduler_opts.get(key)
if option is None:
raise ValueError(f"Unknown {key}, run `torchx runopts` for more info")
option_type = option.opt_type
typed_value = _convert_to_option_type(value, option_type)
conf.set(key, typed_value)

return conf
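For context, `to_dict` splits the comma-separated `KEY=VALUE` pairs, each key is validated against the scheduler's runopts, and each value is coerced to the declared type by `_convert_to_option_type`. A self-contained sketch of just the parsing step (not the torchx implementation):

```python
# Self-contained sketch of KEY=VALUE,KEY=VALUE parsing; the real code delegates
# to torchx.util.types.to_dict and then applies per-key type coercion.
def parse_scheduler_cfg(arg: str) -> dict:
    cfg = {}
    for pair in arg.split(","):
        key, sep, value = pair.partition("=")
        if not sep:
            raise ValueError(f"expected KEY=VALUE, got {pair!r}")
        cfg[key] = value
    return cfg

assert parse_scheduler_cfg("cluster=foo,user=bar") == {"cluster": "foo", "user": "bar"}
```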


@@ -66,9 +67,9 @@ def _builtins(self) -> Dict[str, _Component]:
def run(self, args: argparse.Namespace) -> None:
builtin_components = self._builtins()
num_builtins = len(builtin_components)
logger.info(f"Found {num_builtins} builtin configs:")
print(f"Found {num_builtins} builtin configs:")
for i, component in enumerate(builtin_components.values()):
logger.info(f" {i + 1:2d}. {component.name}")
print(f" {i + 1:2d}. {component.name}")


class CmdRun(SubCommand):
@@ -86,7 +87,7 @@ def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
default=get_default_scheduler_name(),
)
subparser.add_argument(
"-a",
"-cfg",
"--scheduler_args",
type=str,
help="Arguments to pass to the scheduler (Ex:`cluster=foo,user=bar`)."
Expand Down Expand Up @@ -114,7 +115,9 @@ def add_arguments(self, subparser: argparse.ArgumentParser) -> None:
def _run(self, runner: Runner, args: argparse.Namespace) -> Optional[str]:
run_opts = get_runner().run_opts()
scheduler_opts = run_opts[args.scheduler]
scheduler_args = _parse_run_config(args.scheduler_args, scheduler_opts)
cfg = _parse_run_config(args.scheduler_args, scheduler_opts)
config.apply(scheduler=args.scheduler, cfg=cfg)

if len(args.conf_args) < 1:
none_throws(self._subparser).error(
"the following arguments are required: conf_file, conf_args"
@@ -129,7 +132,7 @@ def _run(self, runner: Runner, args: argparse.Namespace) -> Optional[str]:
conf_file,
conf_args,
args.scheduler,
scheduler_args,
cfg,
dryrun=args.dryrun,
)
except (ComponentValidationException, ComponentNotFoundException) as e:
@@ -139,11 +142,12 @@ def _run(self, runner: Runner, args: argparse.Namespace) -> Optional[str]:

if args.dryrun:
app_dryrun_info = cast(specs.AppDryRunInfo, result)
logger.info("=== APPLICATION ===")
logger.info(pformat(asdict(app_dryrun_info._app), indent=2, width=80))
logger.info(
"\n=== APPLICATION ===\n"
f"{pformat(asdict(app_dryrun_info._app), indent=2, width=80)}"
)

logger.info("=== SCHEDULER REQUEST ===")
logger.info(app_dryrun_info)
logger.info("\n=== SCHEDULER REQUEST ===\n" f"{app_dryrun_info}")
return
else:
app_handle = cast(specs.AppHandle, result)
@@ -153,7 +157,6 @@ def _run(self, runner: Runner, args: argparse.Namespace) -> Optional[str]:
if args.scheduler.startswith("local"):
self._wait_and_exit(runner, app_handle)
else:
logger.info("=== RUN RESULT ===")
logger.info(f"Launched app: {app_handle}")
status = runner.status(app_handle)
logger.info(status)
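Note: besides renaming the short flag from `-a` to `-cfg`, this change layers in per-scheduler defaults via `config.apply` before the run. Assuming `.torchxconfig` is an INI-style file with one section per scheduler, a hypothetical sketch of the layering behavior (not the actual torchx implementation):

```python
# Hypothetical sketch of config layering: values from .torchxconfig fill in
# keys the user did not pass via -cfg, so CLI-provided values win.
import configparser

def apply_defaults(scheduler: str, cfg: dict, path: str = ".torchxconfig") -> dict:
    parser = configparser.ConfigParser()
    parser.read(path)  # silently skips a missing file
    if parser.has_section(scheduler):
        for key, value in parser.items(scheduler):
            cfg.setdefault(key, value)
    return cfg
```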
2 changes: 1 addition & 1 deletion torchx/components/interpret.py
@@ -16,6 +16,6 @@
See the
:ref:`examples_apps/lightning_classy_vision/interpret:Model Interpretability App Example`
and the corresponding
:ref:`Interpret component definition<examples_apps/lightning_classy_vision/component:Trainer Component Examples>`
:ref:`Interpret component definition<examples_apps/lightning_classy_vision/component:Interpreting the Model>`
for an example of how to use Captum with TorchX.
"""