Skip to content

Commit

Permalink
Merge branch 'master' into fix-z3-sp-arg
Browse files: browse the repository at this point in the history
  • Loading branch information
tohtana committed Jul 24, 2024
2 parents 5d68b56 + ffd0a0e commit f560694
Show file tree
Hide file tree
Showing 285 changed files with 8,212 additions and 3,116 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/amd-mi200.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@ name: amd-mi200

on:
workflow_dispatch:
pull_request:
paths:
- '.github/workflows/amd-mi200.yml'
- 'requirements/**'
schedule:
- cron: "0 0 * * *"

Expand All @@ -21,7 +25,7 @@ jobs:
# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
Expand Down
6 changes: 4 additions & 2 deletions .github/workflows/cpu-inference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ jobs:
unit-tests:
runs-on: [self-hosted, cpu]

env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions

steps:
- uses: actions/checkout@v3

Expand Down Expand Up @@ -97,5 +99,5 @@ jobs:
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
# LOCAL_SIZE=2 forces the CPU to report 2 devices, which helps run the tests on the default GitHub runner
LOCAL_SIZE=2 COLUMNS=240 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
LOCAL_SIZE=2 COLUMNS=240 TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
LOCAL_SIZE=2 COLUMNS=240 HF_HOME=~/tmp/hf_home/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
LOCAL_SIZE=2 COLUMNS=240 HF_HOME=~/tmp/hf_home/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
8 changes: 4 additions & 4 deletions .github/workflows/cpu-torch-latest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ concurrency:

jobs:
unit-tests:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
Expand Down Expand Up @@ -50,5 +50,5 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.2"
TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.2"
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.4"
HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.4"
4 changes: 2 additions & 2 deletions .github/workflows/formatting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ jobs:

# formatting and basic install on cpu-only machine
unit-tests:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: environment
run: |
Expand Down
23 changes: 19 additions & 4 deletions .github/workflows/hpu-gaudi2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,23 @@ on:
paths:
- ".github/workflows/hpu-gaudi2.yml"
- "accelerator/hpu_accelerator.py"

- "op_builder/hpu/**"
- "deepspeed/runtime/engine.py"
- "deepspeed/runtime/bf16_optimizer.py"
- "deepspeed/runtime/zero/stage_1_and_2.py"
- "deepspeed/runtime/zero/stage3.py"
- "deepspeed/runtime/zero/partition_parameters.py"
- "deepspeed/runtime/zero/partitioned_param_coordinator.py"
- "deepspeed/runtime/zero/parameter_offload.py"
- "deepspeed/runtime/pipe/engine.py"
- "deepspeed/runtime/utils.py"
- "deepspeed/inference/engine.py"
- "deepspeed/module_inject/auto_tp.py"
- "deepspeed/module_inject/replace_module.py"
- "deepspeed/module_inject/load_checkpoint.py"
- "deepspeed/module_inject/inject.py"
- "deepspeed/ops/transformer/**"
- "deepspeed/ops/adam/**"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand All @@ -23,7 +39,7 @@ jobs:
# The type of runner that the job will run on
runs-on: [self-hosted, intel, gaudi2]
container:
image: vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
image: vault.habana.ai/gaudi-docker/1.15.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
ports:
- 80
options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice
Expand All @@ -36,7 +52,6 @@ jobs:
test_compression.py
test_dist.py
test_elastic.py
(test_intX_quantization.py and test_quantized_linear)
test_ds_arguments.py
test_run.py
test_multinode_runner.py
Expand Down Expand Up @@ -83,7 +98,7 @@ jobs:
# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Check container state
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-a6000.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
options: --gpus all --shm-size "8G"

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Check container state
run: |
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/nv-accelerate-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu116, v100]
runs-on: [self-hosted, nvidia, cu117, v100]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
Expand Down
9 changes: 7 additions & 2 deletions .github/workflows/nv-ds-chat.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ on:
required: false
default: 'master'
type: string
pull_request:
paths:
- "deepspeed/runtime/zero/stage_1_and_2.py"
- "deepspeed/runtime/zero/stage3.py"
- "deepspeed/runtime/hybrid_engine.py"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand All @@ -21,10 +26,10 @@ permissions:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu116, v100]
runs-on: [self-hosted, nvidia, cu117, v100]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-h100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
options: --gpus all --shm-size "8G"

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Check container state
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-human-eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
options: --gpus all --shm-size "8G"

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Check container state
run: |
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/nv-inference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu116, v100]
runs-on: [self-hosted, nvidia, cu117, v100]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/nv-lightning-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]

env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions

steps:
- uses: actions/checkout@v3

Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/nv-mii.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu116, v100]
runs-on: [self-hosted, nvidia, cu117, v100]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
Expand All @@ -46,7 +46,7 @@ jobs:
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
# git checkout bdf36dc
git rev-parse --short HEAD
pip install .
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/nv-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,17 @@ permissions:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu116, v100]
runs-on: [self-hosted, nvidia, cu117, v100]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv

- name: Install pytorch
run: |
pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --index-url https://download.pytorch.org/whl/cu116
pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --index-url https://download.pytorch.org/whl/cu117
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand Down Expand Up @@ -55,7 +55,7 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6"
pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.7"
- name: Open GitHub issue if nightly CI fails
if: ${{ failure() && (github.event_name == 'schedule') }}
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/nv-pre-compile-ops.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ concurrency:

jobs:
unit-tests:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
container:
image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: environment
run: |
Expand All @@ -36,7 +36,7 @@ jobs:
#python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Compile DeepSpeed Ops
run: |
DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install .
- name: DS Report
run: |
ds_report
2 changes: 1 addition & 1 deletion .github/workflows/nv-sd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
options: --gpus all --shm-size "8G"

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Check container state
run: |
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/nv-torch-latest-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu116, v100]
runs-on: [self-hosted, nvidia, cu117, v100]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
Expand Down Expand Up @@ -55,5 +55,5 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.2" --cuda_ver="11.8"
pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.2" --cuda_ver="11.8"
pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.4" --cuda_ver="11.8"
pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.4" --cuda_ver="11.8"
4 changes: 2 additions & 2 deletions .github/workflows/nv-torch-nightly-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ permissions:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu116, v100]
runs-on: [self-hosted, nvidia, cu117, v100]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/nv-torch110-p40.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, p40]

env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions

steps:
- uses: actions/checkout@v3

Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/nv-torch110-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]

env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions

steps:
- uses: actions/checkout@v3

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/nv-transformers-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu116, v100]
runs-on: [self-hosted, nvidia, cu117, v100]

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- id: setup-venv
uses: ./.github/workflows/setup-venv
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ jobs:
unit-tests:
strategy:
matrix:
pyVersion: ["3.6", "3.7", "3.8", "3.9", "3.10"]
pyVersion: ["3.7", "3.8", "3.9", "3.10"]
fail-fast: false

runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
container:
image: deepspeed/gh-builder:py${{ matrix.pyVersion }}

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: environment
run: |
Expand Down
Loading

0 comments on commit f560694

Please sign in to comment.