Commit c06288e: add new self-hosted CI runner

SolenoidWGT committed Jan 17, 2023
1 parent fdd1bb9
Showing 7 changed files with 74 additions and 20 deletions.
49 changes: 48 additions & 1 deletion .github/workflows/unit_test.yml
@@ -47,11 +47,58 @@ jobs:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        env:
          AGENT_TOOLSDIRECTORY: /opt/hostedtoolcache
        with:
          python-version: ${{ matrix.python-version }}
      - name: do_benchmark
        run: |
          python -m pip install .
          python -m pip install ".[test,k8s]"
          ./ding/scripts/install-k8s-tools.sh
          make benchmark

  test_multiprocess:
    runs-on: self-hosted
    if: "!contains(github.event.head_commit.message, 'ci skip')"
    strategy:
      matrix:
        python-version: ["3.7", "3.8", "3.9"]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: do_multiprocesstest
        timeout-minutes: 40
        run: |
          python -m pip install box2d-py
          python -m pip install .
          python -m pip install ".[test,k8s]"
          ./ding/scripts/install-k8s-tools.sh
          make multiprocesstest

  test_cuda:
    runs-on: self-hosted
    if: "!contains(github.event.head_commit.message, 'ci skip')"
    strategy:
      matrix:
        python-version: ["3.7", "3.8", "3.9"]
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        env:
          AGENT_TOOLSDIRECTORY: /opt/hostedtoolcache
        with:
          python-version: ${{ matrix.python-version }}
      - name: do_unittest
        timeout-minutes: 40
        run: |
          python -m pip install torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
          python -m pip install box2d-py
          python -m pip install .
          python -m pip install ".[test,k8s]"
          ./ding/scripts/install-k8s-tools.sh
          make cudatest
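
The new test_multiprocess and test_cuda jobs run on self-hosted runners and ultimately call make multiprocesstest / make cudatest, i.e. pytest filtered by marker (see the Makefile and pytest.ini changes below). A minimal sketch of the kind of test such a run would collect (the helper and test names here are illustrative, not part of this commit):

    # Illustrative only: a test picked up by `pytest -m multiprocesstest`.
    import multiprocessing as mp

    import pytest


    def _echo(value, queue):
        # Child process: send the received value back to the parent.
        queue.put(value)


    @pytest.mark.multiprocesstest
    def test_spawn_roundtrip():
        ctx = mp.get_context("spawn")
        queue = ctx.Queue()
        proc = ctx.Process(target=_echo, args=(42, queue))
        proc.start()
        assert queue.get(timeout=10) == 42
        proc.join(timeout=10)
        assert proc.exitcode == 0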
11 changes: 10 additions & 1 deletion Makefile
@@ -57,11 +57,20 @@ benchmark:
        --durations=0 \
        -sv -m benchmark

multiprocesstest:
    pytest ${TEST_DIR} \
        --cov-report=xml \
        --cov-report term-missing \
        --cov=${COV_DIR} \
        ${DURATIONS_COMMAND} \
        ${WORKERS_COMMAND} \
        -sv -m multiprocesstest

test: unittest # just for compatibility, can be changed later

cpu_test: unittest algotest benchmark

all_test: unittest algotest cudatest benchmark
all_test: unittest algotest cudatest benchmark multiprocesstest

format:
    yapf --in-place --recursive -p --verbose --style .style.yapf ${FORMAT_DIR}
3 changes: 2 additions & 1 deletion ding/framework/message_queue/perfs/tests/test_perf_nng.py
@@ -6,7 +6,8 @@


@pytest.mark.benchmark
# @pytest.mark.multiprocesstest
@pytest.mark.multiprocesstest
@pytest.mark.cudatest
def test_nng():
    if torch.cuda.is_available() and torch.cuda.device_count() >= 2:
        address = socket.gethostbyname(socket.gethostname())
4 changes: 2 additions & 2 deletions ding/framework/message_queue/perfs/tests/test_perf_shm.py
@@ -6,15 +6,15 @@

@pytest.mark.mqbenchmark
@pytest.mark.cudatest
# @pytest.mark.multiprocesstest
@pytest.mark.multiprocesstest
def test_shm_numpy_shm():
    if torch.cuda.is_available():
        shm_perf_main("shm")


@pytest.mark.mqbenchmark
@pytest.mark.cudatest
# @pytest.mark.multiprocesstest
@pytest.mark.multiprocesstest
def test_shm_cuda_shared_tensor():
    if torch.cuda.is_available() and torch.cuda.device_count() >= 2:
        shm_perf_main("cuda_ipc")
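
Uncommenting @pytest.mark.multiprocesstest means these perf tests are now collected by both the test_cuda and test_multiprocess jobs, while the in-body check still turns them into no-ops on machines without enough GPUs. A sketch of that guard pattern, with a placeholder test name and body (not code from this diff):

    import pytest
    import torch


    @pytest.mark.cudatest
    @pytest.mark.multiprocesstest
    def test_needs_two_gpus():
        # Pass silently (rather than fail) on hosts without at least two GPUs.
        if not (torch.cuda.is_available() and torch.cuda.device_count() >= 2):
            return
        # The real perf body, e.g. shm_perf_main("cuda_ipc"), would run here.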
2 changes: 1 addition & 1 deletion ding/framework/message_queue/perfs/tests/test_perf_torchrpc_nccl.py
@@ -10,7 +10,7 @@

@pytest.mark.benchmark
@pytest.mark.cudatest
# @pytest.mark.multiprocesstest
@pytest.mark.multiprocesstest
def test_perf_torchrpc_nccl():
    address = socket.gethostbyname(socket.gethostname())
    init_method = "tcp://{}:{}".format(address, find_free_port(address))
23 changes: 9 additions & 14 deletions ding/framework/message_queue/tests/test_torch_rpc.py
@@ -8,7 +8,6 @@
from torch.distributed import rpc
from multiprocessing import Pool, get_context
from ding.compatibility import torch_ge_1121
from ditk import logging
from ding.utils.system_helper import find_free_port

mq = None
@@ -26,7 +25,6 @@ def torchrpc(rank):
    mq = None
    address = socket.gethostbyname(socket.gethostname())
    recv_tensor_list = [None, None, None, None]
    logging.getLogger().setLevel(logging.DEBUG)
    name_list = ["A", "B", "C", "D"]

    if rank == 0:
@@ -85,7 +83,6 @@ def torchrpc_cuda(rank):
    recv_tensor_list = [None, None, None, None]
    name_list = ["A", "B"]
    address = socket.gethostbyname(socket.gethostname())
    logging.getLogger().setLevel(logging.DEBUG)

    if rank == 0:
        attach_to = name_list[1:]
@@ -95,7 +92,7 @@
    peer_rank = int(rank == 0) or 0
    peer_name = name_list[peer_rank]
    device_map = DeviceMap(rank, [peer_name], [rank], [peer_rank])
    logging.debug(device_map)
    print(device_map)

    mq = TORCHRPCMQ(
        rpc_name=name_list[rank],
@@ -132,7 +129,6 @@ def torchrpc_args_parser(rank):
    global mq
    global recv_tensor_list
    from ding.framework.parallel import Parallel
    logging.getLogger().setLevel(logging.DEBUG)

    params = Parallel._torchrpc_args_parser(
        n_parallel_workers=1,
@@ -143,30 +139,30 @@
        local_cuda_devices=None,
        cuda_device_map=None
    )[0]

    logging.debug(params)
    print(params)

    # 1. If attach_to is empty, init_rpc will not block.
    mq = TORCHRPCMQ(**params)
    mq.listen()
    assert mq._running
    mq.stop()
    assert not mq._running
    logging.debug("[Pass] 1. If attach_to is empty, init_rpc will not block.")
    print("[Pass] 1. If attach_to is empty, init_rpc will not block.")

    # 2. n_parallel_workers != len(node_ids)
    try:
        Parallel._torchrpc_args_parser(n_parallel_workers=999, attach_to=[], node_ids=[1, 2])[0]
    except RuntimeError as e:
        logging.debug("[Pass] 2. n_parallel_workers != len(node_ids).")
        print("[Pass] 2. n_parallel_workers != len(node_ids).")
        pass
    else:
        assert False

    # 3. len(local_cuda_devices) != n_parallel_workers
    try:
        Parallel._torchrpc_args_parser(n_parallel_workers=8, node_ids=[1], local_cuda_devices=[1, 2, 3])[0]
    except RuntimeError as e:
        logging.debug("[Pass] 3. len(local_cuda_devices) != n_parallel_workers.")
        print("[Pass] 3. len(local_cuda_devices) != n_parallel_workers.")
    else:
        assert False

@@ -175,7 +171,7 @@ def torchrpc_args_parser(rank):
    try:
        Parallel._torchrpc_args_parser(n_parallel_workers=999, node_ids=[1], use_cuda=True)[0]
    except RuntimeError as e:
        logging.debug("[Pass] 4. n_parallel_workers > gpu_nums.")
        print("[Pass] 4. n_parallel_workers > gpu_nums.")
    else:
        assert False

@@ -186,8 +182,7 @@ def torchrpc_args_parser(rank):
    assert params['device_maps'].peer_name_list == ["Node_0", "Node_0", "Node_1"]
    assert params['device_maps'].our_device_list == [0, 1, 1]
    assert params['device_maps'].peer_device_list == [0, 2, 4]
    # logging.debug(params['device_maps'])
    logging.debug("[Pass] 5. Set custom device map.")
    print("[Pass] 5. Set custom device map.")

    # 6. Set n_parallel_workers > 1
    params = Parallel._torchrpc_args_parser(n_parallel_workers=8, node_ids=[1])
@@ -201,7 +196,7 @@ def torchrpc_args_parser(rank):
    params = Parallel._torchrpc_args_parser(n_parallel_workers=2, node_ids=[1], use_cuda=True)
    assert params[0]['use_cuda']
    assert len(params[0]['device_maps'].peer_name_list) == DEFAULT_DEVICE_MAP_NUMS - 1
    logging.debug("[Pass] 6. Set n_parallel_workers > 1.")
    print("[Pass] 6. Set n_parallel_workers > 1.")


@pytest.mark.unittest
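
The torchrpc helpers above take a rank argument and, judging from the Pool/get_context import near the top of the file, are driven by a multiprocessing pool in the collapsed test entry points. A sketch of such a driver, with placeholder names rather than the file's actual code:

    from multiprocessing import get_context


    def run_all_ranks(worker, world_size=2):
        # Spawn one process per rank and block until every rank has finished.
        with get_context("spawn").Pool(processes=world_size) as pool:
            pool.map(worker, range(world_size))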
2 changes: 2 additions & 0 deletions pytest.ini
@@ -10,5 +10,7 @@ markers =
    envpooltest
    other
    tmp
    multiprocesstest
    mqbenchmark

norecursedirs = ding/hpc_rl/tests
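
Registering multiprocesstest and mqbenchmark here keeps pytest from warning about unknown marks (or erroring under --strict-markers) now that the decorators are active. The same registration could also be done programmatically from a root conftest.py; a sketch of that alternative, not something this commit adds:

    # conftest.py (hypothetical; the repository registers markers in pytest.ini instead)
    def pytest_configure(config):
        config.addinivalue_line("markers", "multiprocesstest: multi-process message queue tests")
        config.addinivalue_line("markers", "mqbenchmark: message queue benchmark tests")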
