In [1]:
import os, sys
import cdsw

In [3]:
NUM_WORKERS = 2
CMD = '''mpirun -np {} -H {} -bind-to none -map-by slot \
-mca pml ob1 -mca btl ^openib -mca plm_rsh_args '-p 2222 -o StrictHostKeyChecking=no' -mca btl_tcp_if_include eth0 \
-x NCCL_SOCKET_IFNAME=eth0 -x COLUMNS -x LINES \
python3 dist-torch-hvd.py'''

workers = cdsw.launch_workers(n=NUM_WORKERS, cpu=1, memory=4, nvidia_gpu=0,
                              code="import time; time.sleep(365*24*3600)")
print('Starting workers ...')
worker_ids = [worker["id"] for worker in workers]
running_workers = cdsw.await_workers(worker_ids,
                              wait_for_completion=False,
                              timeout_seconds=120)
worker_ips = [worker["ip_address"] for worker in \
                              running_workers["workers"]]
print('Workers:', worker_ips)

if len(running_workers) == NUM_WORKERS:
    hosts_str = ",".join([worker_ip+":1" for worker_ip in worker_ips])
    cmd = CMD.format(len(worker_ips), hosts_str)
    cmd = "horovodrun -np {} -H {} -p 2222 python3 dist-torch-hvd.py 2>&1".format(
                                  len(worker_ips),
                                  hosts_str)
    print('Preparing to run: ' + cmd)
    os.system(cmd)
    print('DONE')
    #cdsw.stop_workers()
else:
    print('Errors when starting workers. Exits.')
    sys.exit(1)

Starting workers ...
Workers: ['100.100.7.177', '100.100.7.183']
Preparing to run: horovodrun -np 2 -H 100.100.7.177:1,100.100.7.183:1 -p 2222 python3 dist-torch-hvd.py 2>&1
DONE


In [4]:
cdsw.list_workers()

[{'id': 'b49kyjsauudysw8p',
  'name': 'Untitled Worker',
  'project': {'id': 57,
   'name': 'Distributed PyTorch with Horovod',
   'slug': 'pauldefusco/distributed-pytorch-with-horovod',
   'html_url': 'https://ml-4d51f976-b26.go02-dem.ylcu-atmi.cloudera.site/pauldefusco/distributed-pytorch-with-horovod',
   'url': 'https://ml-4d51f976-b26.go02-dem.ylcu-atmi.cloudera.site/api/v1/projects/pauldefusco/distributed-pytorch-with-horovod',
   'default_engine_type': 'ml_runtime'},
  'owner': {'id': 4,
   'username': 'pauldefusco',
   'name': 'Paul de Fusco',
   'html_url': 'https://ml-4d51f976-b26.go02-dem.ylcu-atmi.cloudera.site/pauldefusco',
   'url': 'https://ml-4d51f976-b26.go02-dem.ylcu-atmi.cloudera.site/api/v1/users/pauldefusco',
   'is_team': False},
  'creator': {'id': 4,
   'username': 'pauldefusco',
   'name': 'Paul de Fusco',
   'html_url': 'https://ml-4d51f976-b26.go02-dem.ylcu-atmi.cloudera.site/pauldefusco',
   'url': 'https://ml-4d51f976-b26.go02-dem.ylcu-atmi.cloudera.site/ap

With horovod timeline

In [17]:
NUM_WORKERS = 2
CMD = '''mpirun -np {} -H {} -bind-to none -map-by slot \
-mca pml ob1 -mca btl ^openib -mca plm_rsh_args '-p 2222 -o StrictHostKeyChecking=no' -mca btl_tcp_if_include eth0 \
-x NCCL_SOCKET_IFNAME=eth0 -x COLUMNS -x LINES \
python3 dist-torch-hvd.py'''

workers = cdsw.launch_workers(n=NUM_WORKERS, cpu=1, memory=4, nvidia_gpu=1,
                              code="import time; time.sleep(365*24*3600)")
print('Starting workers ...')
worker_ids = [worker["id"] for worker in workers]
running_workers = cdsw.await_workers(worker_ids,
                              wait_for_completion=False,
                              timeout_seconds=120)
worker_ips = [worker["ip_address"] for worker in \
                              running_workers["workers"]]
print('Workers:', worker_ips)

if len(running_workers) == NUM_WORKERS:
    hosts_str = ",".join([worker_ip+":1" for worker_ip in worker_ips])
    cmd = CMD.format(len(worker_ips), hosts_str)
    cmd = "horovodrun -np {} --timeline-filename timeline.json -H {} -p 2222 python3 dist-torch-hvd.py 2>&1".format(
                                  len(worker_ips),
                                  hosts_str)
    print('Preparing to run: ' + cmd)
    os.system(cmd)
    print('DONE')
    cdsw.stop_workers()
else:
    print('Errors when starting workers. Exits.')
    sys.exit(1)

Starting workers ...
Workers: []
Preparing to run: horovodrun -np 0 --timeline-filename timeline.json -H  -p 2222 python3 dist-torch-hvd.py 2>&1
DONE
