# WML-A Job Submission via WML-A CLI

Official examples can be found here: https://wmla-console-cpd-wmla.apps.datascienceelite.com/ui/#/cliTools

In [None]:
# Shared settings, exported as environment variables so the `!` shell commands in later cells can read them.
#   DIR       - working directory that holds the training code copied into the job staging folder
#   NAMESPACE - prefix of the job IDs used below (e.g. $NAMESPACE-38)
#   HOST      - value passed to dlicmd.py as --rest-host
#   BASE_URL  - forwarded to submitted jobs through --msd-env BASE_URL=...
%env DIR=/userfs
%env NAMESPACE=cpd-wmla
%env HOST=wmla-console-cpd-wmla.apps.cpd.mskcc.org
%env BASE_URL=https://cpd-cpd.apps.cpd.mskcc.org

In [None]:
# Total disk space consumed by everything under /userfs (human-readable summary).
!du -sh /userfs

In [None]:
# Size, used and available space of the filesystem that holds /userfs.
!df -h /userfs

### Submit Jobs
#### PyTorch (single GPU or multiple GPUs on one node with multithreading)

In [None]:
# Staging folder whose content is uploaded with the job (used as --model-dir at submission).
%env DIR_job_submission=/userfs/job_submission

# Start from an empty staging folder on every run.
!rm -rf $DIR_job_submission
!mkdir -p $DIR_job_submission

# Stage the deepliif package (recursively) and the individual helper scripts.
!cp -r $DIR/deepliif $DIR_job_submission
!cp $DIR/cli.py $DIR/train_command.py $DIR/monitor_gpu.sh $DIR/custom_save.py \
    $DIR_job_submission

In [None]:
# Submit train_command.py from the staged folder ($DIR_job_submission) as a PyTorch job.
# USER_ACCESS_TOKEN is not set in the cells shown here; presumably the platform provides it - TODO confirm.
# The token is used for --jwt-token and is also forwarded to the job (with BASE_URL) via --msd-env.
# A shell command continued with `\` cannot contain comment lines, so all notes live up here.
!python dlicmd.py --exec-start PyTorch --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN \
                  --msd-env USER_ACCESS_TOKEN=$USER_ACCESS_TOKEN --msd-env BASE_URL=$BASE_URL \
                  --workerDeviceNum 1 --workerMemory 8g \
                  --model-dir $DIR_job_submission --model-main train_command.py \
                  --cs-datastore-meta type=fs,data_path=DeepLIIF_Datasets_Full/

### Submit Jobs
#### distPyTorch (multiprocessing using DDP)

In [None]:
# Staging folder whose content is uploaded with the job (used as --model-dir at submission).
%env DIR_job_submission=/userfs/job_submission

# Start from an empty staging folder on every run.
!rm -rf $DIR_job_submission
!mkdir -p $DIR_job_submission

# Stage the deepliif package (recursively) and the individual helper scripts.
!cp -r $DIR/deepliif $DIR_job_submission
!cp $DIR/cli.py $DIR/train_command.py $DIR/monitor_gpu.sh $DIR/custom_save.py \
    $DIR_job_submission

In [None]:
%%writefile /userfs/conf_distPyTorch.py
# Bootstrap snippet meant to be placed in front of cli.py: it joins the
# torch.distributed process group as soon as the module is executed.
import os
import torch.distributed as dist


def init_process():
    """Initialise the default torch.distributed process group with the NCCL backend.

    The rendezvous address and this process's identity are read from the
    environment variables MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE
    (presumably exported by the distPyTorch launcher - TODO confirm).

    Raises:
        KeyError: if any of the four variables is missing.
        ValueError: if RANK or WORLD_SIZE is not an integer string.
    """
    env = os.environ
    # Same lookup order as before: address, port, then rank and world size.
    rendezvous_url = 'tcp://{}:{}'.format(env['MASTER_ADDR'], env['MASTER_PORT'])
    process_rank = int(env['RANK'])
    total_processes = int(env['WORLD_SIZE'])
    dist.init_process_group(
        backend='nccl',
        init_method=rendezvous_url,
        rank=process_rank,
        world_size=total_processes,
    )


print('------ initiate process group... ------')
init_process()

In [None]:
# Put the process-group bootstrap in front of the job's cli.py.
# The pristine $DIR/cli.py is the input, so the staged copy is rebuilt from scratch each time:
# re-running this cell no longer stacks a second bootstrap (and a second init_process() call)
# on top of an already patched file, and no temporary file is needed because cat never reads
# the file it is writing to.
!cat /userfs/conf_distPyTorch.py $DIR/cli.py > $DIR_job_submission/cli.py

In [None]:
# Submit train_command.py from the staged folder ($DIR_job_submission) as a distPyTorch job.
# The staged cli.py is expected to already start with the process-group bootstrap written above.
# USER_ACCESS_TOKEN is not set in the cells shown here; presumably the platform provides it - TODO confirm.
# The token is used for --jwt-token and is also forwarded to the job (with BASE_URL) via --msd-env.
!python dlicmd.py --exec-start distPyTorch --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN \
                  --msd-env USER_ACCESS_TOKEN=$USER_ACCESS_TOKEN --msd-env BASE_URL=$BASE_URL \
                  --numWorker 1 --workerMemory 8g \
                  --model-dir $DIR_job_submission --model-main train_command.py \
                  --cs-datastore-meta type=fs,data_path=DeepLIIF_Datasets_Full/

Run example code:

In [None]:
# %env DIR_job_submission=/userfs/job_submission

# !rm -rf $DIR_job_submission
# !mkdir -p $DIR_job_submission

# !cp -r $DIR/wmla-tutorial/pytorch_mnist_dist.py $DIR_job_submission

In [None]:
# !python dlicmd.py --exec-start distPyTorch --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN \
#                   --msd-env USER_ACCESS_TOKEN=$USER_ACCESS_TOKEN --msd-env BASE_URL=$BASE_URL \
#                   --numWorker 1 --workerMemory 8g \
#                   --model-dir $DIR_job_submission --model-main pytorch_mnist_dist.py

### Delete Jobs (and associated results/logs)
#### delete one job

In [None]:
# !python dlicmd.py --exec-delete $NAMESPACE-38 --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN 

#### delete multiple jobs in a loop

In [None]:
# Delete a contiguous range of jobs, one dlicmd.py call per job number.
# range(start, stop) excludes `stop`, so range(330, 331) deletes only job 330.
# IPython substitutes {job_number} from the Python loop variable before the shell sees the
# command, so the counter does not have to be exported with %env; $NAMESPACE, $HOST and
# $USER_ACCESS_TOKEN are expected to come from the environment and are expanded by the shell.
for job_number in range(330, 331):
    !python dlicmd.py --exec-delete $NAMESPACE-{job_number} --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN

### Get Job Status

In [None]:
# !python dlicmd.py --exec-get $NAMESPACE-10 --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN

### Get Job Log
#### last 10 lines

In [None]:
# Print the output log of job $NAMESPACE-341 via --exec-outlogs; change the number to inspect another job.
!python dlicmd.py --exec-outlogs $NAMESPACE-341 --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN

#### full log files

In [None]:
# !python dlicmd.py --exec-trainerrlogs $NAMESPACE-134 --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN

In [None]:
# Destination of the downloaded training log: file name (FN_LOG) and folder (DIR_LOG).
# FN_LOG contains spaces, which is why the cells that use it wrap it in double quotes.
# The commented-out lines are alternative values kept for reference.
# %env FN_LOG=wmla train 20220104 num_threads0,batch_size3,num_workers16,ddp msk.log
%env FN_LOG=deterministic training test.log
%env DIR_LOG=/userfs/log
# %env JOB_ID=

In [None]:
# Save the --exec-trainoutlogs output of job $NAMESPACE-342 to $DIR_LOG/$FN_LOG, then list
# the lines that contain "(epoch:".
# The folder is created first: the shell redirection fails when $DIR_LOG does not exist yet.
!mkdir -p "$DIR_LOG"
# The whole path is quoted because FN_LOG contains spaces.
!python dlicmd.py --exec-trainoutlogs $NAMESPACE-342 --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN > "$DIR_LOG/$FN_LOG"
# -F matches "(epoch:" as a literal string; grep reads the file directly, no cat pipeline needed.
!grep -F "(epoch:" "$DIR_LOG/$FN_LOG"

In [None]:
# Show the last 30 lines of the saved log file (FN_LOG is quoted because it contains spaces).
!tail -n30 $DIR_LOG/"$FN_LOG"

In [None]:
# Print the --app-outlogs output for job $NAMESPACE-253.
# NOTE(review): how --app-outlogs differs from --exec-outlogs is not visible here - confirm in the dlicmd.py help.
!python dlicmd.py --app-outlogs $NAMESPACE-253 --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN