# WML-A Job Submission via WML-A CLI

Official examples can be found here: https://wmla-console-cpd-wmla.apps.datascienceelite.com/ui/#/cliTools

In [1]:
# Shared settings, exported as environment variables so the `!` shell commands below can read them.
# Directory that holds the training code copied into each job submission.
%env DIR=/userfs
# Prefix of WML-A job IDs; jobs are addressed as $NAMESPACE-<number> in the cells below.
%env NAMESPACE=wmla-ns
# Host handed to dlicmd.py through --rest-host.
%env HOST=wmla-console-wmla-ns.apps.datascienceelite.com
# Forwarded to submitted jobs through --msd-env BASE_URL=$BASE_URL.
%env BASE_URL=https://daell-wmla.datascienceelite.com

env: DIR=/userfs
env: NAMESPACE=wmla-ns
env: HOST=wmla-console-wmla-ns.apps.datascienceelite.com
env: BASE_URL=https://daell-wmla.datascienceelite.com


In [2]:
!du -sh /userfs

965M	/userfs


In [3]:
!df -h /userfs

Filesystem                                                                                                                                                Size  Used Avail Use% Mounted on
172.30.160.109:6789,172.30.195.80:6789,172.30.12.225:6789:/volumes/csi/csi-vol-0ca97bb7-40ff-11ec-8633-0a580a80040f/123cc037-9b42-4ab8-a00e-6536da440743   30G   26G  4.9G  84% /userfs


### Submit Jobs
#### PyTorch (single GPU or multiple GPUs on one node with multithreading)

In [4]:
%env DIR_job_submission=/userfs/job_submission

!rm -rf $DIR_job_submission
!mkdir -p $DIR_job_submission

!cp -r $DIR/deepliif $DIR_job_submission
!cp $DIR/train.py $DIR_job_submission
!cp $DIR/train_command.py $DIR_job_submission
!cp $DIR/monitor_gpu.sh $DIR_job_submission
!cp $DIR/custom_save.py $DIR_job_submission

env: DIR_job_submission=/userfs/job_submission


In [16]:
# Submit the staged directory ($DIR_job_submission) as a "PyTorch" job with train_command.py as the main script.
# USER_ACCESS_TOKEN is passed both as --jwt-token and, together with BASE_URL, through --msd-env.
# NOTE(review): USER_ACCESS_TOKEN is not set in any visible cell; it must already exist in the environment.
# NOTE(review): --workerDeviceNum 1 presumably requests one GPU and --rest-port -1 presumably means
# "default port" — confirm against the dlicmd.py help before changing either.
!python dlicmd.py --exec-start PyTorch --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN \
                  --msd-env USER_ACCESS_TOKEN=$USER_ACCESS_TOKEN --msd-env BASE_URL=$BASE_URL \
                  --workerDeviceNum 1 --workerMemory 8g \
                  --model-dir $DIR_job_submission --model-main train_command.py \
                  --cs-datastore-meta type=fs,data_path=DeepLIIF_Datasets_Full/

Copying files and directories ...
Content size: 91.7K
{
  "execId": "wmla-ns-334",
  "appId": "wmla-ns-334"
}


### Submit Jobs
#### distPyTorch (multiprocessing using DDP)

In [38]:
%env DIR_job_submission=/userfs/job_submission

!rm -rf $DIR_job_submission
!mkdir -p $DIR_job_submission

!cp -r $DIR/deepliif $DIR_job_submission
!cp $DIR/train.py $DIR_job_submission
!cp $DIR/train_command.py $DIR_job_submission
!cp $DIR/monitor_gpu.sh $DIR_job_submission
!cp $DIR/custom_save.py $DIR_job_submission

env: DIR_job_submission=/userfs/job_submission


In [39]:
%%writefile /userfs/conf_distPyTorch.py
import os
import torch.distributed as dist
def init_process():
    dist.init_process_group(
        backend='nccl',
        init_method='tcp://' + os.environ['MASTER_ADDR'] + ':' + os.environ['MASTER_PORT'],
        rank=int(os.environ['RANK']),
        world_size=int(os.environ['WORLD_SIZE']))
    
print('------ initiate process group... ------')
init_process()

Overwriting /userfs/conf_distPyTorch.py


In [40]:
# cat cannot change file in place, so we create a new one and use it to overwrite train.py
!cat /userfs/conf_distPyTorch.py $DIR_job_submission/train.py > $DIR_job_submission/train_edited.py
!mv $DIR_job_submission/train_edited.py $DIR_job_submission/train.py

In [41]:
# Submit the staged directory ($DIR_job_submission) as a "distPyTorch" job with train_command.py as the main script.
# USER_ACCESS_TOKEN is passed both as --jwt-token and, together with BASE_URL, through --msd-env.
# NOTE(review): USER_ACCESS_TOKEN is not set in any visible cell; it must already exist in the environment.
# NOTE(review): --numWorker 1 presumably sets the number of worker processes and --rest-port -1
# presumably means "default port" — confirm against the dlicmd.py help before changing either.
!python dlicmd.py --exec-start distPyTorch --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN \
                  --msd-env USER_ACCESS_TOKEN=$USER_ACCESS_TOKEN --msd-env BASE_URL=$BASE_URL \
                  --numWorker 1 --workerMemory 8g \
                  --model-dir $DIR_job_submission --model-main train_command.py \
                  --cs-datastore-meta type=fs,data_path=DeepLIIF_Datasets_Full/

Copying files and directories ...
Content size: 92.0K
{
  "execId": "wmla-ns-351",
  "appId": "wmla-ns-351"
}


Run example code:

In [None]:
# %env DIR_job_submission=/userfs/job_submission

# !rm -rf $DIR_job_submission
# !mkdir -p $DIR_job_submission

# !cp -r $DIR/wmla-tutorial/pytorch_mnist_dist.py $DIR_job_submission

In [None]:
# Example submission of pytorch_mnist_dist.py as a distPyTorch job (uncomment to run).
# The last line deliberately has no trailing backslash so the command is complete once uncommented.
# !python dlicmd.py --exec-start distPyTorch --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN \
#                   --msd-env USER_ACCESS_TOKEN=$USER_ACCESS_TOKEN --msd-env BASE_URL=$BASE_URL \
#                   --numWorker 1 --workerMemory 8g \
#                   --model-dir $DIR_job_submission --model-main pytorch_mnist_dist.py

### Delete Jobs (and associated results/logs)
#### delete one job

In [None]:
# !python dlicmd.py --exec-delete $NAMESPACE-38 --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN 

#### delete multiple jobs in a loop

In [11]:
# Delete every job whose numeric suffix is in range(start, stop). The stop value is exclusive,
# so range(330,331) deletes only job 330. Per the section heading, deleting a job also
# removes its associated results/logs.
for i in range(330,331):
    # Export the loop counter as environment variable `i` so the shell command below can read $i.
    # NOTE(review): presumably required because IPython leaves the whole `!` line unexpanded when it
    # also contains shell-only variables such as $NAMESPACE — confirm before removing this line.
    %env i=$i
    !python dlicmd.py --exec-delete $NAMESPACE-$i --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN 

env: i=330
Delete exec wmla-ns-330 succeed. HTTP:204 


### Get Job Status

In [None]:
# !python dlicmd.py --exec-get $NAMESPACE-10 --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN

### Get Job Log
#### last 10 lines

In [16]:
# Print the recent stdout of each executor of job $NAMESPACE-341 (output is grouped per executor).
!python dlicmd.py --exec-outlogs $NAMESPACE-341 --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN

Executor 1 stdout
*Task <1> SubProcess*: |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
*Task <1> SubProcess*: |        ID   ID                                                   Usage      |
*Task <1> SubProcess*: |  No running processes found                                                 |
*Task <1> SubProcess*: +-----------------------------------------------------------------------------+
*Task <1> SubProcess*: 2022-01-10 17:57:45.312919 39 INFO NVIDIA_VISIBLE_DEVICES=GPU-8de3e9b9-4954-d73e-7705-14111d08c685
*Task <1> SubProcess*: 2022-01-10 17:57:45.327400 39 INFO NVIDIA_DRIVER_CAPABILITIES=compute,utility
*Task <1> SubProcess*: 2022-01-10 17:57:45.358698 39 INFO DATA_SOURCE_RESERVE=false
*Task <1> SubProcess*: 2022-01-10 17:57:45.371814 39 INFO Clean up data source
*Task <1> SubProcess*: 2022-01-10 17:57:45.391655 39 INFO Command exit with 0


Executor 2 stdout
*Task <2> SubProcess*: |  GPU   GI   CI        PID   Type   Process name              

#### full log files

In [None]:
# !python dlicmd.py --exec-trainerrlogs $NAMESPACE-134 --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN

In [24]:
# Name (FN_LOG) and directory (DIR_LOG) of the file the training log is saved to.
# FN_LOG contains spaces, so it has to be double-quoted wherever a shell command uses it.
# %env FN_LOG=wmla train 20220104 num_threads0,batch_size3,num_workers16,ddp msk.log
%env FN_LOG=deterministic training test.log
%env DIR_LOG=/userfs/log
# %env JOB_ID=

env: FN_LOG=deterministic training test.log
env: DIR_LOG=/userfs/log


In [30]:
!python dlicmd.py --exec-trainoutlogs $NAMESPACE-342 --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN > $DIR_LOG/"$FN_LOG"
!cat $DIR_LOG/"$FN_LOG" | grep \(epoch:

(epoch: 0, iters: 100, time: 1.771, data: 0.186) G_GAN_1: 1.364 G_L1_1: 1.684 D_real_1: 0.467 D_fake_1: 0.663 G_GAN_2: 0.856 G_L1_2: 1.548 D_real_2: 0.965 D_fake_2: 0.520 G_GAN_3: 1.087 G_L1_3: 1.151 D_real_3: 0.556 D_fake_3: 0.500 G_GAN_4: 1.198 G_L1_4: 0.667 D_real_4: 0.628 D_fake_4: 0.695 G_GAN_5: 1.052 G_L1_5: 16.801 D_real_5: 0.099 D_fake_5: 0.109 
(epoch: 0, iters: 100, time: 1.783, data: 0.089) G_GAN_1: 1.082 G_L1_1: 1.503 D_real_1: 0.627 D_fake_1: 0.791 G_GAN_2: 0.949 G_L1_2: 1.917 D_real_2: 0.659 D_fake_2: 0.523 G_GAN_3: 0.649 G_L1_3: 0.783 D_real_3: 0.877 D_fake_3: 0.452 G_GAN_4: 0.984 G_L1_4: 0.287 D_real_4: 0.570 D_fake_4: 0.880 G_GAN_5: 1.076 G_L1_5: 20.065 D_real_5: 0.236 D_fake_5: 0.235 


In [15]:
# Show the last 30 lines of the saved log file (FN_LOG is quoted because it contains spaces).
!tail -n30 $DIR_LOG/"$FN_LOG"

Error 400: Cannot find exec id $NAMESPACE-341.



In [None]:
# Fetch the output log for $NAMESPACE-253 via --app-outlogs.
# NOTE(review): presumably this looks the job up by application ID rather than exec ID — confirm
# against the dlicmd.py help.
!python dlicmd.py --app-outlogs $NAMESPACE-253 --rest-host $HOST --rest-port -1 --jwt-token $USER_ACCESS_TOKEN