# Mask R-CNN Distributed training on Azure Machine Learning




In [69]:
%load_ext autoreload
%autoreload 2


import os

from azureml.core import (Workspace, Experiment, 
                          VERSION, Datastore)
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import TensorFlow
from azureml.train.estimator import Estimator, Mpi
from azureml.widgets import RunDetails


# Azure coordinates of the target workspace -- fill these in before running.
SUBSCRIPTION_ID = ""
RESOURCE_GROUP = ""
WORKSPACE_NAME = ""

# Experiment under which the run is submitted, and the GPU cluster to train on.
EXP_NAME = "Azureml-maskRCNN"
CLUSTER_NAME = "gpu-cluster"
# Used both as the cluster's max_nodes and as the estimator's node_count.
NODE_COUNT = 8

# Local folder that holds the training code; train.py is written into it
# by a %%writefile cell further down.
TRAINING_DIR = os.path.join(os.getcwd(), "horovod")

print(f"SDK version: {VERSION}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
SDK version: 1.1.5


In [70]:
# Handle to the Azure ML workspace named by the configuration constants.
ws = Workspace(
    subscription_id=SUBSCRIPTION_ID,
    resource_group=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
)

# Experiment under which the training run is submitted later in the notebook.
exp = Experiment(name=EXP_NAME, workspace=ws)

In [71]:
# Reuse the cluster when the workspace already has an AmlCompute target with
# this name; otherwise provision a new one sized by NODE_COUNT.
registered_targets = ws.compute_targets
if CLUSTER_NAME in registered_targets and registered_targets[CLUSTER_NAME].type == 'AmlCompute':
    print('Found existing compute target.')
    compute_target = registered_targets[CLUSTER_NAME]
else:
    print('Creating a new compute target...')
    cluster_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_NC12',
        max_nodes=NODE_COUNT,
    )
    # Create the cluster.
    compute_target = ComputeTarget.create(ws, CLUSTER_NAME, cluster_config)

print('Checking cluster status...')
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=None,
    timeout_in_minutes=20,
)

Found existing compute target.
Checking cluster status...
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [72]:
%%writefile run.py

import os
import shutil,glob
import argparse
from git.repo.base import Repo


SRC_DIR = '/maskrcnn'
SAMPLES_DIR = os.path.join(SRC_DIR,'samples')
LOGS_DIR = os.path.join(os.getcwd(),'logs')
os.makedirs(LOGS_DIR, exist_ok = True)

parser = argparse.ArgumentParser()
parser.add_argument('--is_distributed', type=bool,help='Distributed training')

args = parser.parse_args()
is_distributed = args.is_distributed

#=====Clone distributed training implementation of Mask_RCNN==========

REPO_URL = "https://github.com/datashinobi/Mask_RCNN.git"
BRANCH = 'yassine/horovod'

if os.path.exists(SRC_DIR):
    print("Repo exists, skip cloning")
else:
    print('Clonerepo..........')
    Repo.clone_from(REPO_URL, SRC_DIR, branch=BRANCH)


#=====move training code to source dir=====
def copy_training_code(src_dir, dst_dir):
    """Copy the directory tree ``src_dir`` into ``dst_dir``.

    ``shutil.copytree`` raises ``FileExistsError`` when the destination already
    exists, which would abort the script whenever the checkout was reused
    (the "Repo exists" path above). When the destination exists, files are
    copied over it one by one instead; files already present in ``dst_dir``
    that have no counterpart in ``src_dir`` are left untouched.

    Args:
        src_dir: Directory containing the training code to copy.
        dst_dir: Destination directory (created if missing).
    """
    if not os.path.exists(dst_dir):
        shutil.copytree(src_dir, dst_dir)
        return
    for current_dir, _subdirs, filenames in os.walk(src_dir):
        relative = os.path.relpath(current_dir, src_dir)
        target_dir = os.path.normpath(os.path.join(dst_dir, relative))
        os.makedirs(target_dir, exist_ok=True)
        for filename in filenames:
            shutil.copy2(os.path.join(current_dir, filename),
                         os.path.join(target_dir, filename))


copy_training_code(os.path.join(os.getcwd(), 'horovod'),
                   os.path.join(SAMPLES_DIR, 'horovod'))

# train.py resolves paths relative to the working directory, so switch into
# the copied folder before importing it.
os.chdir(os.path.join(SAMPLES_DIR, 'horovod'))

from train import run
run(is_distributed, LOGS_DIR)


Overwriting run.py


In [73]:
%%writefile $TRAINING_DIR/train.py

import os

# Two levels above the working directory in effect when this module is imported.
# run.py changes into <repo>/samples/horovod before importing it, so this
# resolves to the root of the cloned Mask_RCNN checkout.
ROOT_DIR = os.path.abspath("../../")


def run(is_distributed, logs_dir):
    """Train the Mask R-CNN head layers on the synthetic shapes dataset.

    Builds training and validation ``ShapesDataset`` instances, downloads the
    COCO weights into ``ROOT_DIR`` if they are missing, loads them (skipping the
    class-dependent output layers) and trains the ``heads`` layers.

    Args:
        is_distributed: When truthy, ``mrcnn.distributed_model`` is used;
            otherwise the regular ``mrcnn.model`` implementation.
        logs_dir: Directory handed to ``MaskRCNN`` as its model/log directory.
    """
    from dataset import ShapesDataset
    from mrcnn.config import Config

    class ShapesConfig(Config):
        """Configuration overrides for training on the shapes dataset."""
        NAME = "shapes"
        GPU_COUNT = 2
        IMAGES_PER_GPU = 2
        # presumably background + 3 shape classes -- TODO confirm against dataset.py
        NUM_CLASSES = 1 + 3
        IMAGE_MIN_DIM = 128
        IMAGE_MAX_DIM = 128
        RPN_ANCHOR_SCALES = (8, 16, 32, 64, 128)
        TRAIN_ROIS_PER_IMAGE = 32
        STEPS_PER_EPOCH = 10
        VALIDATION_STEPS = 5

    config = ShapesConfig()
    config.display()

    # Training dataset
    dataset_train = ShapesDataset()
    dataset_train.load_shapes(500000, config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1])
    dataset_train.prepare()

    # Validation dataset
    dataset_val = ShapesDataset()
    dataset_val.load_shapes(5000, config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1])
    dataset_val.prepare()

    # Pick the model implementation matching the requested training mode.
    if is_distributed:
        import mrcnn.distributed_model as modellib
    else:
        import mrcnn.model as modellib

    from mrcnn import utils

    # Local path to trained weights file
    COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5")

    # Download COCO trained weights from Releases if needed
    if not os.path.exists(COCO_MODEL_PATH):
        utils.download_trained_weights(COCO_MODEL_PATH)

    # Log the devices TensorFlow can see. The list was previously computed and
    # discarded, so it never reached the run logs.
    from tensorflow.python.client import device_lib
    print(device_lib.list_local_devices())

    # Create model in training mode
    model = modellib.MaskRCNN("training", config, logs_dir)

    # Load weights trained on MS COCO, but skip layers that
    # are different due to the different number of classes.
    # See README @ https://github.com/matterport/Mask_RCNN for instructions
    # to download the COCO weights.
    model.load_weights(COCO_MODEL_PATH, by_name=True,
                       exclude=["mrcnn_class_logits", "mrcnn_bbox_fc",
                                "mrcnn_bbox", "mrcnn_mask"])

    model.train(dataset_train, dataset_val,
                learning_rate=config.LEARNING_RATE,
                epochs=1000,
                layers='heads')

Overwriting /extdrive1/home/sasuke/dev/amlsamples/maskRCNN_distributed/horovod/train.py


In [74]:
# Command-line arguments forwarded to run.py; '--is_distributed' makes
# train.py import mrcnn.distributed_model instead of mrcnn.model.
script_params = {'--is_distributed': True}

# TensorFlow estimator that uploads the current directory, installs the
# packages listed in requirements.txt and launches run.py through MPI
# on NODE_COUNT nodes of the GPU cluster.
estimator = TensorFlow(source_directory=os.getcwd(),
                       compute_target=compute_target,
                       entry_script='run.py',
                       pip_requirements_file="requirements.txt",
                       node_count=NODE_COUNT,
                       distributed_training=Mpi(),
                       use_gpu=True,
                       script_params=script_params,
                       framework_version='1.10'
                      )

In [76]:
# Submit the estimator to the experiment and keep the returned run handle.
run = exp.submit(estimator)
# Bare expression: the notebook renders the run summary as the cell output.
run

Experiment,Id,Type,Status,Details Page,Docs Page
Azureml-maskRCNN,Azureml-maskRCNN_1585167843_7b6e658f,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


### Start TensorBoard to track run metrics

In [None]:
from azureml.tensorboard import Tensorboard

# Attach a Tensorboard helper to the submitted run and start it.
tb = Tensorboard([run])
tb.start()

In [None]:
# Stop the Tensorboard instance started by tb.start() once monitoring is done.
tb.stop()