
Commit 7cf6094

Mask R-CNN Best Practice Tutorial

18 files changed: +2689 −0 lines

.gitignore

Lines changed: 1 addition & 0 deletions

__pycache__

README.md

Lines changed: 12 additions & 0 deletions

![NAG Logo](https://www.nag.com/themes/custom/nag/logo.png)

# AzureML Best Practice Reference Implementations

More best practice references will be added as the Collaboration Center work progresses.

### Mask R-CNN: End-to-end training workflow with mounted Blob storage

This tutorial can be found in the [maskrcnn](maskrcnn) directory. It demonstrates an end-to-end workflow for training a distributed model on a file-based dataset stored in Azure Blob storage. We use the Mask R-CNN model as a demonstration workload.
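
For orientation, the pattern the tutorial is built around is handing the run a file dataset registered against Blob storage as a mount point, rather than downloading it up front. A minimal sketch with AzureML SDK v1 follows; the datastore name "coco_blob" and the path "coco2017" are placeholders, not names taken from the tutorial:

# Sketch: mount a file-based dataset held in Azure Blob storage (SDK v1).
# "coco_blob" and "coco2017" are assumed placeholder names.
from azureml.core import Dataset, Datastore, Workspace

ws = Workspace.from_config()
datastore = Datastore.get(ws, "coco_blob")
dataset = Dataset.File.from_files(path=(datastore, "coco2017/**"))

# Passed into a ScriptRunConfig's arguments, the mount resolves to a local
# path inside the training container at run time.
mount = dataset.as_named_input("training_data").as_mount()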

maskrcnn/Dockerfile

Lines changed: 35 additions & 0 deletions

# Build image on top of the NVIDIA PyTorch image
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.02-py3
FROM ${FROM_IMAGE_NAME}

# Pin key package versions
ENV MOFED_VER 5.0-2.1.8.0
ENV AZML_SDK_VER 1.25.0

# Other required variables for MOFED drivers
ENV OS_VER ubuntu20.04
ENV PLATFORM x86_64

### Install Mellanox Drivers ###
RUN apt-get update && apt-get install -y libcap2 libfuse-dev && \
    wget --quiet http://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
    tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
    MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --user-space-only --without-fw-update --all --without-neohost-backend --force

### Install Python Dependencies ###
RUN pip install azureml-defaults==${AZML_SDK_VER}

### Custom additions for specific training ###

# !!!! INSERT YOUR REQUIRED PACKAGE INSTALLATIONS HERE !!!!
# Note that the NVIDIA DeepLearningExamples Mask R-CNN implementation is pre-installed

# Create required mountpoints
RUN mkdir -p /work /data /result

# Finally, configure the workspace
WORKDIR /work
ENV OMP_NUM_THREADS=1
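
The submit script below wires this image into a run through a create_or_update_environment helper imported from the tutorial's common module, which is not part of this excerpt. A plausible minimal sketch of such a helper, assuming the image has been pushed to a registry the workspace can pull from; only the function's name and arguments come from the script, the body is an assumption:

# Sketch of a create_or_update_environment helper (SDK v1); the body is an
# assumption, not the tutorial's actual implementation.
from azureml.core import Environment

def create_or_update_environment(workspace, environment_name, docker_image):
    env = Environment(name=environment_name)
    env.docker.base_image = docker_image
    # The image already carries all Python dependencies, so AzureML should
    # not layer its own conda environment on top.
    env.python.user_managed_dependencies = True
    env.register(workspace)
    return env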

maskrcnn/README.html

Lines changed: 220 additions & 0 deletions
Large diffs are not rendered by default.

maskrcnn/README.md

Lines changed: 528 additions & 0 deletions
Large diffs are not rendered by default.

Lines changed: 145 additions & 0 deletions

#!/usr/bin/env python3

import argparse

from azureml.core import Experiment, ScriptRunConfig
from azureml.core.runconfig import MpiConfiguration

from common import (
    get_or_create_workspace,
    create_or_update_environment,
    create_or_update_cluster,
)

import sharedconfig


k_runclass = "Mount"


def generate_training_opts(num_gpus, ims_per_gpu, max_iter, per_epoch_eval=False):
    """Populate common Mask R-CNN command line options"""
    opts = ["--config-file", "./benchmark_mask_rcnn_R_50_FPN.yaml"]
    opts.extend(["SOLVER.IMS_PER_BATCH", str(num_gpus * ims_per_gpu)])
    opts.extend(["SOLVER.MAX_ITER", str(max_iter)])
    opts.extend(["TEST.IMS_PER_BATCH", str(num_gpus * ims_per_gpu)])
    opts.extend(["PER_EPOCH_EVAL", str(bool(per_epoch_eval))])

    return opts


def parse_command_line_args():
    """Parse command line arguments and return args object"""
    parser = argparse.ArgumentParser(
        description="Submit benchmark runs using mounted blob"
    )

    parser.add_argument("num_nodes", type=int, help="Number of nodes")
    tiers = parser.add_mutually_exclusive_group()
    tiers.add_argument("--premium", action="store_true", help="Use premium storage")
    tiers.add_argument("--cool", action="store_true", help="Use cool storage")
    parser.add_argument("--follow", action="store_true", help="Follow run output")
    parser.add_argument(
        "--iter",
        type=int,
        default=sharedconfig.max_iter,
        help="Number of training iterations",
    )

    return parser.parse_args()


def main():

    # Collect command line arguments
    args = parse_command_line_args()

    # Collect runclass and default (hot) dataset name
    runclass = k_runclass
    dataset = sharedconfig.dataset_hot

    # Replace/update args for using premium storage
    if args.premium:
        runclass += "_premium"
        dataset = sharedconfig.dataset_premium

    # Replace/update args for using cool storage
    if args.cool:
        runclass += "_cool"
        dataset = sharedconfig.dataset_cool

    # Get the AzureML Workspace object
    workspace = get_or_create_workspace(
        sharedconfig.subscription,
        sharedconfig.resource_group,
        sharedconfig.workspace_name,
    )

    # Get and update the ClusterConnector object
    # NOTE: This is *NOT* an azureml.core.compute.AmlCompute object but a wrapper
    # See clusterconnector.py for more details
    clusterconnector = create_or_update_cluster(
        workspace,
        sharedconfig.cluster_name,
        args.num_nodes,
        sharedconfig.ssh_key,
        sharedconfig.vm_type,
        terminate_on_failure=True,
        use_beeond=False,
    )

    # Get and update the AzureML Environment object
    environment = create_or_update_environment(
        workspace, sharedconfig.environment_name, sharedconfig.docker_image
    )

    # Get/Create an experiment object
    experiment = Experiment(workspace=workspace, name=sharedconfig.experiment_name)

    # Configure the distributed compute settings
    pytorchconfig = MpiConfiguration(
        node_count=args.num_nodes, process_count_per_node=sharedconfig.gpus_per_node
    )

    # Collect arguments to be passed to the training script
    script_args = ["--dataset", dataset]
    script_args.extend(
        generate_training_opts(
            args.num_nodes * sharedconfig.gpus_per_node,
            sharedconfig.ims_per_gpu,
            args.iter,
        )
    )
    script_args.extend(["PATHS_CATALOG", "./dataset_catalog.py"])

    # Define the configuration for running the training script
    script_conf = ScriptRunConfig(
        source_directory="train",
        script="train_net_mount.py",
        compute_target=clusterconnector.cluster,
        environment=environment,
        arguments=script_args,
        distributed_job_config=pytorchconfig,
    )

    # We can use these tags to make a note of run parameters (avoids grepping the logs)
    runtags = {
        "class": runclass,
        "vmtype": sharedconfig.vm_type,
        "num_nodes": args.num_nodes,
        "ims_per_gpu": sharedconfig.ims_per_gpu,
        "iter": args.iter,
    }

    # Submit the run
    run = experiment.submit(config=script_conf, tags=runtags)

    # Optionally follow the output on the command line
    if args.follow:
        run.wait_for_completion(show_output=True)


if __name__ == "__main__":
    main()
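
This script imports a sharedconfig module that is not rendered in this excerpt. Its expected attributes can be read off from the usages above; a hypothetical sharedconfig.py with placeholder values only:

# Hypothetical sharedconfig.py -- every value here is a placeholder; only the
# attribute names are taken from the submit script's usage.
subscription = "00000000-0000-0000-0000-000000000000"   # Azure subscription ID
resource_group = "my-resource-group"
workspace_name = "my-workspace"

cluster_name = "gpu-cluster"
vm_type = "Standard_ND40rs_v2"      # example 8-GPU SKU, not confirmed by the commit
gpus_per_node = 8
ssh_key = "ssh-rsa AAAA..."          # public key for cluster access

environment_name = "maskrcnn-env"
docker_image = "myregistry.azurecr.io/maskrcnn:latest"
experiment_name = "maskrcnn-benchmark"

dataset_hot = "coco2017-hot"         # one registered dataset per storage tier
dataset_premium = "coco2017-premium"
dataset_cool = "coco2017-cool"

ims_per_gpu = 4                      # images per GPU per iteration
max_iter = 90000                     # default SOLVER.MAX_ITER

With values like these in place, a two-node run against premium-tier storage would be submitted as: python <submit script> 2 --premium --follow (the submit script's file name is not rendered in this excerpt).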
