# Train and tune hyperparameters with PyTorch

In [1]:
import os
import shutil

import azureml
from azureml.core import Workspace, Experiment
from azureml.widgets import RunDetails
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import PyTorch

from dotenv import set_key, get_key, find_dotenv
from utilities import get_auth

# Report the installed Azure ML core SDK version so that readers can compare
# their environment with the one this notebook was run in.
print("Azure ML SDK Version: ", azureml.core.VERSION)

Azure ML SDK Version:  1.0.41


In [2]:
# Locate the project's .env file; with raise_error_if_not_found=True a missing
# file is reported as an error here instead of surfacing later.
env_path = find_dotenv(raise_error_if_not_found=True)

In [3]:
# Connect to the Azure ML workspace, authenticating through the helper from
# the project's `utilities` module (it is given the path of the .env file).
workspace_auth = get_auth(env_path)
ws = Workspace.from_config(auth=workspace_auth)

# Echo the workspace name, resource group and location, one per line.
workspace_summary = (ws.name, ws.resource_group, ws.location)
print(*workspace_summary, sep="\n")

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


fboyluamlws
fboyluamlrg
eastus


In [4]:
# Experiment named 'torchvision' in the workspace; both the single training
# run and the HyperDrive run later in this notebook are submitted through it.
experiment = Experiment(workspace=ws, name='torchvision')

Let's copy the training script and its dependencies to a script folder.

In [5]:
# Folder handed to the estimator as its source directory; created on demand so
# the cell can be re-run safely.
script_folder = './torchdetect'
os.makedirs(script_folder, exist_ok=True)

# Training entry point (train.py, listed last) and the helper modules copied
# alongside it. Keeping the names in one list avoids one copy-pasted call per
# file and makes it obvious what gets staged.
training_files = [
    'coco_eval.py',
    'coco_utils.py',
    'engine.py',
    'transforms.py',
    'utils.py',
    'maskrcnn_model.py',
    'XMLDataset.py',
    'train.py',
]

# shutil.copy returns the path of each newly created file.
copied_paths = [
    shutil.copy(os.path.join('./scripts', file_name), script_folder)
    for file_name in training_files
]

# Echo the staged entry script as the cell result.
copied_paths[-1]

'./torchdetect/train.py'

## Upload dataset to default datastore

In [6]:
# Default datastore of the workspace; the dataset is uploaded to it below and
# it is later passed to the training script as the data path.
ds = ws.get_default_datastore()

In [7]:
# Display the name of the container backing the default datastore.
ds.container_name

'azureml-blobstore-a29dc687-5001-4ee4-ac74-7c17b122f449'

In [2]:
# Upload the images and their annotations to the default datastore, keeping
# the same folder name on the datastore and overwriting any earlier upload.
for dataset_dir in ('JPEGImages', 'Annotations'):
    ds.upload(src_dir='./scripts/' + dataset_dir,
              target_path=dataset_dir,
              overwrite=True,
              show_progress=True)

## Create AmlCompute

We need a compute target for training the model. Here, we create an [AmlCompute](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute) cluster as our training compute resource; the same cluster is reused later to run the hyperparameter tuning jobs.

In [8]:
# Name of the AmlCompute cluster to reuse or create.
# Replace the value below with the name of your own cluster. The former
# "YOUR_AMLCOMPUTE_CLUSTER_NAME" placeholder assignment was removed because it
# was immediately overwritten and never took effect.
cluster_name = "fboyluamlgpuclus"

In [9]:
# Persist the chosen cluster name in the .env file under the "cluster_name" key.
set_key(env_path, "cluster_name", cluster_name)

(True, 'cluster_name', 'fboyluamlgpuclus')

In [16]:
# Reuse the cluster when it already exists in the workspace; otherwise
# provision it and block until provisioning has completed.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=4,
    )

    # create the cluster
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target.')

# get_status() returns a detailed status for the current cluster;
# serialize() turns it into a printable form.
cluster_status = compute_target.get_status()
print(cluster_status.serialize())

Found existing compute target.
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-06-21T05:35:48.906000+00:00', 'errors': None, 'creationTime': '2019-06-11T18:48:22.478352+00:00', 'modifiedTime': '2019-06-11T18:49:10.788499+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}


## Create a PyTorch Estimator

In [10]:
# Source directory for the estimator (same value as set when the scripts were
# staged; repeated here so this section can be read on its own).
script_folder = './torchdetect'
# Name of the custom Docker image, read from the .env file.
image_name = get_key(env_path, 'image_name')

In [11]:
from azureml.core.container_registry import ContainerRegistry

In [12]:
# Point to an image in a private Azure Container Registry. The server address
# and credentials are read from the .env file rather than hard-coded here.
image_registry_details = ContainerRegistry()
registry_settings = (
    ('address', 'acr_server_name'),
    ('username', 'acr_username'),
    ('password', 'acr_password'),
)
for attribute_name, env_key in registry_settings:
    setattr(image_registry_details, attribute_name, get_key(env_path, env_key))

In [17]:
# Command-line arguments passed to train.py for the single training run.
script_params = {
    # The default datastore (where the dataset was uploaded), requested as a mount.
    '--data_path': ds.as_mount(),
    '--workers': 8,
    '--learning_rate': 0.005,
    '--epochs': 4,
    # Comma-separated lists are passed as plain strings.
    '--anchor_sizes': '16,32,64,128,256,512',
    '--anchor_aspect_ratios': '0.25,0.5,1.0,2.0',
    '--rpn_nms_thresh': 0.5,
    '--box_nms_thresh': 0.3,
    '--box_score_thresh': 0.10,
}

# Estimator that runs train.py from the staged script folder on the GPU
# cluster, inside the custom Docker image pulled from the private registry.
# NOTE(review): user_managed=True presumably makes the run use the image's own
# Python environment as-is; confirm against the estimator documentation.
estimator = PyTorch(
    source_directory=script_folder,
    entry_script='train.py',
    script_params=script_params,
    compute_target=compute_target,
    use_gpu=True,
    use_docker=True,
    custom_docker_image=image_name,
    image_registry_details=image_registry_details,
    user_managed=True,
)

framework_version is not specified, defaulting to version 1.1.


In [18]:
# Optional run-configuration overrides, intentionally left disabled:
# estimator.run_config.environment.python.interpreter_path = '/data/anaconda/envs/torchdetectaml/bin/python'
# estimator.run_config.history.snapshot_project = False

# Put /cocoapi/PythonAPI/ on PYTHONPATH for the run.
# NOTE(review): presumably this is where the COCO Python API lives inside the
# custom Docker image; confirm against the image definition.
run_environment = estimator.run_config.environment
run_environment.environment_variables["PYTHONPATH"] = "$PYTHONPATH:/cocoapi/PythonAPI/"

### Submit job

In [19]:
# Submit the estimator to the experiment and print the resulting run
# (experiment name, run id, type and current status).
run = experiment.submit(estimator)
print(run)

Run(Experiment: torchvision,
Id: torchvision_1561663460_9a54ec0d,
Type: azureml.scriptrun,
Status: Starting)


In [20]:
# Show the Azure ML run-details widget for the submitted run.
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [21]:
# Print the full details of the run (run id, target, status, properties and
# the run definition, including the arguments passed to train.py).
print(run.get_details())

{'runId': 'torchvision_1561663460_9a54ec0d', 'target': 'fboyluamlgpuclus', 'status': 'Queued', 'properties': {'azureml.runsource': 'experiment', 'ContentSnapshotId': '57867cea-1720-41cc-84d1-c22d2d42cf9f', 'azureml.git.repository_uri': 'https://github.com/Microsoft/HyperdriveDeepLearningHappyPathTutorial.git', 'mlflow.source.git.repoURL': 'https://github.com/Microsoft/HyperdriveDeepLearningHappyPathTutorial.git', 'azureml.git.branch': 'fboylu_pytorch', 'mlflow.source.git.branch': 'fboylu_pytorch', 'azureml.git.commit': '955e72ce63186f9a55cfdc34cf40b4e85c31f4a4', 'mlflow.source.git.commit': '955e72ce63186f9a55cfdc34cf40b4e85c31f4a4', 'azureml.git.dirty': 'False'}, 'runDefinition': {'script': 'train.py', 'arguments': ['--data_path', '$AZUREML_DATAREFERENCE_workspaceblobstore', '--workers', '8', '--learning_rate', '0.005', '--epochs', '5', '--anchor_sizes', '16,32,64,128,256,512', '--anchor_aspect_ratios', '0.25,0.5,1.0,2.0', '--rpn_nms_thresh', '0.5', '--box_nms_thresh', '0.3', '--box_sc

In [22]:
# Block until the run finishes, streaming its logs into the cell output.
# If streaming is interrupted, the run keeps executing on the compute target
# (see the ExperimentExecutionException message recorded in the output).
run.wait_for_completion(show_output=True)

RunId: torchvision_1561663460_9a54ec0d
Web View: https://mlworkspace.azure.ai/portal/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/fboyluamlrg/providers/Microsoft.MachineLearningServices/workspaces/fboyluamlws/experiments/torchvision/runs/torchvision_1561663460_9a54ec0d

Streaming azureml-logs/70_driver_log.txt

Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /root/.cache/torch/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth

0.0%
0.0%
0.0%
0.0%
0.0%
0.0%
0.0%
0.0%
0.0%
0.0%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.1%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.2%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.3%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.4%
0.5%
0.5%
0.5%


20.2%
20.2%
20.2%
20.2%
20.2%
20.2%
20.2%
20.2%
20.2%
20.2%
20.2%
20.2%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.3%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.4%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.5%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.6%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.7%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.8%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
20.9%
21.0%
21.0%
21.0

26.5%
26.5%
26.5%
26.5%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.6%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.7%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.8%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
26.9%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.0%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.1%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.2%
27.3%
27.3%
27.3%
27.3%
27.3%
27.3%
27.3%
27.3%
27.3%
27.3

44.0%
44.0%
44.0%
44.0%
44.0%
44.0%
44.0%
44.0%
44.0%
44.0%
44.0%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.1%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.2%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.3%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.4%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.5%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.6%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.7%
44.8%
44.8%
44.8%
44.8

60.2%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.3%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.4%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.5%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.6%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.7%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.8%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
60.9%
61.0%
61.0%
61.0%
61.0%
61.0%
61.0%
61.0%
61.0%
61.0%
61.0%
61.0%
61.0%
61.0%
61.0

79.5%
79.5%
79.5%
79.5%
79.5%
79.5%
79.5%
79.5%
79.5%
79.5%
79.5%
79.5%
79.5%
79.5%
79.5%
79.5%
79.5%
79.5%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.6%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.7%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.8%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
79.9%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.0%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.1%
80.2%
80.2%
80.2%
80.2%
80.2%
80.2%
80.2%
80.2%
80.2%
80.2%
80.2%
80.2%
80.2%
80.2%
80.2%
80.2%
80.2%
80.2

Epoch: [0]  [  0/424]  eta: 0:23:35  lr: 0.000017  loss: 1.4324 (1.4324)  loss_classifier: 0.3939 (0.3939)  loss_box_reg: 0.0033 (0.0033)  loss_objectness: 0.6973 (0.6973)  loss_rpn_box_reg: 0.3379 (0.3379)  time: 3.3381  data: 0.8438  max mem: 2807
Epoch: [0]  [ 10/424]  eta: 0:14:08  lr: 0.000135  loss: 1.2286 (1.2488)  loss_classifier: 0.3466 (0.3329)  loss_box_reg: 0.0106 (0.0133)  loss_objectness: 0.6919 (0.6929)  loss_rpn_box_reg: 0.2000 (0.2097)  time: 2.0491  data: 0.0796  max mem: 5336
Epoch: [0]  [ 20/424]  eta: 0:13:27  lr: 0.000253  loss: 1.1418 (1.1517)  loss_classifier: 0.1841 (0.2299)  loss_box_reg: 0.0106 (0.0187)  loss_objectness: 0.6933 (0.6938)  loss_rpn_box_reg: 0.2000 (0.2094)  time: 1.9325  data: 0.0038  max mem: 5926
Epoch: [0]  [ 30/424]  eta: 0:12:48  lr: 0.000371  loss: 0.9514 (1.0850)  loss_classifier: 0.0724 (0.1814)  loss_box_reg: 0.0076 (0.0185)  loss_objectness: 0.6926 (0.6921)  loss_rpn_box_reg: 0.1748 (0.1929)  time: 1.8951  data: 0.0050  max mem: 6217


Epoch: [0]  [330/424]  eta: 0:03:05  lr: 0.003914  loss: 0.3691 (0.6663)  loss_classifier: 0.0548 (0.0909)  loss_box_reg: 0.0419 (0.0697)  loss_objectness: 0.0514 (0.3042)  loss_rpn_box_reg: 0.2071 (0.2016)  time: 1.9718  data: 0.0059  max mem: 6501
Epoch: [0]  [340/424]  eta: 0:02:46  lr: 0.004032  loss: 0.3478 (0.6571)  loss_classifier: 0.0394 (0.0898)  loss_box_reg: 0.0349 (0.0694)  loss_objectness: 0.0515 (0.2972)  loss_rpn_box_reg: 0.1865 (0.2007)  time: 1.9393  data: 0.0058  max mem: 6501
Epoch: [0]  [350/424]  eta: 0:02:26  lr: 0.004150  loss: 0.3017 (0.6476)  loss_classifier: 0.0408 (0.0885)  loss_box_reg: 0.0410 (0.0690)  loss_objectness: 0.0550 (0.2903)  loss_rpn_box_reg: 0.1754 (0.1998)  time: 1.9901  data: 0.0059  max mem: 6501
Epoch: [0]  [360/424]  eta: 0:02:06  lr: 0.004268  loss: 0.3309 (0.6389)  loss_classifier: 0.0494 (0.0877)  loss_box_reg: 0.0533 (0.0689)  loss_objectness: 0.0528 (0.2838)  loss_rpn_box_reg: 0.1489 (0.1984)  time: 1.9855  data: 0.0059  max mem: 6501


Epoch: [1]  [160/424]  eta: 0:08:43  lr: 0.005000  loss: 0.3075 (0.3029)  loss_classifier: 0.0519 (0.0543)  loss_box_reg: 0.0504 (0.0586)  loss_objectness: 0.0393 (0.0417)  loss_rpn_box_reg: 0.1435 (0.1484)  time: 1.9025  data: 0.0056  max mem: 6525
Epoch: [1]  [170/424]  eta: 0:08:22  lr: 0.005000  loss: 0.2595 (0.3015)  loss_classifier: 0.0466 (0.0540)  loss_box_reg: 0.0544 (0.0584)  loss_objectness: 0.0393 (0.0418)  loss_rpn_box_reg: 0.1404 (0.1473)  time: 1.8743  data: 0.0056  max mem: 6525
Epoch: [1]  [180/424]  eta: 0:08:03  lr: 0.005000  loss: 0.2516 (0.2989)  loss_classifier: 0.0430 (0.0530)  loss_box_reg: 0.0521 (0.0575)  loss_objectness: 0.0346 (0.0413)  loss_rpn_box_reg: 0.1269 (0.1471)  time: 1.9624  data: 0.0057  max mem: 6525
Epoch: [1]  [190/424]  eta: 0:07:43  lr: 0.005000  loss: 0.2482 (0.2980)  loss_classifier: 0.0349 (0.0529)  loss_box_reg: 0.0443 (0.0572)  loss_objectness: 0.0272 (0.0407)  loss_rpn_box_reg: 0.1355 (0.1472)  time: 1.9803  data: 0.0057  max mem: 6525


Epoch: [2]  [  0/424]  eta: 0:16:57  lr: 0.005000  loss: 0.3972 (0.3972)  loss_classifier: 0.0640 (0.0640)  loss_box_reg: 0.0745 (0.0745)  loss_objectness: 0.0172 (0.0172)  loss_rpn_box_reg: 0.2414 (0.2414)  time: 2.4006  data: 0.2829  max mem: 6525
Epoch: [2]  [ 10/424]  eta: 0:14:25  lr: 0.005000  loss: 0.2972 (0.3008)  loss_classifier: 0.0446 (0.0542)  loss_box_reg: 0.0435 (0.0526)  loss_objectness: 0.0252 (0.0295)  loss_rpn_box_reg: 0.1627 (0.1646)  time: 2.0898  data: 0.0323  max mem: 6525
Epoch: [2]  [ 20/424]  eta: 0:13:24  lr: 0.005000  loss: 0.2530 (0.2817)  loss_classifier: 0.0446 (0.0498)  loss_box_reg: 0.0414 (0.0487)  loss_objectness: 0.0272 (0.0335)  loss_rpn_box_reg: 0.1330 (0.1496)  time: 1.9719  data: 0.0054  max mem: 6525
Epoch: [2]  [ 30/424]  eta: 0:13:00  lr: 0.005000  loss: 0.2733 (0.2885)  loss_classifier: 0.0502 (0.0504)  loss_box_reg: 0.0523 (0.0523)  loss_objectness: 0.0316 (0.0355)  loss_rpn_box_reg: 0.1330 (0.1503)  time: 1.9213  data: 0.0046  max mem: 6525


Epoch: [2]  [330/424]  eta: 0:03:05  lr: 0.005000  loss: 0.2271 (0.2417)  loss_classifier: 0.0374 (0.0459)  loss_box_reg: 0.0299 (0.0489)  loss_objectness: 0.0191 (0.0264)  loss_rpn_box_reg: 0.1213 (0.1206)  time: 1.9285  data: 0.0059  max mem: 6583
Epoch: [2]  [340/424]  eta: 0:02:45  lr: 0.005000  loss: 0.2027 (0.2405)  loss_classifier: 0.0295 (0.0457)  loss_box_reg: 0.0282 (0.0486)  loss_objectness: 0.0152 (0.0261)  loss_rpn_box_reg: 0.1094 (0.1201)  time: 1.9584  data: 0.0059  max mem: 6583
Epoch: [2]  [350/424]  eta: 0:02:25  lr: 0.005000  loss: 0.1952 (0.2393)  loss_classifier: 0.0298 (0.0454)  loss_box_reg: 0.0396 (0.0485)  loss_objectness: 0.0158 (0.0258)  loss_rpn_box_reg: 0.1010 (0.1195)  time: 2.0014  data: 0.0061  max mem: 6583
Epoch: [2]  [360/424]  eta: 0:02:06  lr: 0.005000  loss: 0.2512 (0.2400)  loss_classifier: 0.0473 (0.0458)  loss_box_reg: 0.0613 (0.0490)  loss_objectness: 0.0203 (0.0258)  loss_rpn_box_reg: 0.1069 (0.1194)  time: 2.0634  data: 0.0062  max mem: 6583


Epoch: [3]  [160/424]  eta: 0:08:41  lr: 0.000500  loss: 0.1529 (0.1713)  loss_classifier: 0.0323 (0.0386)  loss_box_reg: 0.0268 (0.0400)  loss_objectness: 0.0142 (0.0165)  loss_rpn_box_reg: 0.0693 (0.0763)  time: 1.9343  data: 0.0057  max mem: 6583
Epoch: [3]  [170/424]  eta: 0:08:23  lr: 0.000500  loss: 0.1771 (0.1739)  loss_classifier: 0.0342 (0.0394)  loss_box_reg: 0.0316 (0.0407)  loss_objectness: 0.0145 (0.0165)  loss_rpn_box_reg: 0.0790 (0.0773)  time: 2.0092  data: 0.0062  max mem: 6583
Epoch: [3]  [180/424]  eta: 0:08:02  lr: 0.000500  loss: 0.1586 (0.1719)  loss_classifier: 0.0338 (0.0390)  loss_box_reg: 0.0294 (0.0399)  loss_objectness: 0.0145 (0.0164)  loss_rpn_box_reg: 0.0797 (0.0766)  time: 1.9864  data: 0.0065  max mem: 6583
Epoch: [3]  [190/424]  eta: 0:07:42  lr: 0.000500  loss: 0.1586 (0.1712)  loss_classifier: 0.0347 (0.0388)  loss_box_reg: 0.0250 (0.0395)  loss_objectness: 0.0160 (0.0164)  loss_rpn_box_reg: 0.0795 (0.0765)  time: 1.9378  data: 0.0061  max mem: 6583


Epoch: [4]  [  0/424]  eta: 0:13:45  lr: 0.000500  loss: 0.1035 (0.1035)  loss_classifier: 0.0239 (0.0239)  loss_box_reg: 0.0113 (0.0113)  loss_objectness: 0.0119 (0.0119)  loss_rpn_box_reg: 0.0564 (0.0564)  time: 1.9473  data: 0.2865  max mem: 6583
Epoch: [4]  [ 10/424]  eta: 0:13:51  lr: 0.000500  loss: 0.1339 (0.1735)  loss_classifier: 0.0310 (0.0392)  loss_box_reg: 0.0368 (0.0401)  loss_objectness: 0.0119 (0.0163)  loss_rpn_box_reg: 0.0818 (0.0779)  time: 2.0078  data: 0.0299  max mem: 6583
Epoch: [4]  [ 20/424]  eta: 0:13:22  lr: 0.000500  loss: 0.1339 (0.1563)  loss_classifier: 0.0303 (0.0339)  loss_box_reg: 0.0240 (0.0341)  loss_objectness: 0.0099 (0.0129)  loss_rpn_box_reg: 0.0800 (0.0754)  time: 1.9895  data: 0.0043  max mem: 6583


ExperimentExecutionException: The output streaming for the run interrupted.
But the run is still executing on the compute target. 
Details for canceling the run can be found here: https://aka.ms/aml-docs-cancel-run

## Tune Model Hyperparameters

In [25]:
# Base estimator for hyperparameter tuning.
#
# PyTorch is already imported in the notebook's first cell, so the duplicate
# import that used to open this cell has been dropped.
#
# Only the arguments that stay fixed across trials are listed here. Unlike the
# single training run, learning rate, RPN NMS threshold, anchor sizes and
# anchor aspect ratios are omitted so that the HyperDrive sampling
# configuration defined below can supply them for each trial.
script_params = {
    '--data_path': ds.as_mount(),
    '--workers': 8,
    '--epochs': 8,
    '--box_nms_thresh': 0.3,
    '--box_score_thresh': 0.10,
}

# Same source folder, compute target, Docker image and registry settings as the
# single training run.
estimator = PyTorch(source_directory=script_folder,
                    script_params=script_params,
                    compute_target=compute_target,
                    entry_script='train.py',
                    use_docker=True,
                    custom_docker_image=image_name,
                    image_registry_details=image_registry_details,
                    user_managed=True,
                    use_gpu=True)

# Put /cocoapi/PythonAPI/ on PYTHONPATH for every trial, as in the single run.
estimator.run_config.environment.environment_variables["PYTHONPATH"] = "$PYTHONPATH:/cocoapi/PythonAPI/"

framework_version is not specified, defaulting to version 1.1.


In [26]:
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, uniform, choice, HyperDriveConfig, PrimaryMetricGoal

In [27]:
# Search space for random sampling. The keys correspond to the train.py flags
# that the single training run passed explicitly (--learning_rate,
# --rpn_nms_thresh, --anchor_sizes, --anchor_aspect_ratios).
hyperparameter_space = {
    # Continuous ranges, sampled uniformly.
    'learning_rate': uniform(0.0005, 0.005),
    'rpn_nms_thresh': uniform(0.3, 0.7),
    # Discrete options; each option is a comma-separated list passed as one string.
    'anchor_sizes': choice(
        '16',
        '16,32',
        '16,32,64',
        '16,32,64,128',
        '16,32,64,128,256',
        '16,32,64,128,256,512',
    ),
    'anchor_aspect_ratios': choice(
        '0.25',
        '0.25,0.5',
        '0.25,0.5,1.0',
        '0.25,0.5,1.0,2.0',
    ),
}

param_sampling = RandomParameterSampling(hyperparameter_space)

In [28]:
# Upper bound on the number of runs in the HyperDrive experiment; passed to
# HyperDriveConfig as max_total_runs.
max_total_runs = 4

In [29]:
# Early-termination policy for poorly performing trials.
# NOTE(review): see the BanditPolicy documentation for the exact meaning of
# slack_factor, evaluation_interval and delay_evaluation.
early_termination_policy = BanditPolicy(
    evaluation_interval=2,
    delay_evaluation=2,
    slack_factor=0.15,
)

# Tie together the base estimator, the search space and the policy.
# HyperDrive is asked to maximize the metric named 'mAP@IoU=0.50'.
# NOTE(review): presumably train.py logs a metric under exactly this name;
# verify against the training script.
hyperdrive_config = HyperDriveConfig(
    estimator=estimator,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='mAP@IoU=0.50',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=max_total_runs,
    max_concurrent_runs=4,
)

In [30]:
# Submit the HyperDrive configuration to the same experiment as the single run.
hyperdrive_run = experiment.submit(hyperdrive_config)

In [31]:
# Show the Azure ML run-details widget for the HyperDrive run.
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [32]:
# Block until the HyperDrive run finishes, streaming its log to the cell output.
hyperdrive_run.wait_for_completion(show_output=True)

RunId: torchvision_1561667693326
Web View: https://mlworkspace.azure.ai/portal/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/fboyluamlrg/providers/Microsoft.MachineLearningServices/workspaces/fboyluamlws/experiments/torchvision/runs/torchvision_1561667693326

Streaming azureml-logs/hyperdrive.txt

"<START>[2019-06-27T20:34:53.810711][API][INFO]Experiment created<END>\n""<START>[2019-06-27T20:34:54.723016][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2019-06-27T20:34:54.920213][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2019-06-27T20:34:55.4655731Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2019-06-27T20:35:24.744216][GENERATOR][INFO]Max number of jobs '4' reached for experiment.<END>\n""<START>[2019-06-27T20:35:24.971196][GENERATOR][INFO]All jobs generated.<END>\n"<START>[20

{'runId': 'torchvision_1561667693326',
 'target': 'fboyluamlgpuclus',
 'status': 'Completed',
 'startTimeUtc': '2019-06-27T20:34:53.644428Z',
 'endTimeUtc': '2019-06-27T22:41:44.251618Z',
 'properties': {'primary_metric_config': '{"name": "mAP@IoU=0.50", "goal": "maximize"}',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'baggage': 'eyJvaWQiOiAiMDE5MmNhNDMtZTE3OS00NWE2LTg5ZjQtZjQ4YWViZGNjNTIyIiwgInRpZCI6ICI3MmY5ODhiZi04NmYxLTQxYWYtOTFhYi0yZDdjZDAxMWRiNDciLCAidW5hbWUiOiAiMDRiMDc3OTUtOGRkYi00NjFhLWJiZWUtMDJmOWUxYmY3YjQ2In0',
  'ContentSnapshotId': '57867cea-1720-41cc-84d1-c22d2d42cf9f'},
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://fboyluamstorage3fdfe5af9.blob.core.windows.net/azureml/ExperimentRun/dcid.torchvision_1561667693326/azureml-logs/hyperdrive.txt?sv=2018-03-28&sr=b&sig=lUZ00Ds1XwBaZNWp65gKZkpScvtwkNFXFZ59pXKmCkA%3D&st=2019-06-27T22%3A31%3A45Z&se=2019-06-28T06%3A41%3A45Z&sp=r'}}