# Train and tune hyperparameters with PyTorch

In [1]:
import os
import shutil

import azureml
from azureml.core import Workspace, Experiment
from azureml.widgets import RunDetails
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.train.dnn import PyTorch

from dotenv import set_key, get_key, find_dotenv
from utilities import get_auth

# Print the installed Azure ML core SDK version so it is recorded next to the
# outputs of this notebook.
print("Azure ML SDK Version: ", azureml.core.VERSION)

Azure ML SDK Version:  1.0.41


In [2]:
# Locate the project's .env file and fail fast if it is missing
# (raise_error_if_not_found=True). The path is reused below for authentication
# and for reading/writing settings with get_key/set_key.
env_path = find_dotenv(raise_error_if_not_found=True)

In [3]:
# Load the Azure ML workspace from its config, authenticating through the
# project's `utilities.get_auth` helper, then print the workspace name,
# resource group and location (one per line).
ws = Workspace.from_config(auth=get_auth(env_path))
print(ws.name, ws.resource_group, ws.location, sep="\n")

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


fboyluamlws
fboyluamlrg
eastus


In [4]:
# Experiment named 'torchvision'; both the single training run and the
# hyperparameter tuning run below are submitted to it.
experiment = Experiment(workspace=ws, name='torchvision')

Let's copy the training script and its dependencies to a script folder.

In [5]:
# Stage the training entry point (train.py) and the helper modules that sit
# next to it into one folder; that folder is later handed to the estimator as
# its `source_directory`.
script_folder = './torchdetect'
os.makedirs(script_folder, exist_ok=True)

# Files to stage, all taken from ./scripts. Add new dependencies of train.py here.
training_files = [
    'coco_eval.py',
    'coco_utils.py',
    'engine.py',
    'transforms.py',
    'utils.py',
    'maskrcnn_model.py',
    'XMLDataset.py',
    'train.py',
]

# One loop instead of eight copy-pasted calls; as a statement it also avoids
# echoing the return value of the last shutil.copy as the cell output.
for file_name in training_files:
    shutil.copy(os.path.join('./scripts', file_name), script_folder)

'./torchdetect/train.py'

## Upload dataset to default datastore

In [6]:
# Default datastore of the workspace: the dataset is uploaded to it below and
# it is mounted into the training job via `ds.as_mount()`.
ds = ws.get_default_datastore()

In [7]:
# Display the datastore's container name.
ds.container_name

'azureml-blobstore-a29dc687-5001-4ee4-ac74-7c17b122f449'

In [2]:
# Upload the local image and annotation folders to the datastore, keeping the
# same folder names ('JPEGImages', 'Annotations') as target paths.
# overwrite=True and show_progress=True are passed to both uploads.
# NOTE(review): presumably train.py expects these two folder names under the
# mounted --data_path; verify against the training script.
ds.upload(src_dir='./scripts/JPEGImages', target_path='JPEGImages', overwrite=True, show_progress=True)
ds.upload(src_dir='./scripts/Annotations', target_path='Annotations', overwrite=True, show_progress=True)

## Create AmlCompute

We need a compute target for training the model. Here, we create an [AmlCompute](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute) cluster as our training compute resource; the same cluster is reused later to run the hyperparameter tuning jobs.

In [8]:
# Choose a name for your cluster.
# The value below is a placeholder: replace it with your own cluster name
# before running the following cells, which look up (or create) a compute
# target with exactly this name.
cluster_name = "YOUR_AMLCOMPUTE_CLUSTER_NAME"


In [9]:
# Store the cluster name in the .env file under the key "cluster_name"
# (presumably so other notebooks of this project can read it back; verify).
set_key(env_path, "cluster_name", cluster_name)

(True, 'cluster_name', 'fboyluamlgpuclus')

In [16]:
# Look the cluster up by name first; only when the lookup raises
# ComputeTargetException is a new AmlCompute cluster provisioned.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    # STANDARD_NC6 VMs, scaling up to at most 4 nodes.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=4,
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    # Wait for provisioning to finish before the cluster is used.
    compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target.')

# Print a detailed, serialized status of the cluster that will be used.
cluster_status = compute_target.get_status()
print(cluster_status.serialize())

Found existing compute target.
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-06-21T05:35:48.906000+00:00', 'errors': None, 'creationTime': '2019-06-11T18:48:22.478352+00:00', 'modifiedTime': '2019-06-11T18:49:10.788499+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}


## Create a PyTorch Estimator

In [10]:
# Folder that holds train.py and its helper modules (same path the scripts
# were copied to earlier in this notebook).
script_folder = './torchdetect'
# Name of the custom Docker image, read from the .env file (key 'image_name').
image_name = get_key(env_path, 'image_name')

In [11]:
from azureml.core.container_registry import ContainerRegistry

In [12]:
# Point to an image in a private Azure Container Registry (ACR).
# Server address and credentials are read from the .env file instead of being
# hardcoded in the notebook.
image_registry_details = ContainerRegistry()
image_registry_details.address = get_key(env_path, 'acr_server_name')
image_registry_details.username = get_key(env_path, 'acr_username')
image_registry_details.password = get_key(env_path, 'acr_password')

In [17]:
# Model/training settings for this single baseline run. The hyperparameter
# tuning section later samples learning_rate, rpn_nms_thresh, anchor_sizes and
# anchor_aspect_ratios instead of fixing them.
training_hyperparameters = {
    '--learning_rate': 0.005,
    '--epochs': 4,
    '--anchor_sizes': '16,32,64,128,256,512',
    '--anchor_aspect_ratios': '0.25,0.5,1.0,2.0',
    '--rpn_nms_thresh': 0.5,
    '--box_nms_thresh': 0.3,
    '--box_score_thresh': 0.10,
}

# Full argument set for train.py: the mounted datastore and the number of
# workers come first, followed by the settings above (key order is kept).
script_params = {
    '--data_path': ds.as_mount(),
    '--workers': 8,
    **training_hyperparameters,
}

# Estimator settings: run train.py from the staged script folder on the GPU
# cluster, inside the custom Docker image pulled from the private registry.
# user_managed=True is passed because the image is a custom one.
# NOTE(review): presumably this means the image already provides the Python
# environment; confirm against the SDK documentation.
estimator_settings = dict(
    source_directory=script_folder,
    script_params=script_params,
    compute_target=compute_target,
    entry_script='train.py',
    use_docker=True,
    custom_docker_image=image_name,
    image_registry_details=image_registry_details,
    user_managed=True,
    use_gpu=True,
)
estimator = PyTorch(**estimator_settings)

framework_version is not specified, defaulting to version 1.1.


In [18]:
# Optional run-config overrides, currently disabled:
# estimator.run_config.environment.python.interpreter_path = '/data/anaconda/envs/torchdetectaml/bin/python'
# estimator.run_config.history.snapshot_project = False
# Add /cocoapi/PythonAPI/ to PYTHONPATH for the run (presumably where the
# custom image keeps the COCO Python API; verify against the image).
# NOTE(review): "$PYTHONPATH" is set here as a literal string; confirm that it
# is expanded inside the container rather than used verbatim.
estimator.run_config.environment.environment_variables["PYTHONPATH"] = "$PYTHONPATH:/cocoapi/PythonAPI/"

### Submit job

In [19]:
# Submit the estimator as a run of the experiment and print the run summary.
run = experiment.submit(estimator)
print(run)

Run(Experiment: torchvision,
Id: torchvision_1561663460_9a54ec0d,
Type: azureml.scriptrun,
Status: Starting)


In [20]:
# Show the Azure ML notebook widget for this run.
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [21]:
# Print the full details of the run (run id, target, status, properties,
# run definition, ...).
print(run.get_details())

{'runId': 'torchvision_1561663460_9a54ec0d', 'target': 'fboyluamlgpuclus', 'status': 'Queued', 'properties': {'azureml.runsource': 'experiment', 'ContentSnapshotId': '57867cea-1720-41cc-84d1-c22d2d42cf9f', 'azureml.git.repository_uri': 'https://github.com/Microsoft/HyperdriveDeepLearningHappyPathTutorial.git', 'mlflow.source.git.repoURL': 'https://github.com/Microsoft/HyperdriveDeepLearningHappyPathTutorial.git', 'azureml.git.branch': 'fboylu_pytorch', 'mlflow.source.git.branch': 'fboylu_pytorch', 'azureml.git.commit': '955e72ce63186f9a55cfdc34cf40b4e85c31f4a4', 'mlflow.source.git.commit': '955e72ce63186f9a55cfdc34cf40b4e85c31f4a4', 'azureml.git.dirty': 'False'}, 'runDefinition': {'script': 'train.py', 'arguments': ['--data_path', '$AZUREML_DATAREFERENCE_workspaceblobstore', '--workers', '8', '--learning_rate', '0.005', '--epochs', '5', '--anchor_sizes', '16,32,64,128,256,512', '--anchor_aspect_ratios', '0.25,0.5,1.0,2.0', '--rpn_nms_thresh', '0.5', '--box_nms_thresh', '0.3', '--box_sc

In [33]:
# Wait for the training run to complete, with show_output=True so its output
# is shown in the notebook while waiting.
run.wait_for_completion(show_output=True)

## Tune Model Hyperparameters

In [25]:
# Estimator used as the template for every HyperDrive child run.
# `PyTorch` is already imported in the notebook's first cell, so it is not
# re-imported here.
#
# Only the settings that stay fixed across child runs are listed:
# learning_rate, rpn_nms_thresh, anchor_sizes and anchor_aspect_ratios are
# left out on purpose because they are sampled by the hyperparameter search
# space defined in the next cells.
# NOTE: this deliberately rebinds `script_params` and `estimator` from the
# single-run section; the HyperDrive config below reads `estimator`.
script_params = {
    '--data_path': ds.as_mount(),
    '--workers': 8,
    '--epochs' : 8,
    '--box_nms_thresh' : 0.3,
    '--box_score_thresh' : 0.10
}

# Same execution setup as the single training run: train.py from the staged
# script folder, on the GPU cluster, inside the custom Docker image.
estimator = PyTorch(source_directory=script_folder,
                    script_params=script_params,
                    compute_target=compute_target,
                    entry_script='train.py',
                    use_docker=True,
                    custom_docker_image=image_name,
                    image_registry_details=image_registry_details,
                    user_managed=True,
                    use_gpu=True)

# Add /cocoapi/PythonAPI/ to PYTHONPATH for the child runs, as for the single run.
estimator.run_config.environment.environment_variables["PYTHONPATH"] = "$PYTHONPATH:/cocoapi/PythonAPI/"

framework_version is not specified, defaulting to version 1.1.


In [26]:
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, uniform, choice, HyperDriveConfig, PrimaryMetricGoal

In [27]:
# Candidate anchor configurations; each option is a comma-separated string.
anchor_size_options = [
    '16',
    '16,32',
    '16,32,64',
    '16,32,64,128',
    '16,32,64,128,256',
    '16,32,64,128,256,512',
]
aspect_ratio_options = [
    '0.25',
    '0.25,0.5',
    '0.25,0.5,1.0',
    '0.25,0.5,1.0,2.0',
]

# Search space for random sampling: two continuous ranges and two discrete
# choices. These are the four settings that the single training run passed
# explicitly as --learning_rate, --rpn_nms_thresh, --anchor_sizes and
# --anchor_aspect_ratios.
search_space = {
    'learning_rate': uniform(0.0005, 0.005),
    'rpn_nms_thresh': uniform(0.3, 0.7),
    'anchor_sizes': choice(*anchor_size_options),
    'anchor_aspect_ratios': choice(*aspect_ratio_options),
}

param_sampling = RandomParameterSampling(search_space)

In [28]:
# Upper bound on the number of child runs, passed to HyperDriveConfig below
# as `max_total_runs`.
max_total_runs = 4

In [29]:
# Early-termination policy for the child runs.
# NOTE(review): per the HyperDrive docs, a Bandit policy stops runs whose
# primary metric falls outside the slack factor relative to the best run;
# confirm the exact meaning of evaluation_interval/delay_evaluation there.
early_termination_policy = BanditPolicy(slack_factor=0.15, evaluation_interval=2, delay_evaluation=2)

# Tuning configuration: sample from `param_sampling`, launch child runs from
# `estimator`, and maximize the metric named 'mAP@IoU=0.50'.
# NOTE(review): the metric name presumably has to match what train.py logs;
# verify against the training script.
# max_concurrent_runs=4 equals the max_nodes=4 used when the cluster is
# provisioned in this notebook.
hyperdrive_config = HyperDriveConfig(estimator=estimator,
                                     hyperparameter_sampling=param_sampling, 
                                     policy=early_termination_policy,
                                     primary_metric_name='mAP@IoU=0.50',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=max_total_runs,
                                     max_concurrent_runs=4)

In [30]:
# Submit the HyperDrive configuration to the same experiment as the single run.
hyperdrive_run = experiment.submit(hyperdrive_config)

In [31]:
# Show the Azure ML notebook widget for the HyperDrive run.
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [32]:
# Wait for the HyperDrive run to complete, with show_output=True so its log
# is streamed into the notebook while waiting.
hyperdrive_run.wait_for_completion(show_output=True)

RunId: torchvision_1561667693326
Web View: https://mlworkspace.azure.ai/portal/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/fboyluamlrg/providers/Microsoft.MachineLearningServices/workspaces/fboyluamlws/experiments/torchvision/runs/torchvision_1561667693326

Streaming azureml-logs/hyperdrive.txt

"<START>[2019-06-27T20:34:53.810711][API][INFO]Experiment created<END>\n""<START>[2019-06-27T20:34:54.723016][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2019-06-27T20:34:54.920213][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2019-06-27T20:34:55.4655731Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>"<START>[2019-06-27T20:35:24.744216][GENERATOR][INFO]Max number of jobs '4' reached for experiment.<END>\n""<START>[2019-06-27T20:35:24.971196][GENERATOR][INFO]All jobs generated.<END>\n"<START>[20

{'runId': 'torchvision_1561667693326',
 'target': 'fboyluamlgpuclus',
 'status': 'Completed',
 'startTimeUtc': '2019-06-27T20:34:53.644428Z',
 'endTimeUtc': '2019-06-27T22:41:44.251618Z',
 'properties': {'primary_metric_config': '{"name": "mAP@IoU=0.50", "goal": "maximize"}',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'baggage': 'eyJvaWQiOiAiMDE5MmNhNDMtZTE3OS00NWE2LTg5ZjQtZjQ4YWViZGNjNTIyIiwgInRpZCI6ICI3MmY5ODhiZi04NmYxLTQxYWYtOTFhYi0yZDdjZDAxMWRiNDciLCAidW5hbWUiOiAiMDRiMDc3OTUtOGRkYi00NjFhLWJiZWUtMDJmOWUxYmY3YjQ2In0',
  'ContentSnapshotId': '57867cea-1720-41cc-84d1-c22d2d42cf9f'},
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://fboyluamstorage3fdfe5af9.blob.core.windows.net/azureml/ExperimentRun/dcid.torchvision_1561667693326/azureml-logs/hyperdrive.txt?sv=2018-03-28&sr=b&sig=lUZ00Ds1XwBaZNWp65gKZkpScvtwkNFXFZ59pXKmCkA%3D&st=2019-06-27T22%3A31%3A45Z&se=2019-06-28T06%3A41%3A45Z&sp=r'}}