# Train and tune hyperparameters with PyTorch

## Upload dataset to default datastore


In [1]:
import azureml
from azureml.core import Workspace

# Show which core SDK release this notebook is running against.
sdk_version = azureml.core.VERSION
print("Azure ML SDK Version: ", sdk_version)

Azure ML SDK Version:  1.0.41


In [1]:
# Load the workspace from the saved workspace configuration.
ws = Workspace.from_config()

# One "label: value" line per workspace property.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

In [3]:
from azureml.core import Experiment

# All runs submitted from this notebook are grouped under this experiment name.
experiment_name = 'torchvision'
experiment = Experiment(workspace=ws, name=experiment_name)

In [4]:
import os
import shutil

# Folder later passed to the estimator as its source_directory.
script_folder = './torchdetect'
os.makedirs(script_folder, exist_ok=True)

# Helper modules plus the train.py entry point, each listed exactly once
# so no file is copied twice.
training_scripts = (
    'coco_eval.py',
    'coco_utils.py',
    'engine.py',
    'transforms.py',
    'utils.py',
    'train.py',
)

# Copy from the notebook's working directory; a missing file raises
# FileNotFoundError, which surfaces a broken checkout before any job is submitted.
for script_name in training_scripts:
    shutil.copy(script_name, script_folder)

In [4]:
# Default datastore of the workspace; later cells upload the dataset to it
# and mount it as the training job's --data_path.
ds = ws.get_default_datastore()

In [5]:
# Display the name of the blob container behind the default datastore.
ds.container_name

'azureml-blobstore-a29dc687-5001-4ee4-ac74-7c17b122f449'

In [2]:
# Local directory that contains the dataset folders. Change this single value
# to upload from a different machine or location.
local_data_root = '/datadrive/torchvisionOD/BuildData'

# Each folder is uploaded to a same-named path on the default datastore,
# replacing any files already there.
for folder_name in ('JPEGImages', 'Annotations'):
    ds.upload(src_dir=local_data_root + '/' + folder_name,
              target_path=folder_name,
              overwrite=True,
              show_progress=True)

## Create AmlCompute

In [6]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the GPU cluster to reuse or, if it does not exist yet, to provision.
cluster_name = "fboyluamlgpuclus"

try:
    # The lookup raises ComputeTargetException when no such target exists.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=4,
    )

    # Provision the cluster and wait for it, echoing progress to the notebook.
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target.')

# get_status() gives a detailed status for the current cluster; print it serialized.
cluster_status = compute_target.get_status()
print(cluster_status.serialize())

Found existing compute target.
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-06-12T18:54:09.191000+00:00', 'errors': None, 'creationTime': '2019-06-11T18:48:22.478352+00:00', 'modifiedTime': '2019-06-11T18:49:10.788499+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}


## Create a PyTorch Estimator

In [7]:
# Directory with the staged training scripts (same value as assigned when the
# scripts were copied); passed to the estimator as source_directory.
script_folder = './torchdetect'
# Custom Docker image the estimator runs the training script in.
image_name = 'fboylu/torchdet'

In [8]:
from azureml.train.dnn import PyTorch

# Arguments forwarded to train.py. The mounted datastore supplies --data_path;
# the remaining entries are the loader and detector settings for this baseline run.
script_params = {'--data_path': ds.as_mount(), '--workers': 8}
script_params.update({
    '--learning_rate': 0.005,
    '--epochs': 5,
    '--anchor_sizes': '16,32,64,128,256,512',
    '--anchor_aspect_ratios': '0.25,0.5,1.0,2.0',
    '--rpn_nms_thresh': 0.5,
    '--box_nms_thresh': 0.3,
    '--box_score_thresh': 0.10,
})

# Run train.py from script_folder on the cluster, inside the custom Docker image.
# NOTE(review): user_managed=True presumably means the image already provides
# the Python environment — confirm against the image contents.
estimator = PyTorch(
    source_directory=script_folder,
    entry_script='train.py',
    script_params=script_params,
    compute_target=compute_target,
    use_gpu=True,
    use_docker=True,
    custom_docker_image=image_name,
    user_managed=True,
)

framework_version is not specified, defaulting to version 1.1.


In [9]:
# Disabled alternatives kept for reference: pinning the interpreter path and
# turning off project snapshots.
# estimator.run_config.environment.python.interpreter_path = '/data/anaconda/envs/torchdetectaml/bin/python'
# estimator.run_config.history.snapshot_project = False
# Append the COCO API directory to PYTHONPATH for the job.
# NOTE(review): presumably this is where the Docker image keeps pycocotools, and
# this relies on "$PYTHONPATH" being expanded at run time — confirm both.
estimator.run_config.environment.environment_variables["PYTHONPATH"] = "$PYTHONPATH:/cocoapi/PythonAPI/"

### Submit job

In [10]:
# Submit the estimator to the experiment and print the resulting run handle
# (experiment, id, type, status).
run = experiment.submit(estimator)
print(run)

Run(Experiment: torchvision,
Id: torchvision_1560369221_b26532e3,
Type: azureml.scriptrun,
Status: Queued)


In [11]:
from azureml.widgets import RunDetails

In [None]:
# Show the RunDetails notebook widget for the submitted training run.
RunDetails(run).show()

In [13]:
# Print the run's full details record (run id, target, status, run definition, ...).
print(run.get_details())

{'runId': 'torchvision_1560369221_b26532e3', 'target': 'fboyluamlgpuclus', 'status': 'Queued', 'properties': {'azureml.runsource': 'experiment', 'ContentSnapshotId': 'bae6e356-2d31-4b30-bc32-14a0016258b3'}, 'runDefinition': {'script': 'train_str.py', 'arguments': ['--data_path', '$AZUREML_DATAREFERENCE_workspaceblobstore', '--workers', '8', '--learning_rate', '0.005', '--epochs', '5', '--anchor_sizes', '16,32,64,128,256,512', '--anchor_aspect_ratios', '0.25,0.5,1.0,2.0', '--rpn_nms_thresh', '0.5', '--box_nms_thresh', '0.3', '--box_score_thresh', '0.1'], 'sourceDirectoryDataStore': None, 'framework': 'Python', 'communicator': 'None', 'target': 'fboyluamlgpuclus', 'dataReferences': {'workspaceblobstore': {'dataStoreName': 'workspaceblobstore', 'mode': 'Mount', 'pathOnDataStore': None, 'pathOnCompute': None, 'overwrite': False}}, 'jobName': None, 'maxRunDurationSeconds': None, 'nodeCount': 1, 'environment': {'name': 'Experiment torchvision Environment', 'version': 'Autosave_2019-06-11T18:

In [1]:
# Wait for the training run to complete, with output shown in the notebook.
run.wait_for_completion(show_output=True)

## Tune Model Hyperparameters

In [22]:
from azureml.train.dnn import PyTorch

# Arguments that stay fixed across the tuning runs. Learning rate, RPN NMS
# threshold and the anchor settings are left out here because the sampling
# configuration below supplies them.
script_params = {
    '--data_path': ds.as_mount(),
    '--workers': 8,
    '--epochs': 10,
    '--box_nms_thresh': 0.3,
    '--box_score_thresh': 0.10,
}

# Same job setup as the single training run: train.py from script_folder,
# executed on the cluster inside the custom Docker image.
estimator = PyTorch(
    source_directory=script_folder,
    entry_script='train.py',
    script_params=script_params,
    compute_target=compute_target,
    use_gpu=True,
    use_docker=True,
    custom_docker_image=image_name,
    user_managed=True,
)

# Append the COCO API directory to PYTHONPATH for the job.
job_env_vars = estimator.run_config.environment.environment_variables
job_env_vars["PYTHONPATH"] = "$PYTHONPATH:/cocoapi/PythonAPI/"

framework_version is not specified, defaulting to version 1.1.


In [23]:
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, uniform, choice, HyperDriveConfig, PrimaryMetricGoal

In [24]:
# Anchor sizes and aspect ratios are searched as cumulative prefixes of these
# lists: '16', '16,32', ..., up to the full comma-separated list.
anchor_size_levels = ['16', '32', '64', '128', '256', '512']
aspect_ratio_levels = ['0.25', '0.5', '1.0', '2.0']

anchor_size_options = [
    ','.join(anchor_size_levels[:count])
    for count in range(1, len(anchor_size_levels) + 1)
]
aspect_ratio_options = [
    ','.join(aspect_ratio_levels[:count])
    for count in range(1, len(aspect_ratio_levels) + 1)
]

# Random search: continuous ranges for the two thresholds/rates, discrete
# choices for the anchor configuration strings.
search_space = {
    'learning_rate': uniform(0.0005, 0.005),
    'rpn_nms_thresh': uniform(0.3, 0.7),
    'anchor_sizes': choice(*anchor_size_options),
    'anchor_aspect_ratios': choice(*aspect_ratio_options),
}
param_sampling = RandomParameterSampling(search_space)

In [25]:
# Bandit early-termination policy; see the BanditPolicy documentation for the
# exact meaning of slack_factor, evaluation_interval and delay_evaluation.
early_termination_policy = BanditPolicy(
    slack_factor=0.15,
    evaluation_interval=2,
    delay_evaluation=2,
)

# Tuning job: up to 16 sampled runs, at most 4 at a time, maximizing the
# primary metric.
# NOTE(review): primary_metric_name presumably has to match the metric name
# that train.py logs — train.py is not visible here, so verify.
hyperdrive_settings = dict(
    estimator=estimator,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='mAP@IoU=0.50',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=16,
    max_concurrent_runs=4,
)
hyperdrive_config = HyperDriveConfig(**hyperdrive_settings)

In [26]:
# Submit the HyperDrive configuration to the same experiment as the single training run.
hyperdrive_run = experiment.submit(hyperdrive_config)

In [29]:
# Show the RunDetails notebook widget for the hyperparameter tuning run.
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [None]:
# Wait for the tuning run to complete, with its log output shown in the notebook.
hyperdrive_run.wait_for_completion(show_output=True)

RunId: torchvision_1560447191261
Web View: https://mlworkspace.azure.ai/portal/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/fboyluamlrg/providers/Microsoft.MachineLearningServices/workspaces/fboyluamlws/experiments/torchvision/runs/torchvision_1560447191261

Streaming azureml-logs/hyperdrive.txt

"<START>[2019-06-13T17:33:11.698316][API][INFO]Experiment created<END>\n""<START>[2019-06-13T17:33:12.307271][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2019-06-13T17:33:12.471081][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2019-06-13T17:33:13.7958414Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>
