In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one; later cells end with several bare expressions and rely on this.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Autoreload, so that edits to imported modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
from shutil import copy

import azureml.core
from azureml.core import (Workspace, Experiment, Datastore, Dataset, 
                          ContainerRegistry, ScriptRunConfig, RunConfiguration, 
                          Run)
from azureml.data.datapath import DataPath
from azureml.data.data_reference import DataReference
from azureml.core.runconfig import DataReferenceConfiguration
from azureml.tensorboard import Tensorboard

In [3]:
# Record the installed AML SDK version alongside the results of this notebook.
aml_version = azureml.core.__version__
print('Version of AML: {}'.format(aml_version))

Version of AML: 1.0.83


# MegaDetector v4 training

Reference for the AmlCompute training target used below: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-set-up-training-targets#amlcompute
    

### Provide credentials

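Set the environment variables `STORAGE_ACCOUNT_NAME` and `STORAGE_ACCOUNT_KEY` to the name and key of the storage account, and `REGISTRY_PASSWORD` to the password of the container registry that hosts the base image. The next cell reads them.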

In [5]:
# Secrets are read from environment variables so they never appear in the
# notebook. Each value is None when its variable is not set.
storage_account_name, storage_account_key, registry_pw = (
    os.environ.get(variable_name)
    for variable_name in ('STORAGE_ACCOUNT_NAME',
                          'STORAGE_ACCOUNT_KEY',
                          'REGISTRY_PASSWORD')
)

### Connect to the AML workspace

In [6]:
# Load the workspace from the local AML configuration. As the recorded output
# shows, this can prompt for interactive authentication.
ws = Workspace.from_config()

# Print the identifying fields of the workspace, tab-separated, each one once.
print(ws.name, ws.location, ws.resource_group, sep='\t')



Performing interactive authentication. Please follow the instructions on the terminal.




Interactive authentication successfully completed.
siyu_sc	southcentralus	yasiyu_rg	southcentralus


In [7]:
# Look up the compute target named 'gpu-nc6-v3' among the targets already
# attached to the workspace; raises KeyError if no target has that name.
compute_target = ws.compute_targets['gpu-nc6-v3']

In [8]:
# Sanity check: show which kind of compute target was returned.
type(compute_target)

azureml.core.compute.amlcompute.AmlCompute

### Connect to datastore

In [26]:
input_datastore_name = 'tfrecords_mdv4_1'
input_container_name = 'megadetectorv4-1'

# this is actually a miscellaneous datastore, not used for output currently
output_datastore_name = 'artifacts'
output_container_name = 'megadetectorv4-artifacts'


def get_or_register_blob_datastore(registered_datastores, datastore_name, container_name, role):
    """Return the named datastore, registering it as a blob datastore if missing.

    Args:
        registered_datastores: mapping of datastore name to datastore object
            for the datastores already present in the workspace.
        datastore_name: name to look up, and to register under when absent.
        container_name: blob container backing a newly registered datastore.
        role: label used in the progress message, e.g. 'Input' or 'Output'.

    Returns:
        The datastore found in `registered_datastores`, or the newly
        registered one.
    """
    datastore = registered_datastores.get(datastore_name)
    if datastore is None:
        # print() rather than a bare string expression, so the message is shown
        # regardless of the InteractiveShell.ast_node_interactivity setting.
        print('{} datastore {} is not in the workspace; registering it...'.format(
            role, datastore_name))
        datastore = Datastore.register_azure_blob_container(
            workspace=ws,
            datastore_name=datastore_name,
            container_name=container_name,
            account_name=storage_account_name,
            account_key=storage_account_key,
            create_if_not_exists=True)
    return datastore


# Read the workspace's datastores once and reuse the mapping for both lookups.
registered_datastores = ws.datastores

input_datastore = get_or_register_blob_datastore(
    registered_datastores, input_datastore_name, input_container_name, 'Input')
output_datastore = get_or_register_blob_datastore(
    registered_datastores, output_datastore_name, output_container_name, 'Output')

print(input_datastore)
print(output_datastore)

'Input datastore tfrecords_mdv4_1 is not in the workspace; registering it...'

<azureml.data.azure_storage_datastore.AzureBlobDatastore object at 0x11e442438>
<azureml.data.azure_storage_datastore.AzureBlobDatastore object at 0x11e62c9e8>


In [27]:
# Confirm which blob container the input datastore points at.
input_datastore.container_name

'megadetectorv4-1'

In [28]:
# Build one mount-mode reference per datastore; each reference is named after
# the datastore it points to.
input_data_ref, output_data_ref = (
    DataReference(datastore=store, data_reference_name=reference_name, mode='mount')
    for store, reference_name in ((input_datastore, input_datastore_name),
                                  (output_datastore, output_datastore_name))
)

# Display both references.
input_data_ref
output_data_ref

$AZUREML_DATAREFERENCE_tfrecords_mdv4_1

$AZUREML_DATAREFERENCE_artifacts

In [29]:
# The string form of a data reference is the placeholder shown in the output below.
str(output_data_ref)

'$AZUREML_DATAREFERENCE_artifacts'

In [None]:
# Support for Datasets in custom Docker images is still to come
# input_dataset = Dataset.File.from_files(path=DataPath(datastore=input_datastore))

In [None]:
# input_dataset.as_named_input('tfrecords').as_mount('/tmp')

### Environment setup

In [12]:
# Private registry that hosts the base Docker image for training.
registry_address = 'yasiyu.azurecr.io'
registry_username = 'yasiyu'

container_registry = ContainerRegistry()
container_registry.address = registry_address
container_registry.username = registry_username
# The password comes from the environment (see the credentials cell).
container_registry.password = registry_pw

In [13]:
run_config = RunConfiguration()

# Run inside a Docker container built from our own image in the private registry.
# GPU support: Azure automatically detects and uses the NVIDIA Docker extension when it is available.
docker_settings = run_config.environment.docker
docker_settings.enabled = True
docker_settings.base_image = 'tfodapi112:190905'
docker_settings.base_image_registry = container_registry

# Use the packages already installed in the image instead of an AML-created Conda env.
run_config.environment.python.user_managed_dependencies = True

# Specify the compute target. NOTE(review): the original note here mentions an
# obscure "`docker image` cannot run" error - presumably what appears when the
# target is not set; confirm.
run_config.target = compute_target

### Experiment

Modify the parameters in the pipeline.config and add notes to tags in this section.

In [14]:
# The relative paths used in the cells below are resolved against this directory.
os.getcwd()

'/Users/siyuyang/Source/Repos/GitHub_MSFT/CameraTraps/detection/detector_training'

In [59]:
# Name of the AML experiment the run is filed under.
experiment_name = 'mdv4_1'

# Folder passed to ScriptRunConfig as the source directory.
exp_folder = 'experiments/megadetector_v4/200203'

# File name of the pipeline config handed to the training script.
config_file_name = 'pipeline_baseline.config'

# Longer tag values are named here so the tag dictionary below stays readable.
pretrained_checkpoint = 'artifacts/pretrained/faster_rcnn_inception_resnet_v2_atrous_coco_2018_01_28/model.ckpt'

learning_rate_schedule = """cosine_decay_learning_rate {
          learning_rate_base: 0.0003,
          total_steps: 104012,
          warmup_learning_rate: 0.00003,
          warmup_steps: 2000,
          hold_base_rate_steps: 0
        }"""

# Free-form notes attached to the run when it is submitted.
tags = dict(
    model='faster_rcnn_inception_resnet_v2_atrous_coco',
    starting_from=pretrained_checkpoint,
    learning_rate=learning_rate_schedule,
    augmentations="""baseline + horizontal flip""",
    input_set='mdv4box01',
    train_on='train',
    val_on='val',
)

In [60]:
# Experiment handle in the workspace; the run is submitted through it below.
exp = Experiment(workspace=ws, name=experiment_name)

Copy the entry script of TFODAPI to the `source_directory`, which also contains the `pipeline.config`

In [62]:
# Make sure the experiment folder exists as a directory first: shutil.copy()
# treats a non-existent destination as a file name, so without this the entry
# script would be written to a file called `exp_folder` instead of into it.
os.makedirs(exp_folder, exist_ok=True)

# Stage the TFODAPI entry script and the pipeline config in the source directory.
copy('model_main.py', exp_folder)
copy(os.path.join('experiments/megadetector_v4/', config_file_name), exp_folder)

'experiments/megadetector_v4/200203/model_main.py'

'experiments/megadetector_v4/200203/pipeline_baseline.config'

### Run configuration

In [63]:
# Mount both datastores for the run; each reference is keyed by its datastore name.
run_config.data_references = {
    datastore_name: DataReferenceConfiguration(datastore_name=datastore_name, mode='mount')
    for datastore_name in (input_datastore_name, output_datastore_name)
}

# Command-line arguments handed to model_main.py.
script_arguments = [
    '--model_dir', './outputs',
    '--pipeline_config_path', config_file_name,
    # we are sampling more val set images per eval run, but eval less often
    '--sample_1_of_n_eval_examples', 2,
]

config = ScriptRunConfig(
    source_directory=exp_folder,
    script='model_main.py',
    arguments=script_arguments,
    run_config=run_config,
)

In [64]:
# Submit the configured script run to the experiment, attaching the tags defined above.
run = exp.submit(config, tags=tags)

In [65]:
# Note the run ID so the run can be retrieved in a later session. Run.id gives
# it directly, without fetching the full details dictionary.
run.id

run.get_status()

'mdv4_1_1585270945_e83cf9b2'

'Starting'

### Retrieve the run later

In [12]:
# Re-attach to a previously submitted run by its run ID.
# NOTE(review): this ID starts with 'mdv4_trial', while `experiment_name` above
# is 'mdv4_1' - presumably `exp` pointed at a different experiment when this
# cell was executed; confirm `exp` and the run ID match before re-running.
run = Run(exp, 'mdv4_trial_1580850141_379ed8f0')

In [13]:
# Check the current status of the retrieved run.
run.get_status()

'Running'

AML's TensorBoard integration requires the events file to be in .log, so the cells below are not working currently.

In [14]:
# The Tensorboard constructor takes a list of runs; here only the single
# run retrieved above is passed.
tb = Tensorboard([run])

In [16]:
# Override the artifact prefix on this Tensorboard instance.
# NOTE(review): presumably this tells Tensorboard where the run's event files
# live - confirm against the AML Tensorboard documentation.
tb.LOGS_ARTIFACT_PREFIX = 'events/'

In [17]:
# Start the TensorBoard instance for the run; pair with tb.stop() when done.
tb.start()

In [18]:
# When done, call the stop() method of the Tensorboard object, or it will stay
# running even after your job completes.
tb.stop()