In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
# (the default setting is 'last_expr').
InteractiveShell.ast_node_interactivity = 'all'

# Load the autoreload extension; mode 2 reloads all modules before executing code,
# so edits to imported .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
from shutil import copy

import azureml.core
from azureml.core import (Workspace, Experiment, Datastore, Dataset, 
                          ContainerRegistry, ScriptRunConfig, RunConfiguration, 
                          Run)
from azureml.train.dnn import PyTorch
from azureml.data.data_reference import DataReference
from azureml.core.runconfig import DataReferenceConfiguration
from azureml.tensorboard import Tensorboard

In [None]:
# Record which AML SDK version this notebook was run with.
print(f'Version of AML: {azureml.core.__version__}')

# Run training on AML
We used this notebook to run experiments on [Azure Machine Learning](https://azure.microsoft.com/en-us/services/machine-learning-services/) earlier in the project, but have since switched to running experiments on [Data Science Virtual Machines](https://azure.microsoft.com/en-us/services/virtual-machines/data-science-virtual-machines/). This notebook is no longer maintained but is kept here as an example of using the AML Python SDK. 

### Provide credentials

Provide the account name and the key to the storage account through the `STORAGE_ACCOUNT_NAME` and `STORAGE_ACCOUNT_KEY` environment variables.

In [None]:
# Read the storage credentials from the environment so they are never hard-coded
# in the notebook; each value is None when its variable is not set.
storage_account_name = os.getenv('STORAGE_ACCOUNT_NAME')
storage_account_key = os.getenv('STORAGE_ACCOUNT_KEY')

### Connect to the AML workspace

In [None]:
# Connect to an existing AML workspace; replace the three placeholders with your own values.
ws = Workspace.get(name='<workspace_name>',
                   subscription_id='<subscription_id>',
                   resource_group='<resource_group_name>')

# Sanity check that we connected to the intended workspace
# (the location used to be printed twice; each field is now shown once).
print(ws.name, ws.location, ws.resource_group, sep='\t')

In [None]:
# Select the compute target registered in the workspace under the name 'gpu-nc6';
# the lookup fails if the workspace has no compute target with that name.
compute_target = ws.compute_targets['gpu-nc6']

### Connect to datastore

In [None]:
datastore_name = 'wcsorinoquia'
container_name = 'wcs-orinoquia'

# ws.datastores maps datastore names to Datastore objects, so a direct lookup
# replaces scanning every entry; the result is None if it is not registered yet.
datastore = ws.datastores.get(datastore_name)

if datastore is None:
    # Registering the blob container needs the storage credentials read from the
    # environment earlier; fail early with a clear message if they are missing
    # instead of handing None to the SDK.
    if not storage_account_name or not storage_account_key:
        raise ValueError(
            'Datastore "{}" is not registered in this workspace and the '
            'STORAGE_ACCOUNT_NAME / STORAGE_ACCOUNT_KEY environment variables '
            'are not set.'.format(datastore_name))

    datastore = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name=datastore_name,
        container_name=container_name,
        account_name=storage_account_name,
        account_key=storage_account_key,
        create_if_not_exists=True)

In [None]:
# Describe how the run should access the datastore: the whole datastore is referenced
# under the datastore's own name and made available in 'mount' mode.
# This reference is passed to the estimator through its `inputs` argument below.
data_ref = DataReference(datastore=datastore,
                         data_reference_name=datastore_name,
                         mode='mount')

In [None]:
# Display the string form of the data reference for inspection.
str(data_ref)

### Create an AML experiment and run configuration

In [None]:
# Local folder that holds the training script and everything uploaded with it.
exp_folder = './scripts_and_config'

# Name under which the runs are grouped in the AML workspace.
experiment_name = 'wcs_baseline_20200506'

# Free-form string metadata describing this run's setup; attached to the run at submission.
tags = {
    'model': 'unet, feature scale 2',
    'starting_from': 'None',
    'init_learning_rate': f'{1e-4}',
    'loss_weights': 'all the same, set to 1',
    'batch_size': '32',
    'imagery': 'full_sr_median_2013_2014',
    'bands': '2, 3, 6, 7, NDVI',
}

In [None]:
# Show the current working directory: the relative paths used in this notebook
# (e.g. '../viz_utils.py' and exp_folder) are resolved against it.
os.getcwd()

In [None]:
# Stage the helper module and the constants files inside exp_folder so they sit
# next to the training code there. Each entry is (source file, destination directory).
files_to_stage = [
    ('../viz_utils.py', exp_folder),
    ('../constants/landsat_bands_info.py',
     os.path.join(exp_folder, 'constants')),
    ('../constants/class_lists/lulc_wcs_label_maps.json',
     os.path.join(exp_folder, 'constants', 'class_lists')),
    ('../constants/splits/full_sr_median_2013_2014_splits.json',
     os.path.join(exp_folder, 'constants', 'splits')),
]

for src_path, dest_dir in files_to_stage:
    # shutil.copy treats a non-existent destination as a *file name*, so make sure
    # the destination directory exists before copying into it.
    os.makedirs(dest_dir, exist_ok=True)
    copy(src_path, dest_dir)

In [None]:
# Handle to the experiment named `experiment_name` in the workspace;
# the run submitted below is recorded under it.
exp = Experiment(workspace=ws, name=experiment_name)

In [None]:
# Inspect which PyTorch framework versions the estimator supports and which one is the default.
# Both values are displayed because ast_node_interactivity is set to 'all' at the top of the notebook.
PyTorch.get_supported_versions()
PyTorch.DEFAULT_VERSION

In [None]:
# Command-line arguments forwarded to the entry script.
script_params = {'--config_module_path': 'experiments.baseline.baseline_config'}

# Dependencies are installed with pip. Using conda instead did not work: rasterio.windows.Window
# could not be imported when the packages were installed via conda_packages. Both of these were tried:
#   conda_dependencies_file_path='training_environment.yml'
#   conda_packages=['numpy', 'pandas', 'matplotlib',
#                   'geopandas', 'rasterio', 'scikit-image',
#                   'tensorflow==1.14.0', 'pillow==6.1']
pip_requirements = [
    'pillow==6.1', 'tensorflow==1.14.0',
    'numpy', 'pandas', 'matplotlib',
    'geopandas', 'rasterio', 'scikit-image',
]

pt_est = PyTorch(
    # what to run
    source_directory=exp_folder,
    entry_script='train.py',  # relative to source_directory
    script_params=script_params,
    inputs=[data_ref],

    # where to run it
    compute_target=compute_target,
    node_count=1,
    use_gpu=True,

    # environment
    # framework_version='1.3.1',  # this version gets used, but can't specify it
    pip_packages=pip_requirements,
)

In [None]:
# Submit the estimator to the experiment, attaching the descriptive tags defined above;
# the returned object is the handle used to query the run afterwards.
run = exp.submit(pt_est, tags=tags)

In [None]:
# Show the run ID (needed to recover the Run object in a later session) and the current status.
run.get_details()['runId']
run.get_status()

### To archive an Experiment

In [None]:
# Use a dedicated variable so that `exp`, the handle of the experiment the run was submitted to,
# is not overwritten; later cells still rely on `exp` to recover the Run object.
exp_to_archive = Experiment(workspace=ws, name='name_of_exp_to_archive')
exp_to_archive.archive()

### Start TensorBoard

https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-monitor-tensorboard

We wrote logs to ./logs, which AML uploads to Artifact Service and makes available to a TensorBoard instance.

In [None]:
# Recover the Run object if needed, e.g. in a new session where `run` from the submission cell is gone.
# Replace the placeholder with the actual run ID, taken from the output of the cell that shows
# the run ID above or from the Azure Portal.
run_id = 'wcs_baseline_<run_id>'
run = Run(exp, run_id)

In [None]:
# The Tensorboard constructor takes a list of runs whose logs should be shown
tb = Tensorboard([run])

# If successful, start() returns a string with the URI of the TensorBoard instance.
tb.start()

In [None]:
# When done, call the stop() method of the Tensorboard object;
# otherwise the instance stays running even after your job completes.
tb.stop()

The TensorBoard stops loading after a little while and needs to be restarted...