In [1]:
from pathlib import Path

from azureml.core import Experiment
from azureml.core import Workspace, Run
from azureml.core import Environment
from azureml.core import Dataset
from azureml.core import ScriptRunConfig

from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

In [2]:
# Load the Azure ML workspace handle via Workspace.from_config().
ws = Workspace.from_config()

# Fetch the registered 'clean' dataset, pinned to version 1
# (author's note: v1 is just chems).
clean_dataset = Dataset.get_by_name(
    workspace=ws,
    name='clean',
    version=1,
)

In [3]:
cluster_name = "embeddings-cluster-gpu-large"
vm_size = "Standard_NC24"  # original note listed the spelling 'STANDARD_NC24' as well

# Look the cluster up by name in the workspace; if the lookup raises
# ComputeTargetException, provision a new one instead.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target')
    # Single-node, low-priority cluster of the VM size chosen above.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        vm_priority='lowpriority',
        max_nodes=1,
    )

    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    # Wait for provisioning to finish, with a 20 minute timeout.
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )
else:
    print('Found existing compute target')

Found existing compute target


In [4]:
from azureml.core.environment import Environment
from azureml.core.runconfig import DockerConfiguration

# Inputs for the training environment: the conda spec file (path relative to
# the notebook's working directory) and the Docker base image to run on.
conda_spec_file = 'tf_sklearn_env.yml'
base_image = 'mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu20.04'

# Create the environment from the conda specification, then override its base image.
tf_sklearn_env = Environment.from_conda_specification(name='tf_sklearn', file_path=conda_spec_file)
tf_sklearn_env.docker.base_image = base_image

# Docker runtime settings handed to the ScriptRunConfig; Docker is enabled.
docker_config = DockerConfiguration(use_docker=True)

In [5]:
# Command-line options for train.py as (flag, value) pairs, in the order they
# are passed to the script.
cli_options = [
    ('--data-dir', clean_dataset.as_mount()),  # dataset is mounted on the compute
    ('--data-file', "clean.parquet"),
    ('--dense-blocks-no', 1),  # more than 4 blows GPU memory
    # Remember to scale the batch size by the number of GPUs, e.g. batch size 24
    # with 2 GPUs -> 24*2 = 48; use slightly increased learning rates for larger batches.
    ('--batch-size', 5000),
    ('--epochs', 7),
    ('--learning-rate', 0.2),
    ('--weight-decay', 1e-3),
]

# Flatten the pairs into the flat [flag, value, flag, value, ...] list.
args = [token for option in cli_options for token in option]

# Run configuration: execute train.py from the current directory on the
# compute target, inside the Docker-enabled training environment.
src = ScriptRunConfig(
    source_directory='.',
    script='train.py',
    arguments=args,
    compute_target=compute_target,
    environment=tf_sklearn_env,
    docker_runtime_config=docker_config,
)

In [6]:
# Submit the run configuration under the 'train_embeddings' experiment.
experiment = Experiment(workspace=ws, name='train_embeddings')
run = experiment.submit(src)

# Stream the run's output and wait for it to finish; left as the last
# expression so the notebook displays the returned run details.
run.wait_for_completion(show_output=True)

RunId: train_embeddings_1652800942_a4cd38fe
Web View: https://ml.azure.com/runs/train_embeddings_1652800942_a4cd38fe?wsid=/subscriptions/01491751-2545-4f64-afa7-54ffc907183d/resourcegroups/bnlwe-es01-d-57280-devl-rg/workspaces/bnlwees01d57280-mlops-aml&tid=f66fae02-5d36-495b-bfe0-78a6ff9f8e6e

Streaming azureml-logs/55_azureml-execution-tvmps_3d6d95916fede4752196af875aaf46efad42a2d382e1afb64ff404060180197e_p.txt

2022-05-17T15:23:43Z Successfully mounted a/an Blobfuse File System at /mnt/resource/batch/tasks/shared/LS_root/jobs/bnlwees01d57280-mlops-aml/azureml/train_embeddings_1652800942_a4cd38fe/mounts/workspaceblobstore -- stdout/stderr: 
2022-05-17T15:23:44Z Failed to start nvidia-fabricmanager due to exit status 5 with output Failed to start nvidia-fabricmanager.service: Unit nvidia-fabricmanager.service not found.
. Please ignore this if the GPUs don't utilize NVIDIA® NVLink® switches.
2022-05-17T15:23:44Z Starting output-watcher...
2022-05-17T15:23:44Z IsDedicatedCompute == Fals

{'runId': 'train_embeddings_1652800942_a4cd38fe',
 'target': 'embeddings-cluster-gpu-large',
 'status': 'Completed',
 'startTimeUtc': '2022-05-17T15:23:41.715131Z',
 'endTimeUtc': '2022-05-17T16:57:26.601809Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'amlctrain',
  'ContentSnapshotId': '6d0ca53f-f34d-40d6-9580-f604c169d1e3',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': 'f75d4550-45e8-44ad-99dc-e5cd318f098b'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'input__f75d4550', 'mechanism': 'Mount'}}],
 'outputDatasets': [],
 'runDefinition': {'script': 'train.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--data-dir',
   'DatasetConsumptionConfig:input__f75d4550',
   '--data-file',
   'clean.parquet',
   '--dense-blocks-no',
   '1',
   '--batch-size',
   '5000',
   '--epochs',
   '7',
   '--learning-rate',
   '0.2',
   '--weight-de

In [8]:
# Local destinations (relative to the notebook's working directory) for the
# artifacts produced by the run.
model_path = Path('../saved_model')
metrics_path = Path('../metrics')

# Create both directories up front; existing directories are left untouched.
for local_dir in (model_path, metrics_path):
    local_dir.mkdir(parents=True, exist_ok=True)

# Download each artifact prefix from the run into its local directory.
# append_prefix=False is passed so files are not nested under the prefix path.
downloads = (
    ('outputs/saved_model', model_path),
    ('outputs/metrics', metrics_path),
)
for remote_prefix, local_dir in downloads:
    run.download_files(prefix=remote_prefix, output_directory=local_dir, append_prefix=False)