In [1]:
# !pip install azureml
# !pip install azureml-core --user
# !pip install azureml.widgets
#!pip install azureml.dataprep

In [1]:
# For automatic reloading of modified libraries
%reload_ext autoreload
%autoreload 2

# Regular python libraries
import os
import requests
import sys
import json
import statistics

import torch

# AzureML libraries
import azureml
import azureml.core
from azureml.core import Experiment, Workspace, Datastore, ScriptRunConfig
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.container_registry import ContainerRegistry
from azureml.core.runconfig import MpiConfiguration, RunConfiguration, DEFAULT_GPU_IMAGE
from azureml.widgets import RunDetails

# Check core SDK version number
#print("SDK version:", azureml.

In [2]:
subscription_id = '42ae47bd-b19b-42c1-b0b9-19fd5be9d51b'
resource_group = 'bert-base'
workspace_name = 'SubstrateIntelligenceNLR-WS2'
ws = Workspace(subscription_id, resource_group, workspace_name)
ws_details = ws.get_details()
print('Name:\t\t{}\nLocation:\t{}'
      .format(ws_details['name'],
              ws_details['location']))

Name:		SubstrateIntelligenceNLR-WS2
Location:	eastus


In [3]:
from azureml.core import Datastore
ds = Datastore.register_azure_blob_container(workspace=ws, 
                                             datastore_name='default',
                                             container_name='azureml-blobstore-d6fc2475-ad02-44a7-90ff-88a2a91e66b1',
                                             account_name='substrateintel3704284680', 
                                             account_key = 'replaceme',
                                             create_if_not_exists=True
                                            )

print('Datastore name: ' + ds.name, 
      'Container name: ' + ds.container_name, 
      'Datastore type: ' + ds.datastore_type, 
      'Workspace name: ' + ds.workspace.name, sep = '\n')

Datastore name: default
Container name: azureml-blobstore-d6fc2475-ad02-44a7-90ff-88a2a91e66b1
Datastore type: AzureBlob
Workspace name: SubstrateIntelligenceNLR-WS2


## Compute

In [4]:
gpu_cluster_name = "sriovdedicated1"
gpu_compute_target = ComputeTarget(workspace=ws, name=gpu_cluster_name)
print(gpu_compute_target.status.serialize())

{'currentNodeCount': 9, 'targetNodeCount': 9, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 9, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-01-22T10:27:14.096000+00:00', 'errors': None, 'creationTime': '2020-09-08T21:22:56.219502+00:00', 'modifiedTime': '2021-01-22T08:44:17.163555+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 9, 'maxNodeCount': 16, 'nodeIdleTimeBeforeScaleDown': 'PT1200S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC24RS_V3'}


## Bart seq to seq

In [5]:
script_name = 'train.py'
codepath = '.'

from azureml.core import Dataset
from azureml.data import OutputFileDatasetConfig

#create input/output datasets
def ds_path(path):
    try:
        return Dataset.File.from_files(ds.path(path))
    except Exception as e:
        print(f'Using {path} as output')
        return OutputFileDatasetConfig(destination=(ds, path))

processes = 1
op = 'preprocess'
def get_args():
    all_params_default = [
                    '--data_path', ds_path(f'krishan/bart/cnn_dm').as_download(),
                    '--config_path', 'config-prod.yaml',
                    '--tmgr.gpu_batch_size_limit',8,
                    '--dist',
                    '--chkp.save_dir', ds_path(f'krishan/bart/ckpts/cnndm_sum').as_mount(),
    ]
    return all_params_default

In [6]:
print(get_args())

['--data_path', <azureml.data.dataset_consumption_config.DatasetConsumptionConfig object at 0x000001B95B0066A0>, '--config_path', 'config-prod.yaml', '--tmgr.gpu_batch_size_limit', 8, '--dist', '--chkp.save_dir', <azureml.data.dataset_consumption_config.DatasetConsumptionConfig object at 0x000001B95AFF8700>]


In [7]:
from azureml.core import Environment
myenv = Environment(name="myenv")

# Creates the environment inside a Docker container.
myenv.docker.enabled = True
myenv.docker.base_image = 'krishansubudhi/marlin:latest'
myenv.python.interpreter_path = '/opt/miniconda/envs/marlin/bin/python'
myenv.python.user_managed_dependencies = True

In [8]:
mpi = MpiConfiguration() 
mpi.process_count_per_node = 4 #NC SKU has 4 GPU's per node
mpi.node_count = 1 #scale to the amount of nodes you'd like

In [9]:
config = ScriptRunConfig(source_directory=codepath,
                         script=script_name,
                         arguments = get_args(),
                         compute_target=gpu_compute_target,
                         environment=myenv,
                         distributed_job_config=mpi)

experiment_name = 'marlin_bart_seq2seqft'
experiment = Experiment(ws, name=experiment_name)

run = experiment.submit(config)

run.tag('nodes', f'{mpi.node_count}')
print("Submitted run")

Submitted run


In [10]:
run

Experiment,Id,Type,Status,Details Page,Docs Page
marlin_bart_seq2seqft,marlin_bart_seq2seqft_1611786404_8262c574,azureml.scriptrun,Preparing,Link to Azure Machine Learning studio,Link to Documentation
