In [None]:
# Enable IPython's autoreload extension so that local libraries edited on disk
# are picked up again without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Regular python libraries
import os
import requests
import sys
import json
import statistics

import torch

# AzureML libraries
import azureml
import azureml.core
from azureml.core import Experiment, Workspace, Datastore, ScriptRunConfig
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.container_registry import ContainerRegistry
from azureml.core.runconfig import MpiConfiguration, RunConfiguration, DEFAULT_GPU_IMAGE
from azureml.train.estimator import Estimator
from azureml.widgets import RunDetails

# Report which azureml-core SDK version this kernel is running against.
sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

## Initialize workspace

To create or access an Azure ML Workspace, you will need to import the AML library and provide the following information:
* A name for your workspace
* Your subscription id
* The resource group name

Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace you created in the Prerequisites step or create a new one. 

In [None]:
# Coordinates of the target workspace; replace each placeholder with a real value.
subscription_id = '<subscription_id>'
resource_group = '<resource_group>'
workspace_name = '<workspace_name>'

# Connect to the workspace and fetch its details from the service.
ws = Workspace(subscription_id=subscription_id,
               resource_group=resource_group,
               workspace_name=workspace_name)
ws_details = ws.get_details()

# Print the workspace name and location as a quick sanity check.
summary_template = 'Name:\t\t{}\nLocation:\t{}'
print(summary_template.format(ws_details['name'], ws_details['location']))

In [None]:
from azureml.core import Datastore

# Use the workspace's default datastore. A specific datastore can be selected
# instead with Datastore.get(ws, 'default') or Datastore.get(ws, 'workspaceblobstore');
# ws.datastores lists everything registered on the workspace.
ds = ws.get_default_datastore()

# Summarise the selected datastore, one attribute per line.
datastore_summary = [
    'Datastore name: ' + ds.name,
    'Container name: ' + ds.container_name,
    'Datastore type: ' + ds.datastore_type,
    'Workspace name: ' + ds.workspace.name,
]
print('\n'.join(datastore_summary))


In [None]:
# Name of the compute cluster to reuse or create; replace the placeholder.
gpu_cluster_name = "<cluster name>"

# Reuse the cluster when it already exists and is healthy. If it is missing
# (ComputeTarget raises ComputeTargetException) or is in the 'Failed' state,
# fall through to the except branch and provision a new one.
try:
    gpu_compute_target = ComputeTarget(workspace=ws, name=gpu_cluster_name)
    if gpu_compute_target.provisioning_state == 'Failed':
        # A failed cluster cannot be reused: delete it and block until the
        # deletion has finished. is_delete_operation=True tells the SDK that
        # this wait is for a delete rather than for provisioning.
        gpu_compute_target.delete()
        gpu_compute_target.wait_for_completion(show_output=True,
                                               is_delete_operation=True)
        # Raised on purpose to reuse the creation path in the except branch.
        raise ComputeTargetException('failed cluster')
    print('Found existing compute target.')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_priority='lowpriority',
                                                           vm_size='Standard_NC24rs_v3',
                                                           min_nodes=0, max_nodes=16)
    # ^^^ Change to min_nodes=8 and max_nodes=64 when testing is completed^^^

    # Create the cluster and wait until provisioning has finished.
    gpu_compute_target = ComputeTarget.create(ws, gpu_cluster_name, compute_config)
    gpu_compute_target.wait_for_completion(show_output=True)

# Use the 'status' property to get a detailed status for the current cluster.
print(gpu_compute_target.status.serialize())

In [None]:
from azureml.train.dnn import PyTorch
from azureml.core.runconfig import RunConfiguration
from azureml.core.container_registry import ContainerRegistry

# Run configuration whose Python environment is flagged as user-managed.
run_user_managed = RunConfiguration()
setattr(run_user_managed.environment.python, 'user_managed_dependencies', True)

### GermEval setup

In [None]:
# Entry script to submit and the source directory that contains it.
script_name, codepath = 'test.py', '.'

from azureml.core import Dataset
from azureml.data import OutputFileDatasetConfig

#create input/output datasets
def ds_input_path(path):
    """Return a file dataset created from `path` on the notebook's datastore `ds`."""
    datastore_path = ds.path(path)
    return Dataset.File.from_files(datastore_path)
def ds_output_path(path):
    """Return an OutputFileDatasetConfig whose destination is `path` on datastore `ds`."""
    destination = (ds, path)
    return OutputFileDatasetConfig(destination=destination)

def get_args():
    """Return the command-line arguments handed to the training script.

    The result is a new flat list on every call, alternating option names
    with their values.
    """
    option_pairs = (
        ('--data.train_filepath', './train_germ/train.tsv'),
        ('--data.val_filepath', './val_germ/dev.tsv'),
        ('--config_path', 'config_germ.yaml'),
        ('--ckpt.model_state_save_dir', './mod_ckpts'),
        ('--ckpt.save_dir', './ckpts'),
    )
    # Flatten (option, value) pairs into the flat list the script expects.
    return [token for pair in option_pairs for token in pair]

In [None]:
from azureml.core import Environment
# Experiment under which the training run is recorded.
experiment_name = 'marlin_ner_train_plugin_germ'
experiment = Experiment(workspace=ws, name=experiment_name)

# MPI launch settings for the job.
# NOTE(review): the original comment says the NC SKU has 4 GPUs per node, yet
# only one process per node is requested - confirm this is intended.
mpi = MpiConfiguration()
mpi.node_count = 1  # scale to the amount of nodes you'd like
mpi.process_count_per_node = 1  # NC SKU has 4 GPU's per node

# Environment based on a prebuilt Docker image; dependencies are marked as
# user-managed and the interpreter inside the image is named explicitly.
myenv = Environment(name="myenv")
myenv.python.user_managed_dependencies = True
myenv.python.interpreter_path = '/opt/miniconda/envs/elr2/bin/python'
myenv.docker.base_image = 'jonrsleep/elr2:latest'

# Bundle script, arguments, compute target, environment and MPI settings.
config = ScriptRunConfig(
    source_directory=codepath,
    script=script_name,
    arguments=get_args(),
    compute_target=gpu_compute_target,
    environment=myenv,
    distributed_job_config=mpi,
)

# Submit the run and tag it so it is easy to find later.
run = experiment.submit(config)
run.tag('nodes', f'{mpi.node_count}')
run.tag('exp', 'lr 3e-5 ')
print("Submitted run")

In [None]:
# Show the live details widget for the most recently submitted run.
run_details_widget = RunDetails(run)
run_details_widget.show()

### Model checkpoint modification

In [None]:
import torch
from collections import OrderedDict
# Load the training checkpoint onto the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
checkpoint_file = 'marlin_0.bin'
state_dict = torch.load(checkpoint_file, map_location='cpu')

In [None]:
## Modify the file names to point to your model checkpoint.
# Keep only the entries stored under the 'model.' prefix and strip that prefix.
# The prefix is sliced off rather than removed with str.replace, because
# replace would also delete any later 'model.' occurrence inside a key
# (e.g. 'model.encoder.model.x') and silently corrupt the parameter name.
model_prefix = 'model.'
new_dict = OrderedDict(
    (key[len(model_prefix):], value)
    for key, value in state_dict['module_interface_state'].items()
    if key.startswith(model_prefix)
)
torch.save(new_dict, 'marlin_model.bin')

### Run Inference - modify test.py to remove trainer.train()

In [None]:
# Entry script to submit and the source directory that contains it.
script_name, codepath = 'test.py', '.'

from azureml.core import Dataset
from azureml.data import OutputFileDatasetConfig

#create input/output datasets
def ds_input_path(path):
    """Return a file dataset created from `path` on the notebook's datastore `ds`."""
    datastore_path = ds.path(path)
    return Dataset.File.from_files(datastore_path)
def ds_output_path(path):
    """Return an OutputFileDatasetConfig whose destination is `path` on datastore `ds`."""
    destination = (ds, path)
    return OutputFileDatasetConfig(destination=destination)

def get_args():
    """Return the command-line arguments handed to the script for inference.

    The result is a new flat list on every call, alternating option names
    with their values. The '--model.model_path' value is a placeholder that
    must be edited to point at the model directory.
    """
    option_pairs = (
        ('--data.train_filepath', './train_germ/train.tsv'),
        ('--data.val_filepath', './val_germ/dev.tsv'),
        ('--config_path', 'config_germ.yaml'),
        ('--model.model_path', '< Modify to point to model directory>'),
        ('--model.model_file', 'marlin_model.bin'),
    )
    # Flatten (option, value) pairs into the flat list the script expects.
    return [token for pair in option_pairs for token in pair]

In [None]:
from azureml.core import Environment
# Experiment under which the inference run is recorded.
experiment_name = 'marlin_ner_train_plugin_germ_inf'
experiment = Experiment(workspace=ws, name=experiment_name)

# MPI launch settings for the job.
# NOTE(review): the original comment says the NC SKU has 4 GPUs per node, yet
# only one process per node is requested - confirm this is intended.
mpi = MpiConfiguration()
mpi.node_count = 1  # scale to the amount of nodes you'd like
mpi.process_count_per_node = 1  # NC SKU has 4 GPU's per node

# Environment based on a prebuilt Docker image; dependencies are marked as
# user-managed and the interpreter inside the image is named explicitly.
myenv = Environment(name="myenv")
myenv.python.user_managed_dependencies = True
myenv.python.interpreter_path = '/opt/miniconda/envs/elr2/bin/python'
myenv.docker.base_image = 'jonrsleep/elr2:latest'

# Bundle script, arguments, compute target, environment and MPI settings.
config = ScriptRunConfig(
    source_directory=codepath,
    script=script_name,
    arguments=get_args(),
    compute_target=gpu_compute_target,
    environment=myenv,
    distributed_job_config=mpi,
)

# Submit the run and tag it so it is easy to find later.
run = experiment.submit(config)
run.tag('nodes', f'{mpi.node_count}')
run.tag('exp', 'lr 3e-5 ')
print("Submitted run")