Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

## 1.0 Connect to Workspace
Initialize a Workspace object from the existing workspace you created in the Prerequisites step

In [None]:
from azureml.core import Workspace

# Load the workspace described by the local Azure ML configuration.
try:
    ws = Workspace.from_config()
except Exception as err:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and show the underlying
    # reason instead of hiding it behind a generic message.
    print('Workspace not found')
    print('Reason:', err)
else:
    # Only reached when the workspace was loaded successfully.
    print(ws.name, ws.location, ws.resource_group, sep='\t')
    print('Library configuration succeeded')

In [None]:
# Ask the workspace for its default datastore and keep a handle to it in `ds`
ds = ws.get_default_datastore()

In [None]:
%%writefile train.py

"""Fine-tune a pretrained BERT token classifier for NER as an Azure ML run.

Steps, in order:
1. download the registered file dataset ``ner_ds_file`` from the run's workspace,
2. preprocess the CoNLL-formatted text for BERT,
3. fine-tune ``bert-base-cased`` for ``NUM_TRAIN_EPOCHS`` epoch(s),
4. upload the model weights and the label map (as JSON) under ``outputs/``.
"""
import argparse
import json
import os
import random
import sys
from tempfile import TemporaryDirectory

import pandas as pd
import torch
from azureml.core import Dataset, Run
from seqeval.metrics import classification_report

from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.common.timer import Timer
from utils_nlp.dataset.ner_utils import preprocess_conll
from utils_nlp.models.transformers.named_entity_recognition import (
    TokenClassificationProcessor, TokenClassifier)


NUM_TRAIN_EPOCHS = 1

# Scratch directories. The TemporaryDirectory objects are kept referenced for
# the lifetime of the script: an unreferenced TemporaryDirectory is removed as
# soon as the object is destroyed, which would leave only a dangling path name.
_DATA_TMP = TemporaryDirectory()
_CACHE_TMP = TemporaryDirectory()
# working directory for files this script writes before uploading them
DATA_PATH = _DATA_TMP.name
# passed as cache_dir to the processor and the classifier during fine tuning
CACHE_DIR = _CACHE_TMP.name
# local file the label map is serialized to before it is uploaded
LABEL_MAP_FILE = os.path.join(DATA_PATH, "labelfile.txt")

# set random seeds
RANDOM_SEED = 100
torch.manual_seed(RANDOM_SEED)

# model configurations
MODEL_NAME = "bert-base-cased"
DO_LOWER_CASE = False
TRAILING_PIECE_TAG = "X"
DEVICE = "cuda"  # kept for reference; nothing below reads it
MAX_LEN = 256
BATCH_SIZE = 16


# ---- get data ----
run = Run.get_context()
workspace = run.experiment.workspace
dataset_name = 'ner_ds_file'
# Get a dataset by name
file_ds = Dataset.get_by_name(workspace=workspace, name=dataset_name)
file_downloads = file_ds.download()
if not file_downloads:
    # Fail with a clear message instead of an IndexError on the lookup below.
    raise RuntimeError("Dataset '{}' did not download any files".format(dataset_name))

# preprocess conll format (only the first downloaded file is used)
with open(file_downloads[0], "r", encoding="utf8") as file:
    text = file.read()

sentence_list, labels_list = preprocess_conll(text)

processor = TokenClassificationProcessor(
    model_name=MODEL_NAME, to_lower=DO_LOWER_CASE, cache_dir=CACHE_DIR
)

label_map = TokenClassificationProcessor.create_label_map(
    label_lists=labels_list, trailing_piece_tag=TRAILING_PIECE_TAG
)

train_dataset = processor.preprocess_for_bert(
    text=sentence_list,
    max_len=MAX_LEN,
    labels=labels_list,
    label_map=label_map,
    trailing_piece_tag=TRAILING_PIECE_TAG,
)

train_dataloader = dataloader_from_dataset(
    train_dataset, batch_size=BATCH_SIZE, num_gpus=None, shuffle=True, distributed=False
)


# ---- train ----
# Instantiate a TokenClassifier class for NER using pretrained transformer model
model = TokenClassifier(
    model_name=MODEL_NAME,
    num_labels=len(label_map),
    cache_dir=CACHE_DIR
)

# Fine tune the model using the training dataset
with Timer() as t:
    model.fit(
        train_dataloader=train_dataloader,
        num_epochs=NUM_TRAIN_EPOCHS,
        num_gpus=None,
        local_rank=-1,
        weight_decay=0.0,
        learning_rate=5e-5,
        adam_epsilon=1e-8,
        warmup_steps=0,
        verbose=True,
        seed=RANDOM_SEED,
    )


# ---- save and upload artifacts ----
# Save the fine-tuned weights locally, then attach them to the current run
torch.save(model.model.state_dict(), 'nlprecipes_bert_ner.model')
run.upload_file("outputs/nlprecipes_bert_ner.model", "nlprecipes_bert_ner.model")

# Save the label map as a json file so the score script can load it as a dictionary
with open(LABEL_MAP_FILE, "w", encoding="utf8") as f:
    json.dump(label_map, f)

run.upload_file("outputs/labelfile.txt", LABEL_MAP_FILE)


## 3.0 Create and Attach Compute for model training
There are two compute options: run-once (preview) and persistent compute. For this demo we will use persistent compute; to learn more about run-once compute, check out the docs. The code below requests the VM size `Standard_NV12s_v2`; if that size is not available in your subscription, use `Standard_NC12s_v2` or a similar GPU size instead.

### *Important*
Run-based creation of Azure Machine Learning compute is currently in Preview. Don't use run-based creation if you use automated hyperparameter tuning or automated machine learning. To use hyperparameter tuning or automated machine learning, create a persistent compute target instead.

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the GPU cluster used for training
cluster_name = "kmgpu-cluster"

try:
    # A ComputeTargetException here is treated as "no cluster with this name yet"
    cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # Provision a new cluster sized between 1 and 4 nodes
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_NV12s_v2',
        min_nodes=1,
        max_nodes=4,
    )
    cluster = ComputeTarget.create(ws, cluster_name, compute_config)
    # Wait for provisioning to finish, showing progress in the cell output
    cluster.wait_for_completion(show_output=True)
else:
    print('Found existing cluster, use it.')

## 4.0 Create An Experiment

Create an [Experiment](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#experiment/?WT.mc_id=bert-notebook-abornst) to track all the runs in your workspace for this distributed PyTorch tutorial. 

In [None]:
from azureml.core import Experiment

# Name under which the runs of this notebook are grouped in the workspace
experiment_name = 'bertkmnlp'
exp = Experiment(name=experiment_name, workspace=ws)

In [None]:
# Settings for the remote training environment.

# Packages installed with pip (pinned where a specific version is required)
PIP_PACKAGES = [
    "seqeval[gpu]",
    "torch==1.4",
    "tqdm==4.31.1",
    "transformers==2.8.0",
    "nltk==3.5",
    "azureml-sdk==1.3.0",
]
# Packages installed with conda
CONDA_PACKAGES = [
    "numpy",
    "scikit-learn",
    "pandas",
]
# Local path of the utils_nlp wheel that is added to the environment
utils_nlp_file = "./nlp-recipes-utils/utils_nlp-2.0.0-py3-none-any.whl"
PYTHON_VERSION = "3.6.8"
# When True, the GPU base image is selected for the run's docker settings
USE_GPU = True

In [None]:
# Build the remote environment and the script run configuration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import ScriptRunConfig
from azureml.core.environment import Environment, DEFAULT_GPU_IMAGE

myenv = Environment(name="myenv")

# Conda/pip dependency specification for the training environment
conda_dependencies = CondaDependencies.create(
    python_version=PYTHON_VERSION,
    conda_packages=CONDA_PACKAGES,
    pip_packages=PIP_PACKAGES,
)

# Register the local utils_nlp wheel with the workspace; the returned value is
# then added to the pip dependencies below.
# An alternative approach is described at
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-environments#add-packages-to-an-environment
nlp_repo_whl = Environment.add_private_pip_wheel(
    exist_ok=True,
    file_path=utils_nlp_file,
    workspace=ws,
)
conda_dependencies.add_pip_package(nlp_repo_whl)

# Attach the dependencies to the Python section of the environment
myenv.python.conda_dependencies = conda_dependencies

# Run configuration for the training script
runconfig = ScriptRunConfig(script="train.py", source_directory=".")
run_settings = runconfig.run_config

# Compute target the script runs on
run_settings.target = cluster

# Environment the script runs in, with docker enabled
run_settings.environment = myenv
run_settings.environment.docker.enabled = True

if USE_GPU:
    run_settings.environment.docker.base_image = DEFAULT_GPU_IMAGE




In [None]:
# Submit the configured script run to the experiment
run = exp.submit(runconfig)
# Last expression of the cell, so the notebook displays the run id
run.id

In [None]:
%%time
# Wait for the run to complete; show_output=True shows the run's output on stdout.
run.wait_for_completion(show_output=True)

Note: this experiment typically takes 1-2 hours to complete

## 5.0 Register the Model
Register the model "bertkm_ner" that was created in the last step.

In [None]:
# In case you lost access to the notebook while the run was still in progress, re-attach to the run:
# from azureml.core import Experiment, Run
# experiment_name = 'bertkmnlp'

# exp = Experiment(workspace=ws, name=experiment_name)
# run=Run(exp, "bertkmnlp_XXXXXXXXXXXXXXX", outputs=None)

In [None]:
# Register the weights file uploaded by the run under the name 'bertkm_ner'
model = run.register_model(
    model_path='outputs/nlprecipes_bert_ner.model',
    model_name='bertkm_ner',
)
# Tab-separated summary of the registered model
print(f"{model.name}\t{model.id}\t{model.version}")

Download the label map file, which will be used by the inferencing script.

In [None]:
# Fetch the label map that train.py uploaded to the run as "outputs/labelfile.txt"
run.download_file("outputs/labelfile.txt")

## Next Step

Next, we need to deploy the model as a web service. Follow the steps in 03_Deploy_to_AKS.ipynb to deploy the model to AKS.

Note: You can debug the score script locally. Follow the steps in 04_Debug_Score_Script.ipynb to develop and debug the score script locally.