*Copyright (c) Microsoft Corporation. All rights reserved.*

*Licensed under the MIT License.*

# Text Classification of MultiNLI Sentences using BERT with Azure ML Pipelines

In [1]:
import os
import random
import shutil
import sys

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from azureml.core import Datastore, Experiment
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import MpiConfiguration, RunConfiguration
from azureml.data.data_reference import DataReference
from azureml.exceptions import ComputeTargetException
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import EstimatorStep, PythonScriptStep
from azureml.train.dnn import PyTorch
from azureml.widgets import RunDetails

# Make the repository root importable so the local `utils_nlp` package resolves.
sys.path.append("../../")
from utils_nlp.azureml import azureml_utils
from utils_nlp.bert.common import Language, Tokenizer
from utils_nlp.dataset.multinli import get_generator

## 0. Introduction

In this notebook, we fine-tune and evaluate a pretrained BERT model on a subset of the MultiNLI dataset by using Azure ML Pipelines.

In [2]:
# Notebook-wide configuration. Note that the generated scripts
# (preprocess.py and train.py) define their own copies of the values they
# need, so changing a value here does not change it inside those scripts.

# Local folders: raw batches are written under DATA_FOLDER before upload.
DATA_FOLDER = "../../data/temp"
TRAIN_FOLDER = DATA_FOLDER + "/train"
TEST_FOLDER = DATA_FOLDER + "/test"
BERT_CACHE_DIR = "../../data/temp"

# Dataset columns and label set.
LABEL_COL = "genre"
TEXT_COL = "sentence1"
ENCODED_LABEL_COL = "label"
TOKEN_COL = "tokens"
MASK_COL = "mask"
LABELS = ['telephone', 'government', 'travel', 'slate', 'fiction']

# Tokenization settings.
LANGUAGE = Language.ENGLISH
TO_LOWER = True
MAX_LEN = 150

# Training settings.
BATCH_SIZE = 32
NUM_GPUS = 2
NUM_EPOCHS = 1
# NOTE(review): TRAIN_SIZE is not referenced by the cells below; the batch
# split further down uses its own fraction. Confirm whether it is still needed.
TRAIN_SIZE = 0.6

# Number of batches requested from the dataset generator.
NUM_BATCHES = 100

In this example we use AzureML pipelines to run our training workflow. Each preprocessing step is included as a step in the pipeline. For a more detailed walkthrough of what pipelines are, along with getting-started guidelines, see this [notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-getting-started.ipynb). We start with some AzureML-related setup below.

### 0.1 Create a workspace

First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. This will create a config.json file containing the values needed below to create a workspace.

**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook

In [3]:
# Connection settings for the Azure ML workspace. The placeholders can be left
# untouched when a config.json sits in the same folder as this notebook.
workspace_settings = {
    "subscription_id": "<SUBSCRIPTION_ID>",
    "resource_group": "<RESOURCE_GROUP>",
    "workspace_name": "<WORKSPACE_NAME>",
    "workspace_region": "<WORKSPACE_REGION>",
}
ws = azureml_utils.get_or_create_workspace(**workspace_settings)

Performing interactive authentication. Please follow the instructions on the terminal.




Interactive authentication successfully completed.


### 0.2 Setup experiment and logging

In [4]:
# Repository root (relative to this notebook); the pipeline steps below use it
# as their source directory.
project_folder = "../../"

# Experiment under which this notebook's interactive run is recorded.
experiment_name = "pipelines-tc"
experiment = Experiment(workspace=ws, name=experiment_name)

# Start an interactive logging run for the experiment.
run = experiment.start_logging()

### 0.3 Create a compute target

In [5]:
# Name of the compute cluster to reuse, or to create when it does not exist.
cluster_name = "pipelines-tc-12"

# Provisioning settings; only used when a new cluster has to be created.
cluster_vm_size = "STANDARD_NC12"
cluster_max_nodes = 24

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print("Found existing compute target.")
except ComputeTargetException:
    # The lookup failed, so provision a new AmlCompute cluster under that name.
    print("Creating a new compute target...")
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=cluster_vm_size, max_nodes=cluster_max_nodes
    )

    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    # Block until provisioning has finished, echoing progress to the cell output.
    compute_target.wait_for_completion(show_output=True)

# get_status() returns a detailed status for the current AmlCompute.
print(compute_target.get_status().serialize())

Found existing compute target.
{'currentNodeCount': 2, 'targetNodeCount': 2, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 2, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-15T23:14:54.400000+00:00', 'errors': None, 'creationTime': '2019-07-11T23:45:19.957009+00:00', 'modifiedTime': '2019-07-11T23:45:42.001270+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 2, 'maxNodeCount': 24, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC12'}


## 1. Preprocessing

The pipeline is defined by a series of steps, the first being a PythonScriptStep which utilizes [DASK](https://dask.org/) to load dataframes in batches allowing us to load and preprocess different sets of data in parallel.

### 1.1 Read Dataset

In [None]:
# Request the MultiNLI "train" split from DATA_FOLDER as an iterable of
# batches; each batch is filtered and written to disk in the next section.
batches = get_generator(
    DATA_FOLDER,
    "train",
    num_batches=NUM_BATCHES,
    batch_size=10e7,
)

### 1.2 Preprocess and Tokenize

In the classification task, use the first sentence only as the text input, and the corresponding genre as the label. Select the examples corresponding to one of the entailment labels (*neutral* in this case) to avoid duplicate rows, as the sentences are not unique, whereas the sentence pairs are.

Once filtered, we encode the labels. To do this, fit a label encoder with the known labels in the MNLI dataset.

In [None]:
# Fraction of the batches (by position) written to the training folder; the
# remaining batches go to the test folder. This is separate from TRAIN_SIZE in
# the configuration cell.
TRAIN_BATCH_FRACTION = 0.8

for folder in (TRAIN_FOLDER, TEST_FOLDER):
    os.makedirs(folder, exist_ok=True)

# Fit the encoder on the fixed label set so every batch gets the same mapping.
labels = LABELS
label_encoder = LabelEncoder()
label_encoder.fit(labels)

for i, batch in enumerate(batches):
    # Keep one row per sentence: only the "neutral" pair of each premise.
    # .copy() makes the filtered frame independent, so adding a column below
    # does not trigger pandas' SettingWithCopyWarning.
    batch = batch[batch["gold_label"] == "neutral"].copy()
    batch[ENCODED_LABEL_COL] = label_encoder.transform(batch[LABEL_COL])

    if i < TRAIN_BATCH_FRACTION * NUM_BATCHES:
        target_folder = TRAIN_FOLDER
    else:
        target_folder = TEST_FOLDER
    batch.to_csv("{}/batch{}.csv".format(target_folder, i))

Once we have batches of data ready they are uploaded to the datastore.

In [None]:
# Push the locally written batch files to the workspace's default datastore.
ds = ws.get_default_datastore()
ds.as_mount()

# Options shared by both uploads.
upload_options = dict(overwrite=True, show_progress=False)
ds.upload(src_dir=TRAIN_FOLDER, target_path="mnli_data/train", **upload_options)
ds.upload(src_dir=TEST_FOLDER, target_path="mnli_data/test", **upload_options)

In [None]:
# Optional cleanup (disabled): uncomment the lines below to delete the local
# train/test batch folders.
#shutil.rmtree(TRAIN_FOLDER)
#shutil.rmtree(TEST_FOLDER)

We can now operate on each batch in parallel to tokenize the data and preprocess the tokens. To do this, we create a `PythonScriptStep` below.

In [16]:
%%writefile ../../utils_nlp/bert/preprocess.py
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import argparse
import logging
import os

import pandas as pd

from utils_nlp.bert.common import Language, Tokenizer

# Name of the genre column. NOTE(review): not referenced elsewhere in this
# script as far as visible here.
LABEL_COL = "genre"
# Column whose text is tokenized by tokenize().
TEXT_COL = "sentence1"
# Language and casing options passed to the BERT Tokenizer.
LANGUAGE = Language.ENGLISH
TO_LOWER = True
# Max length passed to preprocess_classification_tokens() in preprocess().
MAX_LEN = 150

# Module-level logger for this script.
logger = logging.getLogger(__name__)


def tokenize(df):
    """Run the BERT tokenizer over the text column of a dataframe.

    Args:
        df (pd.DataFrame): Training or test samples; the text is read from
            the TEXT_COL column.

    Returns:
        list: List of lists of tokens, as produced by the tokenizer for the
            documents in ``df``.
    """
    bert_tokenizer = Tokenizer(LANGUAGE, to_lower=TO_LOWER)
    documents = list(df[TEXT_COL])
    return bert_tokenizer.tokenize(documents)


def preprocess(tokens):
    """Turn tokenized documents into fixed-length BERT model inputs.

    The work is delegated to the tokenizer's classification preprocessing,
    which:
        - converts tokens into indices of the BERT tokenizer's vocabulary,
        - adds the special tokens [CLS] and [SEP] marking the beginning and
          end of a sentence,
        - pads or truncates each token list to MAX_LEN,
        - builds mask lists that indicate the padding positions,
        - builds token type id lists (discarded here, since they are not
          needed for one-sequence classification).

    Args:
        tokens (list): Tokenized documents, as returned by ``tokenize``.

    Returns:
        list: Token lists for the train or test set with special tokens added.
        list: Input mask.
    """
    bert_tokenizer = Tokenizer(LANGUAGE, to_lower=TO_LOWER)
    processed = bert_tokenizer.preprocess_classification_tokens(tokens, MAX_LEN)
    # The third element (token type ids) is intentionally dropped.
    token_ids, input_mask, _ = processed
    return token_ids, input_mask


# Command-line interface: one input CSV is tokenized, preprocessed and written
# as a single CSV into the output directory.
parser = argparse.ArgumentParser()
parser.add_argument("--input_data", type=str, help="input data")
parser.add_argument("--output_data", type=str, help="output data directory")
parser.add_argument(
    "--output_filename",
    type=str,
    # Default keeps the file name this script produced before the argument was
    # honoured, for callers that do not pass --output_filename.
    default="output_filename",
    help="output file name",
)

args = parser.parse_args()
input_data = args.input_data
output_data = args.output_data
output_filename = args.output_filename

if output_data is not None:
    print(output_data)
    os.makedirs(output_data, exist_ok=True)
    logger.info("%s created", output_data)

df = pd.read_csv(input_data)
tokens_array = tokenize(df)
tokens_array, mask_array = preprocess(tokens_array)

df['tokens'] = tokens_array
df['mask'] = mask_array

# Keep only the columns consumed by training.
cols = ['tokens', 'mask', 'label']
df = df[cols]
# Write under the requested file name (previously the literal string
# "output_filename" was used and the argument was ignored).
df.to_csv(os.path.join(output_data, output_filename))
logger.info("Completed")


Overwriting ../../utils_nlp/bert/preprocess.py


Create a conda environment for the steps below.

In [17]:
# Packages installed into the environment used by the preprocessing steps.
conda_package_list = ["numpy", "scikit-learn", "pandas"]
pip_package_list = [
    "azureml-sdk==1.0.43.*",
    "torch==1.1",
    "tqdm==4.31.1",
    "pytorch-pretrained-bert>=0.6",
]

conda_dependencies = CondaDependencies.create(
    conda_packages=conda_package_list,
    pip_packages=pip_package_list,
    python_version="3.6.8",
)

# Run configuration built on that environment, with Docker enabled.
run_config = RunConfiguration(conda_dependencies=conda_dependencies)
run_config.environment.docker.enabled = True

Then create the list of steps that use the preprocess.py created above. Add these steps into a pipeline and validate it to ensure there are no errors.

In [34]:
steps = []
ds = ws.get_default_datastore()

# Pipeline outputs that receive the processed train and test data. They are
# created once here and reused later as inputs of the training step.
train_dir = PipelineData(name="train_data", datastore=ds,
                         output_path_on_compute='mnli_data/processed_train')
test_dir = PipelineData(name="test_dir", datastore=ds,
                        output_path_on_compute='mnli_data/processed_test')

# One preprocessing job per entry: (batch file on the datastore, output that
# receives the processed result). One train batch and one test batch are used.
preprocess_jobs = [
    ('mnli_data/train/batch1.csv', train_dir),
    ('mnli_data/test/batch82.csv', test_dir),
]

for i, (batch_path, output_data) in enumerate(preprocess_jobs):
    input_data = DataReference(datastore=ds,
                               data_reference_name='batch_{}'.format(i),
                               path_on_datastore=batch_path,
                               overwrite=False)

    # Each step runs preprocess.py on one batch file.
    step = PythonScriptStep(
        name='preprocess_step_{}'.format(i),
        arguments=["--input_data", input_data,
                   "--output_data", output_data,
                   "--output_filename", 'batch{}.csv'.format(i)],
        script_name="utils_nlp/bert/preprocess.py",
        inputs=[input_data],
        outputs=[output_data],
        source_directory=project_folder,
        compute_target=compute_target,
        runconfig=run_config,
        allow_reuse=False,
    )

    steps.append(step)

# Assemble the steps into a pipeline and validate it before submission.
pipeline = Pipeline(workspace=ws, steps=steps)
pipeline.validate()

Data reference batch_0 is ready to be created [96cfa6c7], (Consumers of this data will generate new runs.)
Data reference batch_1 is ready to be created [7f84ffda], (Consumers of this data will generate new runs.)


[]

In [35]:
# Submit the preprocessing pipeline under its own experiment.
preprocessing_experiment = Experiment(ws, 'Preprocessing-MNLI')
pipeline_run1 = preprocessing_experiment.submit(pipeline, regenerate_outputs=False)
print("Pipeline is submitted for execution")

Created step preprocess_step_0 [dd07459a][b183cc73-1fb1-42e7-b646-84cd56c333fe], (This step will run and generate new outputs)
Created step preprocess_step_1 [9adec2b1][c574ff86-8a63-4e42-96d7-5ccc1d27909b], (This step will run and generate new outputs)
Created data reference batch_0 for StepId [96cfa6c7][a79c0d54-1eb9-43cc-9ca6-f7f4cf335311], (Consumers of this data will generate new runs.)
Created data reference batch_1 for StepId [7f84ffda][6911f16a-5584-40a5-a278-1a8a82a68931], (Consumers of this data will generate new runs.)
Submitted pipeline run: 5c872ce2-8c2a-4f55-ac20-5a11477d214b
Pipeline is submitted for execution


In [36]:
# Show the run-details widget for the preprocessing pipeline run.
preprocessing_widget = RunDetails(pipeline_run1)
preprocessing_widget.show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…

In [None]:
# ToDo: Clean up local preprocess file

## 2. Training

Once the data is processed and available on the datastore, we train the classifier using the training examples. This involves fine-tuning the BERT Transformer and learning a linear classification layer on top of that.

The training is distributed across nodes, using AzureML's support for distributed training via MPI with Horovod.

### 2.1 Setup training script

In [37]:
%%writefile ../../utils_nlp/bert/train.py


# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import os
import logging
import argparse
import pickle

from sklearn.metrics import classification_report

from utils_nlp.bert.common import Language
from utils_nlp.bert.sequence_classification_distributed import BERTSequenceDistClassifier
from utils_nlp.common.timer import Timer

# Batch size passed to both classifier.fit() and classifier.predict().
BATCH_SIZE = 32
# Number of GPUs passed to both classifier.fit() and classifier.predict().
NUM_GPUS = 2
# Number of epochs passed to classifier.fit().
NUM_EPOCHS = 1
# Genre labels: their count sets num_labels of the classifier and they are used
# as target_names in the classification report.
LABELS = ["telephone", "government", "travel", "slate", "fiction"]

# Module-level logger for this script.
logger = logging.getLogger(__name__)


# Command-line interface: every option is a plain string.
parser = argparse.ArgumentParser()
for option, description in (
    ("--input_train_dir", "Training data"),
    ("--input_test_dir", "Test data"),
    ("--result_dir", "Results directory containing confidence report"),
    ("--result_file", "File name for confidence report"),
):
    parser.add_argument(option, type=str, help=description)

args = parser.parse_args()
train_dir = args.input_train_dir
test_dir = args.input_test_dir
result_dir = args.result_dir
result_file = args.result_file

# Create the results directory up front when one was given.
if result_dir is not None:
    os.makedirs(result_dir, exist_ok=True)
    logger.info("%s created", result_dir)

# Train
classifier = BERTSequenceDistClassifier(
    language=Language.ENGLISH,
    num_labels=len(LABELS),
)
with Timer() as training_timer:
    classifier.fit(
        train_dir,
        num_gpus=NUM_GPUS,
        num_epochs=NUM_EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=True,
    )
# The timer interval is divided by 3600 before it is logged.
logger.info("Training Time {}".format(training_timer.interval / 3600))

# Predict
preds, labels_test = classifier.predict(
    test_dir,
    num_gpus=NUM_GPUS,
    batch_size=BATCH_SIZE,
)

# Build the classification report and pickle it into the results directory.
report = classification_report(labels_test, preds, target_names=LABELS)
report_path = os.path.join(result_dir, result_file)
with open(report_path, 'wb') as fp:
    pickle.dump(report, fp, protocol=pickle.HIGHEST_PROTOCOL)


Overwriting ../../utils_nlp/bert/train.py


### 2.2 Create a Pytorch Estimator

We create a PyTorch Estimator using the AzureML SDK and additionally define an EstimatorStep to run it on AzureML pipelines.

In [38]:
ds = ws.get_default_datastore()

# The training step consumes train_dir and test_dir, the PipelineData outputs
# of the preprocessing steps. To train on data that is already processed and
# stored on the datastore instead, replace them with DataReference objects
# pointing at 'mnli_data/processed_train' and 'mnli_data/processed_test'.

# Pipeline output that receives the pickled classification report.
result_dir = PipelineData(name="results",
                          datastore=ds,
                          output_path_on_compute='mnli_data/results')
result_file = 'result.p'

In [39]:
# Script parameters keyed by command-line flag. They are not passed to the
# estimator in this cell.
script_params = {}
script_params['--input_train_dir'] = train_dir
script_params['--input_test_dir'] = test_dir

# Extra packages for the training environment.
estimator_conda_packages = ['scikit-learn=0.20.3', 'numpy>=1.16.0', 'pandas']
estimator_pip_packages = ["tqdm==4.31.1", "pytorch-pretrained-bert>=0.6"]

# PyTorch estimator running train.py with an MPI configuration on two nodes,
# one process per node.
estimator = PyTorch(
    source_directory=project_folder,
    compute_target=compute_target,
    entry_script='utils_nlp/bert/train.py',
    node_count=2,
    process_count_per_node=1,
    distributed_training=MpiConfiguration(),
    use_gpu=True,
    conda_packages=estimator_conda_packages,
    pip_packages=estimator_pip_packages,
)



In [40]:
# Arguments handed to train.py: flags followed by their pipeline data objects
# (or plain values).
train_script_arguments = [
    '--input_train_dir', train_dir,
    '--input_test_dir', test_dir,
    '--result_dir', result_dir,
    '--result_file', result_file,
]

# Pipeline step wrapping the estimator; it reads the processed train/test data
# and writes into result_dir.
est_step = EstimatorStep(
    name="Estimator-Train",
    estimator=estimator,
    estimator_entry_script_arguments=train_script_arguments,
    inputs=[train_dir, test_dir],
    outputs=[result_dir],
    runconfig_pipeline_params=None,
    compute_target=compute_target,
)

In [41]:
# Build the training pipeline from the estimator step and validate it.
training_steps = [est_step]
pipeline = Pipeline(workspace=ws, steps=training_steps)
pipeline.validate()

Step Estimator-Train is ready to be created [8129dbe1]


[]

In [43]:
# Submit the training pipeline under its own experiment.
training_experiment = Experiment(ws, 'TC-Training-BERT')
pipeline_run = training_experiment.submit(pipeline)

Created step Estimator-Train [8129dbe1][f3b7072d-71ff-4270-8ff0-5dfa6a95c4c2], (This step will run and generate new outputs)
Created step preprocess_step_0 [fa5967c4][b183cc73-1fb1-42e7-b646-84cd56c333fe], (This step will run and generate new outputs)
Created step preprocess_step_1 [0470ec39][c574ff86-8a63-4e42-96d7-5ccc1d27909b], (This step will run and generate new outputs)
Using data reference batch_0 for StepId [5d90d30f][a79c0d54-1eb9-43cc-9ca6-f7f4cf335311], (Consumers of this data are eligible to reuse prior runs.)
Using data reference batch_1 for StepId [ae824316][6911f16a-5584-40a5-a278-1a8a82a68931], (Consumers of this data are eligible to reuse prior runs.)
Submitted pipeline run: 17e25a9d-7ba0-487d-84b3-f84a792ef997


In [44]:
# Show the run-details widget for the training pipeline run.
training_widget = RunDetails(pipeline_run)
training_widget.show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', '…