# Multi-lingual Inference on XNLI Dataset using BERT

## Summary
In this notebook, we demostrate using the [Multi-lingual BERT model](https://github.com/google-research/bert/blob/master/multilingual.md) to do language inference in Chinese and Hindi. We use the [XNLI](https://github.com/facebookresearch/XNLI) dataset and the task is to classify sentence pairs into three classes: contradiction, entailment, and neutral.   
The figure below shows how [BERT](https://arxiv.org/abs/1810.04805) classifies sentence pairs. It concatenates the tokens in each sentence pairs and separates the sentences by the [SEP] token. A [CLS] token is prepended to the token list and used as the aggregate sequence representation for the classification task.
<img src="https://nlpbp.blob.core.windows.net/images/bert_two_sentence.PNG">

In [1]:
import sys
import os
import random
import torch
import shutil

nlp_path = os.path.abspath('../../')
if nlp_path not in sys.path:
    sys.path.insert(0, nlp_path)
from utils_nlp.azureml.azureml_utils import get_or_create_workspace

import azureml.core
from azureml.core.workspace import Workspace
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Datastore
import azureml.data

from azureml.data.azure_storage_datastore import AzureFileDatastore
from azureml.train.dnn import PyTorch
from azureml.core.runconfig import MpiConfiguration
from azureml.core import Experiment
from azureml.widgets import RunDetails

## Configurations
Note that the running time shown in this notebook are on a Standard_NC12 Azure Deep Learning Virtual Machine with two NVIDIA Tesla K80 GPUs. If you want to run through the notebook quickly, you can change the `TRAIN_DATA_USED_PERCENT` to a small number, e.g. 0.01. 

In [2]:

# azureml configuration
AZUREML_VERBOSE = True
cluster_name = "eval-gpu"  # Name of AzureML Compute Target cluster

# debug flag
DEBUG = True

In [3]:
# Let's load the workspace from the configuration file
ws = Workspace.from_config()
print("Workspace was loaded successfully from the configuration file")

Workspace was loaded successfully from the configuration file


In [None]:
'''
ws = get_or_create_workspace(
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
    workspace_region="<WORKSPACE_REGION>",
)


print("Workspace name: {}".format(ws.name))
print("Resource group: {}".format(ws.resource_group))

'''

In [4]:
print("Workspace name: {}".format(ws.name))
print("Resource group: {}".format(ws.resource_group))

Workspace name: MAIDAPTest
Resource group: nlprg


In [5]:
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print("Found compute target: {}".format(cluster_name))
except ComputeTargetException:
    print("Creating new compute target: {}".format(cluster_name))
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_NC6", max_nodes=1
    )
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
    compute_target.wait_for_completion(show_output=True)

if AZUREML_VERBOSE:
    print(compute_target.get_status().serialize())

Found compute target: eval-gpu
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-09T16:55:15.003000+00:00', 'errors': None, 'creationTime': '2019-06-25T18:13:14.313025+00:00', 'modifiedTime': '2019-06-25T18:13:30.200677+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}


import numpy as np
from utils_nlp.bert.common import Language, Tokenizer
from sklearn.preprocessing import LabelEncoder

# model configurations
LANGUAGE = Language.ENGLISH
TO_LOWER_CASE = True
MAX_SEQ_LENGTH = 128

# data configs
TEXT_COL = "text"
LABEL_COL = "label"


print("Create a tokenizer...")
tokenizer= Tokenizer(language=LANGUAGE, to_lower=TO_LOWER_CASE, cache_dir=CACHE_DIR)
train_tokens = tokenizer.tokenize(train_df[TEXT_COL])

print("Tokenize and preprocess text...")
#tokenize
train_token_ids, train_input_mask, train_token_type_ids = \
tokenizer.preprocess_classification_tokens(train_tokens, max_len=MAX_SEQ_LENGTH)

#preprocess
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df[LABEL_COL])
num_labels = len(np.unique(train_labels))

type(train_labels)

input_ids_tensor = torch.tensor(input_ids, dtype=torch.long)
input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
#label_ids_tensor = torch.tensor(label_ids, dtype=torch.long)

tensor_data = TensorDataset(input_ids_tensor, input_mask_tensor)
batch_size = 16
DataLoader(tensor_data, sampler, batch_size)

type(train_df[LABEL_COL])

from utils_nlp.bert.common import create_data_loader
BATCH_SIZE = 16
bert_dl = create_data_loader(train_token_ids, train_input_mask,None,  "random", batch_size=BATCH_SIZE)


bert_dl

In [2]:
#add if makedirs
from utils_nlp.dataset.xnli import load_pandas_df
TRAIN_DATA_USED_PERCENT = 0.01
CACHE_DIR = "../../temp1"
print("load data...")
train_df = load_pandas_df(local_cache_path=CACHE_DIR, file_split="train", language="en")
print("English training dataset size: {}".format(train_df.shape[0]))
print(train_df.head())
train_data_used_count = round(TRAIN_DATA_USED_PERCENT * train_df.shape[0])
train_df = train_df.loc[:train_data_used_count]


load data...


100%|██████████████████████████████████████████████████████████████████████████████| 455k/455k [02:45<00:00, 2.75kKB/s]


English training dataset size: 392702
                                                text       label
0  (Conceptually cream skimming has two basic dim...     neutral
1  (you know during the season and i guess at at ...  entailment
2  (One of our number will carry out your instruc...  entailment
3  (How do you know ? All this is their informati...  entailment
4  (yeah i tell you what though if you go price s...     neutral


In [18]:
train_df.head()


Unnamed: 0,text,label
0,(Conceptually cream skimming has two basic dim...,neutral
1,(you know during the season and i guess at at ...,entailment
2,(One of our number will carry out your instruc...,entailment
3,(How do you know ? All this is their informati...,entailment
4,(yeah i tell you what though if you go price s...,neutral


In [4]:
from utils_nlp.bert.common import Language, Tokenizer
from sklearn.preprocessing import LabelEncoder

# model configurations
LANGUAGE = Language.ENGLISH
TO_LOWER_CASE = True
MAX_SEQ_LENGTH = 128

# data configs
TEXT_COL = "text"
LABEL_COL = "label"

print("Create a tokenizer...")
tokenizer= Tokenizer(language=LANGUAGE, to_lower=TO_LOWER_CASE, cache_dir=CACHE_DIR)
train_tokens = tokenizer.tokenize(train_df[TEXT_COL])

print("Tokenize and preprocess text...")
#tokenize
train_token_ids, train_input_mask, train_token_type_ids = \
tokenizer.preprocess_classification_tokens(train_tokens, max_len=MAX_SEQ_LENGTH)

#preprocess
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df[LABEL_COL])
#num_labels = len(np.unique(train_labels))

Create a tokenizer...


100%|██████████████████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 1616071.48B/s]
100%|████████████████████████████████████████████████████████████████████████████| 3928/3928 [00:01<00:00, 1986.68it/s]


Tokenize and preprocess text...


In [5]:
len(train_token_ids)

3928

In [6]:
len(train_input_mask[0])

128

In [7]:
len(train_token_type_ids[0])

128

In [42]:
import numpy as np
#len(train_labels)
print(train_labels[0])

2


In [11]:
project_dir = "./entailment_aml"
if DEBUG and os.path.exists(project_dir): 
    shutil.rmtree(project_dir) 
shutil.copytree("../../utils_nlp", os.path.join(project_dir, "utils_nlp"))

'./entailment_aml\\utils_nlp'

In [15]:
datastore_name = "workspacefilestore"
ds = ws.datastores[datastore_name]

# Upload files
ds.upload(src_dir="../../temp", target_path="./entailment_aml", overwrite=True, show_progress=True)


Uploading ../../temp\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
Uploading ../../temp\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.json
Uploading ../../temp\XNLI-MT-1.0.zip
Uploading ../../temp\XNLI-MT-1.0\.DS_Store
Uploaded ../../temp\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.json, 1 files out of an estimated total of 42
Uploading ../../temp\XNLI-MT-1.0\multinli\multinli.train.ar.tsv
Uploading ../../temp\XNLI-MT-1.0\multinli\multinli.train.bg.tsv
Uploading ../../temp\XNLI-MT-1.0\multinli\multinli.train.de.tsv
Uploading ../../temp\XNLI-MT-1.0\multinli\multinli.train.el.tsv
Uploaded ../../temp\XNLI-MT-1.0\.DS_Store, 2 files out of an estimated total of 42
Uploading ../../temp\XNLI-MT-1.0\multinli\multinli.train.en.tsv
Uploading ../.



Uploading ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.bg.tsv
Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\._.DS_Store, 6 files out of an estimated total of 42
Uploading ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.de.tsv




Uploading ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.el.tsv




Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\._multinli, 7 files out of an estimated total of 42
Uploading ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.es.tsv
Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.de.tsv, 8 files out of an estimated total of 42
Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\._xnli, 9 files out of an estimated total of 42




Uploading ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.fr.tsv




Uploading ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.hi.tsv
Uploading ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.ru.tsv
Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.fr.tsv, 10 files out of an estimated total of 42
Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.ar.tsv, 11 files out of an estimated total of 42
Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.hi.tsv, 12 files out of an estimated total of 42
Uploading ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.sw.tsv




Uploaded ../../temp\XNLI-MT-1.0\multinli\multinli.train.fr.tsv, 13 files out of an estimated total of 42
Uploaded ../../temp\XNLI-MT-1.0\multinli\multinli.train.en.tsv, 14 files out of an estimated total of 42
Uploading ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.tr.tsv
Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.ru.tsv, 15 files out of an estimated total of 42
Uploaded ../../temp\XNLI-MT-1.0\multinli\multinli.train.de.tsv, 16 files out of an estimated total of 42




Uploading ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.ur.tsv
Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.sw.tsv, 17 files out of an estimated total of 42
Uploaded ../../temp\XNLI-MT-1.0\multinli\multinli.train.hi.tsv, 18 files out of an estimated total of 42
Uploaded ../../temp\XNLI-MT-1.0\multinli\multinli.train.el.tsv, 19 files out of an estimated total of 42




Uploading ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.vi.tsv
Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.tr.tsv, 20 files out of an estimated total of 42
Uploading ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.zh.tsv
Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.ur.tsv, 21 files out of an estimated total of 42




Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.bg.tsv, 22 files out of an estimated total of 42
Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.vi.tsv, 23 files out of an estimated total of 42
Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.zh.tsv, 24 files out of an estimated total of 42




Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.es.tsv, 25 files out of an estimated total of 42
Uploading ../../temp\__MACOSX\XNLI-MT-1.0\xnli\._.DS_Store
Uploaded ../../temp\XNLI-MT-1.0\multinli\multinli.train.zh.tsv, 26 files out of an estimated total of 42
Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\multinli\._multinli.train.el.tsv, 27 files out of an estimated total of 42
Uploaded ../../temp\__MACOSX\XNLI-MT-1.0\xnli\._.DS_Store, 28 files out of an estimated total of 42
Uploaded ../../temp\XNLI-MT-1.0\xnli\xnli.dev.en.jsonl, 29 files out of an estimated total of 42
Uploaded ../../temp\XNLI-MT-1.0\multinli\multinli.train.tr.tsv, 30 files out of an estimated total of 42
Uploaded ../../temp\XNLI-MT-1.0\multinli\multinli.train.vi.tsv, 31 files out of an estimated total of 42
Uploaded ../../temp\XNLI-MT-1.0\multinli\multinli.train.ar.tsv, 32 files out of an estimated total of 42
Uploaded ../../temp\XNLI-MT-1.0\multinli\multinli.train.es.tsv, 33 files out of an estimated

$AZUREML_DATAREFERENCE_cdd246243450416782214083cc48844e

In [None]:
%%writefile $project_dir/train.py

from utils_nlp.bert.sequence_classification import BERTSequenceClassifier
from utils_nlp.bert.common import Language, Tokenizer


from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

import numpy as np
import torch
import horovod.torch as hvd
import numpy as np
import os
import shutil
import argparse

# set random seeds
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)

# model configurations
LANGUAGE = Language.ENGLISH
TO_LOWER_CASE = True
MAX_SEQ_LENGTH = 128

# training configurations
NUM_GPUS = 1
BATCH_SIZE = 16 #training batchsize
NUM_EPOCHS = 1  # just for debugging! Hong uses 5 in her example

# optimizer configurations
LEARNING_RATE= 5e-5
WARMUP_PROPORTION= 0.1


# model configurations
LANGUAGE = Language.ENGLISH
TO_LOWER_CASE = True
MAX_SEQ_LENGTH = 128

# data configs
TEXT_COL = "text"
LABEL_COL = "label"

parser = argparse.ArgumentParser()
parser.add_argument('--data_folder', type=str, help='Folder where data is stored')
args = parser.parse_args()
xnli_folder = os.path.join(args.data_folder, "entailment_aml")
train_file =  os.path.join(xnli_folder,"/")

print("Create a tokenizer...")
tokenizer= Tokenizer(language=LANGUAGE, to_lower=TO_LOWER_CASE, cache_dir=CACHE_DIR)
train_tokens = tokenizer.tokenize(train_df[TEXT_COL])

print("Tokenize and preprocess text...")
#tokenize
train_token_ids, train_input_mask, train_token_type_ids = \
tokenizer.preprocess_classification_tokens(train_tokens, max_len=MAX_SEQ_LENGTH)

#preprocess
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df[LABEL_COL])
num_labels = len(np.unique(train_labels))

#for training split data on workers

print("Create classifier...")
classifier = BERTSequenceClassifier(language=LANGUAGE,
                                    num_labels=num_labels,
                                    cache_dir=CACHE_DIR)
        
print("Finetune classifier...")

classifier.fit(token_ids=train_token_ids,
               input_mask=train_input_mask,
               token_type_ids=train_token_type_ids,
               labels=train_labels_tensor,
               num_gpus=NUM_GPUS,
               num_epochs=NUM_EPOCHS,
               batch_size=BATCH_SIZE,
               lr=LEARNING_RATE,
               warmup_proportion=WARMUP_PROPORTION)


    

In [None]:
script_params = {
    "--data_folder": ds.as_mount(),
}


In [None]:
est = PyTorch(
    source_directory=project_dir,
    compute_target=compute_target,
    script_params=script_params,
    entry_script="train.py",
    node_count=2,
    distributed_training=MpiConfiguration(),
    use_gpu=True,
    framework_version="1.0",
    conda_packages=["scikit-learn=0.20.3", "numpy", "spacy", "nltk"],
    pip_packages=["pandas", "pytorch-pretrained-bert"],
)

In [None]:

experiment = Experiment(ws, name="entail-bert-xnli")
run = experiment.submit(est)

In [None]:
RunDetails(run).show()

In [None]:
run.cancel()

### Evaluate

### Predict and Evaluate

In [None]:
predictions_hindi = classifier_multi.predict(token_ids=test_token_ids_hindi,
                                             input_mask=test_input_mask_hindi,
                                             token_type_ids=test_token_type_ids_hindi,
                                             batch_size=BATCH_SIZE)
print("Prediction time : {:.3f} hrs".format(t.interval / 3600))
predictions_hindi= label_encoder_hindi.inverse_transform(predictions_hindi)
print(classification_report(test_df_hindi[LABEL_COL], predictions_hindi))