*Copyright (c) Microsoft Corporation. All rights reserved.*

*Licensed under the MIT License.*

# Text Classification of RTE Sentences using MT-DNN

In [1]:
# Load IPython's autoreload extension and select mode 2, so that edited
# modules are re-imported before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import torch
sys.path.append("../../")

from utils_nlp.bert.common import Language
from utils_nlp.dataset.glue import download_glue_data, load_pandas_df, build_data
from utils_nlp.mtdnn.data_utils import label_map
from utils_nlp.mtdnn.sequence_classification import MTDNNSequenceClassifier

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


## Introduction
In this notebook, we fine-tune and evaluate a pretrained [MT-DNN](https://arxiv.org/abs/1901.11504) (Multi-Task Deep Neural Network) model on the [RTE](https://www.nyu.edu/projects/bowman/glue.pdf) (Recognizing Textual Entailment) dataset from the GLUE benchmark.

In [3]:
# Language passed to the MT-DNN classifier when it is constructed.
LANGUAGE = Language.ENGLISH

# Folder that receives the downloaded GLUE data and the generated JSON files.
DATA_FOLDER = "../../tmp"

# Folder handed to the classifier as its `cache_dir`.
BERT_CACHE_DIR = "../../tmp"

## Read Dataset
We start by loading a subset of the GLUE data: the RTE task. The first cell below downloads and extracts the files if they don't already exist in the data folder; the following cells load the train, dev and test splits as pandas DataFrames.

In [None]:
# Fetch the RTE task of GLUE into the shared data folder.
download_glue_data(
    tasks="RTE",
    dest_path=DATA_FOLDER,
)

In [None]:
# Load the RTE training split from the data folder.
rte_train_data = load_pandas_df(
    task="RTE",
    file_split="train",
    local_cache_path=DATA_FOLDER,
)

In [None]:
# Preview the first rows of the training split.
rte_train_data.head()

In [None]:
# Load the dev and test splits with the same settings; order matters for the
# unpacking below ("dev" first, then "test").
rte_dev_data, rte_test_data = [
    load_pandas_df(local_cache_path=DATA_FOLDER, task="RTE", file_split=split_name)
    for split_name in ("dev", "test")
]

## Tokenize and Preprocess

Before training, we tokenize the text and convert it into lists of tokens. The following steps look up the label map for RTE and pass it, together with the loaded data, to `build_data`, which writes the preprocessed examples to a JSON file in the `RTE` data folder.

In [None]:
# Label map for the RTE task, looked up in MT-DNN's global table.
label_dict = label_map.GLOBAL_MAP["rte"]

# Output paths for the preprocessed splits. The training cell further down
# reads `rte_train.json` from this folder, so every split has to be built
# here for the notebook to run end-to-end on a fresh environment.
rte_train_fout = os.path.join(DATA_FOLDER, "RTE", "rte_train.json")
rte_dev_fout = os.path.join(DATA_FOLDER, "RTE", "rte_dev.json")
rte_test_fout = os.path.join(DATA_FOLDER, "RTE", "rte_test.json")

# Convert each DataFrame into its JSON file.
build_data(rte_train_data, rte_train_fout, label_dict)
build_data(rte_dev_data, rte_dev_fout, label_dict)
build_data(rte_test_data, rte_test_fout, label_dict)

In [None]:
# Show the value the label map assigns to the "entailment" label.
label_dict["entailment"]

## Create Model
Next, we create a sequence classifier that loads a pre-trained MT-DNN model, given the language and number of labels.

In [None]:
import subprocess

# Location of the pretrained MT-DNN checkpoint and where to store it locally.
cache_dir = "."
MODEL_FILE = "mt_dnn_large.pt"
MTDNN_URL = "https://mrc.blob.core.windows.net/mt-dnn-model/" + MODEL_FILE
model_path = os.path.join(cache_dir, MODEL_FILE)

# The checkpoint is large, so only download it when it is not cached yet.
if not os.path.exists(model_path):
    # Download to a temporary name first: `wget -O` creates the output file
    # even when the transfer fails, and a truncated checkpoint must never be
    # mistaken for a complete one on the next run.
    partial_path = model_path + ".part"
    # Pass the arguments as a list (no whitespace splitting of paths) and
    # raise CalledProcessError if wget exits with a non-zero status.
    subprocess.run(["wget", MTDNN_URL, "-O", partial_path], check=True)
    os.replace(partial_path, model_path)

In [None]:
# Load the downloaded checkpoint for inspection. `map_location="cpu"` lets
# the load succeed on machines without a GPU even if the tensors were saved
# from a CUDA device.
# NOTE(review): torch.load unpickles the file, so only load checkpoints
# obtained from a trusted source.
model = torch.load(model_path, map_location="cpu")

In [None]:
# Show only the top-level keys of the checkpoint; displaying every item
# would dump all of its contents into the cell output.
list(model.keys())

In [4]:
# Build the MT-DNN sequence classifier with two output labels, using the
# configured language and cache folder.
classifier = MTDNNSequenceClassifier(
    cache_dir=BERT_CACHE_DIR,
    num_labels=2,
    language=LANGUAGE,
)

In [15]:
# Folder holding the preprocessed RTE JSON files read during training.
rte_data_dir = os.path.join(DATA_FOLDER, "RTE")

# Positional arguments: data folder, "rte", and "0,1,2,3".
# NOTE(review): "rte" looks like the task name and "0,1,2,3" like a list of
# GPU ids — confirm against MTDNNSequenceClassifier.fit.
classifier.fit(rte_data_dir, "rte", "0,1,2,3")

06/11/2019 04:14:10 0
06/11/2019 04:14:10 0
06/11/2019 04:14:10 0
06/11/2019 04:14:10 0
06/11/2019 04:14:10 0
06/11/2019 04:14:10 0
06/11/2019 04:14:10 0
06/11/2019 04:14:10 Launching the MT-DNN training
06/11/2019 04:14:10 Launching the MT-DNN training
06/11/2019 04:14:10 Launching the MT-DNN training
06/11/2019 04:14:10 Launching the MT-DNN training
06/11/2019 04:14:10 Launching the MT-DNN training
06/11/2019 04:14:10 Launching the MT-DNN training
06/11/2019 04:14:10 Launching the MT-DNN training
06/11/2019 04:14:10 Loading ../../tmp/RTE/rte_train.json as task 0
06/11/2019 04:14:10 Loading ../../tmp/RTE/rte_train.json as task 0
06/11/2019 04:14:10 Loading ../../tmp/RTE/rte_train.json as task 0
06/11/2019 04:14:10 Loading ../../tmp/RTE/rte_train.json as task 0
06/11/2019 04:14:10 Loading ../../tmp/RTE/rte_train.json as task 0
06/11/2019 04:14:10 Loading ../../tmp/RTE/rte_train.json as task 0
06/11/2019 04:14:10 Loading ../../tmp/RTE/rte_train.json as task 0
Loaded 2489 samples out of 