*Copyright (c) Microsoft Corporation. All rights reserved.*

*Licensed under the MIT License.*

# Text Classification of MultiNLI Sentences using PyTorch Transformers

In [1]:
# Import packages
import os
import sys
import json 
import pandas as pd
import numpy as np
import torch

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

from utils_nlp.dataset.multinli import load_pandas_df
from utils_nlp.models.transformers.sequence_classification import Processor, SequenceClassifier 
from utils_nlp.common.timer import Timer

I1001 17:16:49.833771 139876639942464 file_utils.py:39] PyTorch version 1.1.0 available.
I1001 17:16:49.869802 139876639942464 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [2]:
# Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs
QUICK_RUN = True

## Introduction 
## [TODO] - Modify for the final model
This notebook fine-tunes and evaluates a pretrained [XLNet](https://arxiv.org/pdf/1906.08237.pdf) model on a subset of the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset.

We use a [sequence classifier](../../utils_nlp/models/bert/sequence_classification.py) that wraps [Hugging Face's PyTorch implementation](https://github.com/huggingface/pytorch-pretrained-BERT) of Google's [BERT](https://github.com/google-research/bert).

In [3]:
TRAIN_DATA_FRACTION = 1
TEST_DATA_FRACTION = 1
NUM_EPOCHS = 3

if QUICK_RUN:
    TRAIN_DATA_FRACTION = 0.01
    TEST_DATA_FRACTION = 0.01
    NUM_EPOCHS = 1

BATCH_SIZE = 32 if torch.cuda.is_available() else 8
DATA_FOLDER = "./temp"
MODEL_CACHE_DIR = "./temp"
TO_LOWER = True
MAX_LEN = 150
BATCH_SIZE_PRED = 512
TRAIN_SIZE = 0.6
LABEL_COL = "genre"
TEXT_COL = "sentence1" 
TARGET_MODEL = "roberta-base"

### [TODO] - Remove Workflow overview

```
model_name = SequenceClassifier.list_supported_models()[0]
num_labels = len(label_encoder.classes_)
processor = Processor(model_name=model_name, cache_dir=temp_dir)
ds = processor.preprocess(text_train, labels_train, max_len=max_len)
classifier = SequenceClassifier(
    model_name=model_name, num_labels=num_labels, cache_dir=temp_dir
)
classifier.fit(ds, device="cuda", num_epochs=1, batch_size=32, num_gpus=None)
```

## Read Dataset

Let's start by loading a subset of the data.  

The following function downloads and extracts the files, if they don't already exists in the data folder.

The MultiNLI dataset is mainly used for natural language inference (NLI) tasks, where the inputs are sentence pairs and the labels are entailment indicators. The sentence pairs are also classified into *genres* that allow for more coverage and better evaluation of NLI models.

In [4]:
df = load_pandas_df(DATA_FOLDER, "train")

## Quick Analysis of Data  

Let's observe our dataset to see what we are working with.  
For our classification task, we use the first sentence only as the text input, and the corresponding genre as the label. We select the examples corresponding to one of the entailment labels (*neutral* in this case) to avoid duplicate rows, as the sentences are not unique, whereas the sentence pairs are.

In [5]:
df = df[df["gold_label"] == "neutral"] # Get unique sentences
df[[LABEL_COL, TEXT_COL]].head()

Unnamed: 0,genre,sentence1
0,government,Conceptually cream skimming has two basic dime...
4,telephone,yeah i tell you what though if you go price so...
6,travel,But a few Christian mosaics survive above the ...
12,slate,It's not that the questions they asked weren't...
13,travel,"Thebes held onto power until the 12th Dynasty,..."


The examples in the dataset, shown below, are grouped into 5 genres

### Train/Test Data Split 
Using SKlearns (model selection library), split the MNLI Dataset into training and testing. Based on the setting of the `QUICK_RUN` flag, we'll be sampling a fraction of the data for our model

In [7]:
df_train, df_test = train_test_split(df, train_size = TRAIN_SIZE, random_state = 0)
df_train = df_train.sample(frac=TRAIN_DATA_FRACTION).reset_index(drop=True)
df_test = df_test.sample(frac=TEST_DATA_FRACTION).reset_index(drop=True)
train_text = df_train[TEXT_COL]
test_text = df_test[TEXT_COL]



In [None]:
df[LABEL_COL].value_counts()

### Encode the labels into numeric values
Label Encoder makes it easy to encode dataset labels, categorical features into numerical values, between `0` and `n_classes - 1`; where `n` is the number of distinct labels

In [8]:
# encode labels
label_encoder = LabelEncoder()
train_data_labels = label_encoder.fit_transform(df_train[LABEL_COL])
test_data_labels = label_encoder.fit_transform(df_test[LABEL_COL])

# Count unique encoded labels
num_labels = len(np.unique(train_data_labels))

In [9]:
print(f"Number of unique labels: {num_labels}")
print(f"Number of training examples: {df_train.shape[0]}")
print(f"Number of testing examples: {df_test.shape[0]}")

Number of unique labels: 5
Number of training examples: 785
Number of testing examples: 524


In [10]:
# model_name = SequenceClassifier.list_supported_models()[0]
# num_labels = len(label_encoder.classes_)
# processor = Processor(model_name=model_name, cache_dir=temp_dir)
# ds = processor.preprocess(text_train, labels_train, max_len=max_len)
# classifier = SequenceClassifier(
#     model_name=model_name, num_labels=num_labels, cache_dir=temp_dir
# )
# classifier.fit(ds, device="cuda", num_epochs=1, batch_size=32, num_gpus=None)
# SequenceClassifier.list_supported_models()

### Preprocess Data For Training.  

Before training a model, the text document needs to be tokenized and converted to a list of tokens. Do the following steps to:  
1. Create a PyTorch Processor - Prepare and Tokenize data  
1. Initialize a RoBERTa PyTorch Transformer Processor 
1. Create a Dataset using the initialized processor  
1. Initialize a Sequence Classifier
1. Fit the newly created classifier model

In [11]:
supported_models = SequenceClassifier.list_supported_models()
assert TARGET_MODEL in supported_models, f"Unfortunately {TARGET_MODEL} is not currently supported"
processor = Processor(model_name=TARGET_MODEL, cache_dir=MODEL_CACHE_DIR)
train_dataset = processor.preprocess(text=train_text, labels=train_data_labels, max_len=MAX_LEN)
test_dataset = processor.preprocess(text=test_text, labels=test_data_labels, max_len=MAX_LEN)

I1001 17:17:09.713756 139876639942464 tokenization_utils.py:373] loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json from cache at ./temp/d0c5776499adc1ded22493fae699da0971c1ee4c2587111707a4d177d20257a2.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
I1001 17:17:09.714879 139876639942464 tokenization_utils.py:373] loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt from cache at ./temp/b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda


### Create Model

Now, we will create a sequence classifier that loads a pre-trained RoBERTa model and the number of labels

In [12]:
classifier = SequenceClassifier(model_name=TARGET_MODEL, num_labels=num_labels, cache_dir=MODEL_CACHE_DIR)

I1001 17:17:10.692902 139876639942464 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json from cache at ./temp/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.9dad9043216064080cf9dd3711c53c0f11fe2b09313eaa66931057b4bdcaf068
I1001 17:17:10.694562 139876639942464 configuration_utils.py:168] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 5,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "use_bfloat16": false,
  "vocab_size": 50265
}

I1001 17:17:11.002182 139876639942464 modeling_utils.py:337] loading weights fi

### Train Model

We train the classifier  using the training examples from MNLI. This involves fine-tunning the transformer and a linear classification layer on top of that

In [13]:
with Timer() as t:
    classifier.fit(train_dataset)
print("[Training time: {:.3f} hrs]".format(t.interval / 3600))

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
                                            
Epoch:   0%|          | 0/1 [00:02<?, ?it/s]     
Iteration:   0%|          | 0/50 [00:02<?, ?it/s][A

Loss:1.652686



Iteration:   2%|▏         | 1/50 [00:02<02:23,  2.92s/it][A
Iteration:   4%|▍         | 2/50 [00:03<01:49,  2.28s/it][A
Iteration:   6%|▌         | 3/50 [00:04<01:27,  1.85s/it][A
Iteration:   8%|▊         | 4/50 [00:05<01:11,  1.56s/it][A
Iteration:  10%|█         | 5/50 [00:06<01:00,  1.35s/it][A
Iteration:  12%|█▏        | 6/50 [00:07<00:53,  1.21s/it][A
Iteration:  14%|█▍        | 7/50 [00:08<00:47,  1.11s/it][A
Iteration:  16%|█▌        | 8/50 [00:08<00:43,  1.04s/it][A
Iteration:  18%|█▊        | 9/50 [00:09<00:40,  1.01it/s][A
                                            38,  1.05it/s][A
Epoch:   0%|          | 0/1 [00:11<?, ?it/s]              
Iteration:  20%|██        | 10/50 [00:11<00:38,  1.05it/s][A

Loss:1.381926



Iteration:  22%|██▏       | 11/50 [00:11<00:36,  1.08it/s][A
Iteration:  24%|██▍       | 12/50 [00:12<00:34,  1.10it/s][A
Iteration:  26%|██▌       | 13/50 [00:13<00:33,  1.12it/s][A
Iteration:  28%|██▊       | 14/50 [00:14<00:31,  1.13it/s][A
Iteration:  30%|███       | 15/50 [00:14<00:30,  1.13it/s][A
Iteration:  32%|███▏      | 16/50 [00:15<00:29,  1.14it/s][A
Iteration:  34%|███▍      | 17/50 [00:16<00:29,  1.14it/s][A
Iteration:  36%|███▌      | 18/50 [00:17<00:28,  1.14it/s][A
Iteration:  38%|███▊      | 19/50 [00:18<00:27,  1.14it/s][A
                                            26,  1.14it/s][A
Epoch:   0%|          | 0/1 [00:19<?, ?it/s]              
Iteration:  40%|████      | 20/50 [00:19<00:26,  1.14it/s][A

Loss:1.561228



Iteration:  42%|████▏     | 21/50 [00:20<00:25,  1.14it/s][A
Iteration:  44%|████▍     | 22/50 [00:21<00:24,  1.14it/s][A
Iteration:  46%|████▌     | 23/50 [00:21<00:23,  1.14it/s][A
Iteration:  48%|████▊     | 24/50 [00:22<00:22,  1.14it/s][A
Iteration:  50%|█████     | 25/50 [00:23<00:21,  1.14it/s][A
Iteration:  52%|█████▏    | 26/50 [00:24<00:20,  1.15it/s][A
Iteration:  54%|█████▍    | 27/50 [00:25<00:20,  1.14it/s][A
Iteration:  56%|█████▌    | 28/50 [00:26<00:19,  1.15it/s][A
Iteration:  58%|█████▊    | 29/50 [00:27<00:18,  1.15it/s][A
                                            17,  1.15it/s][A
Epoch:   0%|          | 0/1 [00:28<?, ?it/s]              
Iteration:  60%|██████    | 30/50 [00:28<00:17,  1.15it/s][A

Loss:1.158241



Iteration:  62%|██████▏   | 31/50 [00:28<00:16,  1.15it/s][A
Iteration:  64%|██████▍   | 32/50 [00:29<00:15,  1.15it/s][A
Iteration:  66%|██████▌   | 33/50 [00:30<00:14,  1.15it/s][A
Iteration:  68%|██████▊   | 34/50 [00:31<00:13,  1.15it/s][A
Iteration:  70%|███████   | 35/50 [00:32<00:13,  1.14it/s][A
Iteration:  72%|███████▏  | 36/50 [00:33<00:12,  1.15it/s][A
Iteration:  74%|███████▍  | 37/50 [00:34<00:11,  1.15it/s][A
Iteration:  76%|███████▌  | 38/50 [00:35<00:10,  1.14it/s][A
Iteration:  78%|███████▊  | 39/50 [00:35<00:09,  1.15it/s][A
                                            08,  1.15it/s][A
Epoch:   0%|          | 0/1 [00:37<?, ?it/s]              
Iteration:  80%|████████  | 40/50 [00:37<00:08,  1.15it/s][A

Loss:0.924342



Iteration:  82%|████████▏ | 41/50 [00:37<00:07,  1.15it/s][A
Iteration:  84%|████████▍ | 42/50 [00:38<00:06,  1.15it/s][A
Iteration:  86%|████████▌ | 43/50 [00:39<00:06,  1.14it/s][A
Iteration:  88%|████████▊ | 44/50 [00:40<00:05,  1.14it/s][A
Iteration:  90%|█████████ | 45/50 [00:41<00:04,  1.14it/s][A
Iteration:  92%|█████████▏| 46/50 [00:42<00:03,  1.14it/s][A
Iteration:  94%|█████████▍| 47/50 [00:42<00:02,  1.15it/s][A
Iteration:  96%|█████████▌| 48/50 [00:43<00:01,  1.15it/s][A
Iteration:  98%|█████████▊| 49/50 [00:44<00:00,  1.15it/s][A
Epoch: 100%|██████████| 1/1 [00:45<00:00, 45.26s/it]7it/s][A

[Training time: 0.013 hrs]





### Score

We score the test set against the trained sequence classifier

In [14]:
preds = classifier.predict(test_dataset, device="cuda")

Evaluating: 100%|██████████| 33/33 [00:09<00:00,  3.72it/s]


### Evaluate Results

In [15]:
report = classification_report(test_data_labels, preds, target_names=label_encoder.classes_, output_dict=True) 
accuracy = accuracy_score(test_data_labels, preds )
print("accuracy: {}".format(accuracy))
print(json.dumps(report, indent=4, sort_keys=True))

accuracy: 0.6793893129770993
{
    "fiction": {
        "f1-score": 0.7094017094017093,
        "precision": 0.6335877862595419,
        "recall": 0.8058252427184466,
        "support": 103
    },
    "government": {
        "f1-score": 0.6229508196721312,
        "precision": 0.4797979797979798,
        "recall": 0.8878504672897196,
        "support": 107
    },
    "macro avg": {
        "f1-score": 0.6578039596633319,
        "precision": 0.7356734902078413,
        "recall": 0.6867853156090762,
        "support": 524
    },
    "micro avg": {
        "f1-score": 0.6793893129770993,
        "precision": 0.6793893129770993,
        "recall": 0.6793893129770993,
        "support": 524
    },
    "slate": {
        "f1-score": 0.24460431654676257,
        "precision": 0.68,
        "recall": 0.14912280701754385,
        "support": 114
    },
    "telephone": {
        "f1-score": 0.957345971563981,
        "precision": 0.9619047619047619,
        "recall": 0.9528301886792453,
        "