*Copyright (c) Microsoft Corporation. All rights reserved.*

*Licensed under the MIT License.*

# Text Classification of MultiNLI Sentences using XLNet

In [1]:
# Load IPython's autoreload extension and enable mode 2 so that edits to
# imported modules (e.g. the local utils_nlp package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("../../")
import os
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from utils_nlp.dataset.multinli import load_pandas_df
from utils_nlp.eval.classification import eval_classification
from utils_nlp.common.timer import Timer
from utils_nlp.models.xlnet.common import Language, Tokenizer
from utils_nlp.models.xlnet.sequence_classification import XLNetSequenceClassifier
from utils_nlp.models.xlnet.utils import generate_confusion_matrix
from utils_nlp.models.xlnet.common import log_xlnet_params
import mlflow
import datetime

## Introduction
In this notebook, we fine-tune and evaluate a pretrained [XLNet](https://arxiv.org/abs/1906.08237) model on a subset of the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) dataset.

We use a [sequence classifier](../../utils_nlp/models/xlnet/sequence_classification.py) that wraps [Hugging Face's PyTorch implementation](https://github.com/huggingface/pytorch-transformers) of CMU and Google's [XLNet](https://github.com/zihangdai/xlnet).

In [3]:
# --- Paths -------------------------------------------------------------------
DATA_FOLDER = "../../../temp"
XLNET_CACHE_DIR = "../../../temp"

# --- Fixed experiment settings -----------------------------------------------
LANGUAGE = Language.ENGLISHCASED
BATCH_SIZE = 4
NUM_GPUS = 1
NUM_EPOCHS = 1
TRAIN_SIZE = 0.6
LABEL_COL = "genre"
TEXT_COL = "sentence1"
WEIGHT_DECAY = 0.0
WARMUP_STEPS = 0

### Hyperparameters to tune
# MAX_SEQ_LENGTH is defined once, here, instead of twice in the same cell.
MAX_SEQ_LENGTH = 128
LEARNING_RATE = 5e-5
ADAM_EPSILON = 1e-8

# --- Run control -------------------------------------------------------------
DEBUG = True
LOGGING_STEPS = 10
SAVE_STEPS = 10

# Seed the random number generators used in this notebook so that the debug
# subsample and the train/test split are repeatable across "Restart & Run All".
# NOTE(review): the classifier may do its own seeding internally - confirm.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Re-running this cell while a run is still active would make start_run raise,
# so close any run left open by a previous execution first.
if mlflow.active_run() is not None:
    mlflow.end_run()

# run_name is expected to be a string; pass the timestamp as text rather than
# as a datetime object.
mlflow.start_run(run_name=str(datetime.datetime.now()))
log_xlnet_params(locals())

## Read Dataset
We start by loading a subset of the data. The following function also downloads and extracts the files, if they don't exist in the data folder.

The MultiNLI dataset is mainly used for natural language inference (NLI) tasks, where the inputs are sentence pairs and the labels are entailment indicators. The sentence pairs are also classified into *genres* that allow for more coverage and better evaluation of NLI models.

For our classification task, we use the first sentence only as the text input, and the corresponding genre as the label. We select the examples corresponding to one of the entailment labels (*neutral* in this case) to avoid duplicate rows, as the sentences are not unique, whereas the sentence pairs are.

In [4]:
# Load the MultiNLI training split (load_pandas_df is given the data folder and
# the split name) and keep only the rows labelled "neutral".
df = load_pandas_df(DATA_FOLDER, "train")
df = df[df["gold_label"] == "neutral"]  # get unique sentences

if DEBUG:
    # Work on a small random subset for quick iteration. The sample size is
    # capped at the number of available rows because random.sample raises
    # ValueError when asked for more items than the population contains.
    debug_size = min(1000, len(df.index))
    inds = random.sample(range(len(df.index)), debug_size)
    df = df.iloc[inds]

In [5]:
# Preview the first five rows of the filtered dataset.
df.head(n=5)

Unnamed: 0,annotator_labels,genre,gold_label,pairID,promptID,sentence1,sentence1_binary_parse,sentence1_parse,sentence2,sentence2_binary_parse,sentence2_parse
155920,[neutral],slate,neutral,124353n,124353,"For my part, I plan to endorse the Baptist boy...","( ( For ( my part ) ) ( , ( I ( ( plan ( to ( ...",(ROOT (S (PP (IN For) (NP (PRP$ my) (NN part))...,Boycotting will send a message.,( Boycotting ( ( will ( send ( a message ) ) )...,(ROOT (S (NP (NNP Boycotting)) (VP (MD will) (...
211456,[neutral],government,neutral,106327n,106327,"CSIS Policy Summit on Global Aging, Washington...",( ( CSIS Policy ) ( ( ( Summit ( on ( Global (...,(ROOT (S (NP (NNP CSIS) (NNP Policy)) (VP (VBD...,More than 20 guest speakers attended this summit.,( ( ( More ( than 20 ) ) ( guest speakers ) ) ...,(ROOT (S (NP (QP (JJR More) (IN than) (CD 20))...
293188,[neutral],slate,neutral,71440n,71440,The beers used in the experiment were as,( ( ( The beers ) ( used ( in ( the experiment...,(ROOT (S (NP (NP (DT The) (NNS beers)) (VP (VB...,The beers were part of the experiment conducte...,( ( The beers ) ( ( were ( part ( of ( ( the e...,(ROOT (S (NP (DT The) (NNS beers)) (VP (VBD we...
275458,[neutral],government,neutral,15842n,15842,Postal Service and the given post.,( ( ( ( Postal Service ) and ) ( the ( given p...,(ROOT (NP (NP (NP (NNP Postal) (NNP Service)) ...,the postal services is at an address,( ( the ( postal services ) ) ( is ( at ( an a...,(ROOT (S (NP (DT the) (JJ postal) (NNS service...
85808,[neutral],telephone,neutral,83172n,83172,and the new place doesn't have air conditionin...,( and ( ( the ( new place ) ) ( ( ( ( does n't...,(ROOT (FRAG (CC and) (NP (NP (DT the) (JJ new)...,Air conditioning costs a lot of money so not h...,( ( Air conditioning ) ( ( ( costs ( ( a lot )...,(ROOT (S (NP (NNP Air) (NNP conditioning)) (VP...


The examples in the dataset are grouped into 5 genres:

In [6]:
# Count how many examples fall into each genre (the classification label).
genre_counts = df[LABEL_COL].value_counts()
genre_counts

slate         223
government    206
telephone     202
travel        191
fiction       178
Name: genre, dtype: int64

We split the data for training and testing, and encode the class labels:

In [7]:
# split
df_train, df_test = train_test_split(df, train_size=TRAIN_SIZE)

# encode labels
# Fit the encoder on the labels of the full dataset rather than on the training
# split only: LabelEncoder.transform raises ValueError for labels it has not
# seen, which would happen if a genre ended up exclusively in the test split.
# When every genre is present in the training split the resulting encoding is
# identical to fitting on the training labels.
label_encoder = LabelEncoder()
label_encoder.fit(df[LABEL_COL])
labels_train = label_encoder.transform(df_train[LABEL_COL])
labels_test = label_encoder.transform(df_test[LABEL_COL])
label_list = label_encoder.classes_

# Size the classification head from the encoder's classes so it always covers
# every label id that transform() can produce.
num_labels = len(label_list)

In [8]:
# Report the number of classes and the size of each split.
split_summary = (
    ("Number of unique labels: {}", num_labels),
    ("Number of training examples: {}", len(df_train)),
    ("Number of testing examples: {}", len(df_test)),
)
for template, value in split_summary:
    print(template.format(value))

Number of unique labels: 5
Number of training examples: 600
Number of testing examples: 400


## Tokenize and Preprocess
Before training, we tokenize the text documents and convert them to lists of tokens. The following steps instantiate an XLNet tokenizer given the language, and tokenize the text of the training and testing sets.

We perform the following preprocessing steps in the cell below:
- Convert the tokens into token indices corresponding to the XLNet-base tokenizer's vocabulary
- Add the special tokens [CLS] and [SEP] to mark the end of a sentence
- Pad or truncate the token lists to the specified max length
- Return id lists that indicate which word the tokens map to
- Return mask lists that indicate paddings' positions
- Return segment type id lists that indicate which segment each token belongs to

*See the pytorch-transformers [implementation](https://github.com/huggingface/pytorch-transformers/blob/master/examples/utils_glue.py) for more information on XLNet's input format.*

In [9]:
# Build the tokenizer for the configured language, caching downloaded files.
tokenizer = Tokenizer(LANGUAGE, cache_dir=XLNET_CACHE_DIR)

# Pull the raw sentences out of each split as plain Python lists.
train_texts = list(df_train[TEXT_COL])
test_texts = list(df_test[TEXT_COL])

# Each call yields three parallel outputs: token ids, input mask, segment ids.
train_features = tokenizer.preprocess_classification_tokens(train_texts, MAX_SEQ_LENGTH)
train_input_ids, train_input_mask, train_segment_ids = train_features

test_features = tokenizer.preprocess_classification_tokens(test_texts, MAX_SEQ_LENGTH)
test_input_ids, test_input_mask, test_segment_ids = test_features

## Create Model
Next, we create a sequence classifier that loads a pre-trained XLNet model, given the language and number of labels.

In [10]:
# Gather the constructor settings in one place, then build the classifier.
classifier_settings = dict(
    language=LANGUAGE,
    num_labels=num_labels,
    cache_dir=XLNET_CACHE_DIR,
    num_gpus=NUM_GPUS,
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
)
classifier = XLNetSequenceClassifier(**classifier_settings)

## Train
We train the classifier using the training examples. This involves fine-tuning the XLNet Transformer and learning a linear classification layer on top of that:

In [11]:
# Arguments for fine-tuning: the tokenized training inputs, their encoded
# labels, and the logging/checkpoint step settings from the config cell.
fit_arguments = dict(
    token_ids=train_input_ids,
    input_mask=train_input_mask,
    token_type_ids=train_segment_ids,
    labels=labels_train,
    verbose=True,
    logging_steps=LOGGING_STEPS,
    save_steps=SAVE_STEPS,
)

# Time the fit call only.
with Timer() as t:
    classifier.fit(**fit_arguments)

# Timer.interval is divided by 3600 to report the duration in hours.
training_hours = t.interval / 3600
print("[Training time: {:.3f} hrs]".format(training_hours))

Iteration:   1%|          | 1/150 [00:00<01:09,  2.15it/s]

epoch:1/1; batch:1->16/150; average training loss:1.566105


Iteration:  11%|█▏        | 17/150 [00:11<01:07,  1.96it/s]

epoch:1/1; batch:17->32/150; average training loss:1.768401


Iteration:  22%|██▏       | 33/150 [00:24<01:53,  1.03it/s]

epoch:1/1; batch:33->48/150; average training loss:1.676888


Iteration:  33%|███▎      | 49/150 [00:32<00:42,  2.39it/s]

epoch:1/1; batch:49->64/150; average training loss:1.614675


Iteration:  43%|████▎     | 65/150 [00:53<01:16,  1.11it/s]

epoch:1/1; batch:65->80/150; average training loss:1.513738


Iteration:  54%|█████▍    | 81/150 [01:18<01:57,  1.71s/it]

epoch:1/1; batch:81->96/150; average training loss:1.464170


Iteration:  65%|██████▍   | 97/150 [01:25<00:22,  2.39it/s]

epoch:1/1; batch:97->112/150; average training loss:1.415893


Iteration:  66%|██████▌   | 99/150 [01:26<00:20,  2.47it/s]

KeyboardInterrupt: 

## Score
We score the test set using the trained classifier:

In [None]:
# Score the tokenized test inputs with the fine-tuned classifier.
# probabilities=False presumably requests predicted labels rather than
# per-class scores - verify against XLNetSequenceClassifier.predict.
predict_arguments = dict(
    token_ids=test_input_ids,
    input_mask=test_input_mask,
    token_type_ids=test_segment_ids,
    num_gpus=NUM_GPUS,
    batch_size=BATCH_SIZE,
    probabilities=False,
)
preds = classifier.predict(**predict_arguments)

## Evaluate Results
Finally, we compute the accuracy, precision, recall, and F1 metrics of the evaluation on the test set.

In [None]:
# Build the per-class metrics twice: once as a dict (for saving) and once as
# formatted text (for display in the notebook).
cls_report = classification_report(
    labels_test, preds, target_names=label_encoder.classes_, output_dict=True
)
print(classification_report(labels_test, preds, target_names=label_encoder.classes_))

# Resolve the report location once so the file that is written and the file
# that is logged cannot drift apart.
report_dir = os.path.join(os.getcwd(), "checkpoints")
report_path = os.path.join(report_dir, "cls_report.csv")

# to_csv does not create missing parent directories, so make sure the
# "checkpoints" folder exists before writing into it.
os.makedirs(report_dir, exist_ok=True)

cls_report_df = pd.DataFrame(cls_report)
cls_report_df.to_csv(path_or_buf=report_path)

# Attach the CSV to the MLflow run, then close the run.
mlflow.log_artifact(report_path)
mlflow.end_run()

### Generate Confusion Matrix using Seaborn

In [None]:
# Compare true and predicted test labels, using the encoder's class names.
class_names = label_encoder.classes_
generate_confusion_matrix(labels_test, preds, class_names)