*Copyright (c) Microsoft Corporation. All rights reserved.*

*Licensed under the MIT License.*

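# Text Classification of Yahoo Answers using BERT

This notebook fine-tunes a BERT sequence classifier on a subset of the Yahoo Answers dataset and reports the overall accuracy and per-class precision, recall, and F1 on a test subset.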


In [1]:
import sys
sys.path.append("../../")
import os
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import utils_nlp.dataset.yahoo_answers as ya_dataset
from utils_nlp.eval.classification import eval_classification
from utils_nlp.bert.sequence_classification import SequenceClassifier
from utils_nlp.bert.common import Language, Tokenizer
from utils_nlp.common.timer import Timer
import torch
import torch.nn as nn
import numpy as np

In [2]:
# --- Paths ---
# Folder the dataset is downloaded into; the CSV paths below are joined onto it.
DATA_FOLDER = "./temp"
TRAIN_FILE = "yahoo_answers_csv/train.csv"
TEST_FILE = "yahoo_answers_csv/test.csv"
# Cache directory passed to the BERT tokenizer and classifier.
BERT_CACHE_DIR = "./temp"

# --- Data size ---
NUM_ROWS_TRAIN = 50000  # number of training examples to read
NUM_ROWS_TEST = 20000  # number of test examples to read

# --- Model / training ---
MAX_LEN = 250  # max length that token lists are padded or truncated to
BATCH_SIZE = 16
NUM_EPOCHS = 1
NUM_GPUS = 2

## Download Dataset

In [3]:
# Make sure the data folder exists before downloading into it.
# makedirs(..., exist_ok=True) creates any missing parent directories and does
# not fail if the folder is already there, so this cell is safe to re-run and
# also works when DATA_FOLDER is a nested path.
os.makedirs(DATA_FOLDER, exist_ok=True)
ya_dataset.download(DATA_FOLDER)

## Read Dataset

In [4]:
# Training split: read up to NUM_ROWS_TRAIN examples, then extract labels and text.
df_train = ya_dataset.read_data(
    os.path.join(DATA_FOLDER, TRAIN_FILE),
    nrows=NUM_ROWS_TRAIN,
)
labels_train = ya_dataset.get_labels(df_train)
text_train = ya_dataset.get_text(df_train)

# Test split: same steps, limited to NUM_ROWS_TEST examples.
df_test = ya_dataset.read_data(
    os.path.join(DATA_FOLDER, TEST_FILE),
    nrows=NUM_ROWS_TEST,
)
labels_test = ya_dataset.get_labels(df_test)
text_test = ya_dataset.get_text(df_test)

# Number of classes = count of distinct labels seen in the training subset.
num_labels = len(np.unique(labels_train))

## Tokenize and Preprocess

Before training, we tokenize the text documents and convert them to lists of tokens. The following steps instantiate a BERT tokenizer given the language, and tokenize the text of the training and test sets.

In [5]:
# BERT tokenizer for English, with lower-casing enabled; files are cached
# under BERT_CACHE_DIR.
tokenizer = Tokenizer(
    Language.ENGLISH,
    to_lower=True,
    cache_dir=BERT_CACHE_DIR,
)

# Convert the raw text of each split into token lists.
tokens_train = tokenizer.tokenize(text_train)
tokens_test = tokenizer.tokenize(text_test)

In addition, we perform the following preprocessing steps in the cell below:
- Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary
- Add sentence markers
- Pad or truncate the token lists to the specified max length

*See the original [implementation](https://github.com/google-research/bert/blob/master/run_classifier.py) for more information on BERT's input format.*

In [6]:
# Turn the token lists into model inputs of length MAX_LEN, plus matching masks.
# NOTE(review): tokens_train / tokens_test are overwritten with the processed
# output, so re-running this cell alone would feed already-processed values
# back into the preprocessor; re-run the tokenize cell first.
tokens_train, mask_train = tokenizer.preprocess_classification_tokens(tokens_train, MAX_LEN)
tokens_test, mask_test = tokenizer.preprocess_classification_tokens(tokens_test, MAX_LEN)

## Create Model


In [7]:
# English BERT sequence classifier with one output per label found in the
# training subset.
classifier = SequenceClassifier(
    language=Language.ENGLISH,
    num_labels=num_labels,
    cache_dir=BERT_CACHE_DIR,
)

## Train

In [8]:
# Fit the classifier on the training split and time the whole run.
with Timer() as t:
    classifier.fit(
        token_ids=tokens_train,
        input_mask=mask_train,
        labels=labels_train,
        batch_size=BATCH_SIZE,
        num_epochs=NUM_EPOCHS,
        num_gpus=NUM_GPUS,
        verbose=True,
    )
# Reported in hours: t.interval / 3600 (assumes the interval is in seconds).
print("[Training time: {:.3f} hrs]".format(t.interval / 3600))

t_total value of -1 results in schedule not being applied


epoch:1/1; batch:1->313/3125; loss:2.469508
epoch:1/1; batch:314->626/3125; loss:1.179081
epoch:1/1; batch:627->939/3125; loss:0.677443
epoch:1/1; batch:940->1252/3125; loss:1.689727
epoch:1/1; batch:1253->1565/3125; loss:0.781167
epoch:1/1; batch:1566->1878/3125; loss:1.036024
epoch:1/1; batch:1879->2191/3125; loss:0.909294
epoch:1/1; batch:2192->2504/3125; loss:0.441344
epoch:1/1; batch:2505->2817/3125; loss:0.823389
epoch:1/1; batch:2818->3130/3125; loss:1.036229
[Training time: 1.132 hrs]


## Score Test Set

In [9]:
# Predict a label for every example in the test split.
preds = classifier.predict(
    token_ids=tokens_test,
    input_mask=mask_test,
    batch_size=BATCH_SIZE,
    num_gpus=NUM_GPUS,
)

100%|██████████| 20000/20000 [08:00<00:00, 41.85it/s]


## Evaluate Results
Finally, we compute the overall accuracy and the per-class precision, recall, and F1 scores on the test set.

In [10]:
# Per-class scores: average=None yields one value per label rather than a
# single aggregated number.
precision = precision_score(labels_test, preds, average=None)
recall = recall_score(labels_test, preds, average=None)
f1 = f1_score(labels_test, preds, average=None)

# Overall accuracy across all test examples.
accuracy = accuracy_score(labels_test, preds)
print("\n accuracy: {}".format(accuracy))

# Last expression of the cell: one row per class, rendered as a table.
pd.DataFrame(
    {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
)


 accuracy: 0.6564


Unnamed: 0,precision,recall,f1
0,0.592506,0.497053,0.540598
1,0.74907,0.673518,0.709288
2,0.789308,0.680955,0.731139
3,0.561592,0.440535,0.493752
4,0.854772,0.789272,0.820717
5,0.885998,0.847659,0.866404
6,0.42544,0.687416,0.525592
7,0.756364,0.700337,0.727273
8,0.826006,0.485432,0.611496
9,0.756186,0.731039,0.7434
