*Copyright (c) Microsoft Corporation. All rights reserved.*


*Licensed under the MIT License.*

# Text Classification of Yahoo Answers using BERT


In [None]:
import sys
sys.path.append("../../")
import os
import pandas as pd
import utils_nlp.dataset.yahoo_answers as ya_dataset
from utils_nlp.eval.classification import eval_classification
import torch
import torch.nn as nn
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.tokenization import BertTokenizer
import numpy as np
from sklearn.metrics import f1_score

In [None]:
# Notebook configuration. All paths are relative to the notebook's working directory.

# Folder the dataset is downloaded into and read from.
DATA_FOLDER = "../../../temp"
# Train/test CSV locations, joined onto DATA_FOLDER before use.
TRAIN_FILE = "yahoo_answers_csv/train.csv"
TEST_FILE = "yahoo_answers_csv/test.csv"
# Passed as `cache_dir` when loading the pretrained BERT tokenizer and model.
BERT_CACHE_DIR = "../../../temp"
# Length every token-id sequence is padded to. The text itself is truncated to
# MAX_LEN - 2 tokens so that [CLS] and [SEP] still fit.
MAX_LEN = 100
# Number of examples per batch, used for both training and scoring.
BATCH_SIZE = 32
# When False, the BERT embedding parameters get requires_grad = False before training.
UPDATE_EMBEDDINGS = False
# Number of iterations of the outer training loop.
NUM_EPOCHS = 1
NUM_ROWS_TRAIN = 100000  # number of training examples to read

## Download Dataset

In [None]:
# Download the dataset unless both the train and the test CSV are already on disk.
dataset_is_cached = all(
    os.path.isfile(os.path.join(DATA_FOLDER, csv_name))
    for csv_name in (TRAIN_FILE, TEST_FILE)
)
if not dataset_is_cached:
    ya_dataset.download(DATA_FOLDER)

## Read and Preprocess Dataset

In [None]:
# load the raw data: a capped number of training rows and the whole test file
train_path = os.path.join(DATA_FOLDER, TRAIN_FILE)
test_path = os.path.join(DATA_FOLDER, TEST_FILE)
df_train = ya_dataset.read_data(train_path, nrows=NUM_ROWS_TRAIN)
df_test = ya_dataset.read_data(test_path, nrows=None)

# labels, dataset sizes, and the number of distinct labels seen in training
y_train = ya_dataset.get_labels(df_train)
y_test = ya_dataset.get_labels(df_test)
num_train_examples, num_test_examples = df_train.shape[0], df_test.shape[0]
num_labels = np.unique(y_train).size

# cleaned text for each split
text_train = ya_dataset.clean_data(df_train)
text_test = ya_dataset.clean_data(df_test)

# pretrained (uncased) BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased", cache_dir=BERT_CACHE_DIR, do_lower_case=True
)


def _encode_for_bert(texts):
    """Convert each text into a list of token ids padded with 0 up to MAX_LEN.

    Per text: tokenize, keep at most MAX_LEN - 2 tokens, wrap the result in
    [CLS] ... [SEP], map the tokens to ids, then right-pad with 0.
    """
    encoded = []
    for text in texts:
        pieces = tokenizer.tokenize(text)[0 : MAX_LEN - 2]
        pieces = ["[CLS]"] + pieces + ["[SEP]"]
        ids = tokenizer.convert_tokens_to_ids(pieces)
        encoded.append(ids + [0] * (MAX_LEN - len(ids)))
    return encoded


def _padding_mask(id_rows):
    """Build the input mask: min(1, id) per position, so padding (id 0) maps to 0."""
    return [[min(1, token_id) for token_id in row] for row in id_rows]


# token ids in BERT input format (truncated, wrapped, padded)
tokens_train = _encode_for_bert(text_train)
tokens_test = _encode_for_bert(text_test)

# input masks derived from the padded ids
input_mask_train = _padding_mask(tokens_train)
input_mask_test = _padding_mask(tokens_test)

## Set Device

In [None]:
# pick the first CUDA device when one is available, otherwise run on the CPU
if torch.cuda.is_available():
    device_str = "cuda:0"
else:
    device_str = "cpu"
device = torch.device(device_str)
print("using {} ...".format(device_str))

## Create Model


In [None]:
# pretrained BERT sequence classifier configured for `num_labels` labels
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=num_labels, cache_dir=BERT_CACHE_DIR
)
model = model.to(device)

# cross-entropy loss, placed on the same device as the model
loss_func = nn.CrossEntropyLoss()
loss_func = loss_func.to(device)

# Split the parameters into two optimizer groups: any parameter whose name
# contains one of the `no_decay` substrings gets weight_decay 0.0, every other
# parameter gets weight_decay 0.01.
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
param_optimizer = list(model.named_parameters())


def _skips_weight_decay(param_name):
    """Return True when the parameter name matches any `no_decay` substring."""
    return any(tag in param_name for tag in no_decay)


decayed_params = []
undecayed_params = []
for param_name, param in param_optimizer:
    if _skips_weight_decay(param_name):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)

optimizer_grouped_parameters = [
    {"params": decayed_params, "weight_decay": 0.01},
    {"params": undecayed_params, "weight_decay": 0.0},
]
opt = BertAdam(optimizer_grouped_parameters, lr=2e-5)

In [None]:
# freeze the BERT embedding parameters unless the config asks to update them
freeze_embeddings = not UPDATE_EMBEDDINGS
if freeze_embeddings:
    for embedding_param in model.bert.embeddings.parameters():
        embedding_param.requires_grad_(False)

In [None]:
# wrap the model in DataParallel when more than one GPU is visible
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    model = nn.DataParallel(model)
print("using {} GPUs...".format(gpu_count))

## Train Model

In [None]:
# Train: NUM_EPOCHS passes, each made of `num_batches` optimizer steps on
# batches returned by ya_dataset.get_batch_rnd.
model.train()
num_batches = num_train_examples // BATCH_SIZE
# Report progress about every 1% of an epoch. The interval is clamped to at
# least 1: with fewer than 100 batches int(0.01 * num_batches) is 0, and using
# it as a modulus would raise ZeroDivisionError on the first step.
log_every = max(1, int(0.01 * num_batches))
for epoch in range(NUM_EPOCHS):
    for i in range(num_batches):
        # NOTE(review): presumably samples BATCH_SIZE random training examples;
        # confirm against utils_nlp.dataset.yahoo_answers.get_batch_rnd.
        X_batch, mask_batch, y_batch = ya_dataset.get_batch_rnd(
            tokens_train, input_mask_train, y_train, num_train_examples, BATCH_SIZE
        )
        X_batch = torch.tensor(X_batch, dtype=torch.long, device=device)
        y_batch = torch.tensor(y_batch, dtype=torch.long, device=device)
        mask_batch = torch.tensor(mask_batch, dtype=torch.long, device=device)

        opt.zero_grad()
        # labels=None: the model output is fed to loss_func below rather than
        # having the model compute a loss itself.
        y_h = model(X_batch, None, mask_batch, labels=None)
        loss = loss_func(y_h, y_batch)
        loss.backward()
        opt.step()

        if i % log_every == 0:
            print(
                "epoch:{}/{}; batch:{}/{}; loss:{}".format(
                    epoch + 1, NUM_EPOCHS, i + 1, num_batches, loss.item()
                )
            )

## Score Test Set

In [None]:
# Score the test set: run the model batch by batch in eval mode, collect the
# per-batch outputs, then take the argmax over axis 1 as the predicted label.
model.eval()
batch_outputs = []
for start in range(0, num_test_examples, BATCH_SIZE):
    X_batch, mask_batch, y_batch = ya_dataset.get_batch_by_idx(
        tokens_test, input_mask_test, y_test, start, BATCH_SIZE
    )
    X_batch = torch.tensor(X_batch, dtype=torch.long, device=device)
    y_batch = torch.tensor(y_batch, dtype=torch.long, device=device)
    mask_batch = torch.tensor(mask_batch, dtype=torch.long, device=device)
    # no gradients are needed for inference
    with torch.no_grad():
        p_batch = model(X_batch, None, mask_batch, labels=None)
    batch_outputs.append(p_batch.detach().cpu().numpy())

preds = np.concatenate([scores.argmax(1) for scores in batch_outputs])

## Evaluate Results

In [None]:
# compare predictions with the test labels and print each reported metric
eval_results = eval_classification(y_test, preds)
for template, metric in (
    ("accuracy: {}", "accuracy"),
    ("precision: {}", "precision"),
    ("recall: {}", "recall"),
):
    print(template.format(eval_results[metric]))