*Copyright (c) Microsoft Corporation. All rights reserved.*

*Licensed under the MIT License.*

# Text Classification of Yahoo Answers using BERT


In [None]:
import sys
sys.path.append("../../")
import os
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import utils_nlp.dataset.yahoo_answers as ya_dataset
from utils_nlp.eval.classification import eval_classification
from utils_nlp.bert.sequence_classification import SequenceClassifier
from utils_nlp.bert.common import Language, Tokenizer
from utils_nlp.common.timer import Timer
import torch
import torch.nn as nn
import numpy as np

In [None]:
# --- Notebook configuration -------------------------------------------------
# Folder the dataset is downloaded into; TRAIN_FILE / TEST_FILE are joined
# onto it when the CSVs are read. (Fixed: the path previously contained a
# stray "..." segment, which is not a parent-directory reference.)
DATA_FOLDER = "../../../temp"
TRAIN_FILE = "yahoo_answers_csv/train.csv"
TEST_FILE = "yahoo_answers_csv/test.csv"
# Passed as `cache_dir` to both the Tokenizer and the SequenceClassifier.
BERT_CACHE_DIR = "../../../temp"
# Length passed to `preprocess_classification_tokens` (pads / truncates).
MAX_LEN = 300
# Batch size passed to the classifier.
BATCH_SIZE = 16
# Passed as `use_gpu` to `classifier.fit`.
USE_GPU = True
# Number of training epochs passed to `classifier.fit`.
NUM_EPOCHS = 1
NUM_ROWS_TRAIN = 10000  # number of training examples to read
NUM_ROWS_TEST = 10000  # number of test examples to read

## Download Dataset

In [None]:
# Fetch the Yahoo Answers dataset into DATA_FOLDER using the project helper.
# NOTE(review): whether an existing download is reused or overwritten depends
# on `ya_dataset.download` -- confirm before re-running on a slow connection.
ya_dataset.download(DATA_FOLDER)

## Read Dataset

In [None]:
# Locations of the train / test CSV files inside the data folder.
train_path = os.path.join(DATA_FOLDER, TRAIN_FILE)
test_path = os.path.join(DATA_FOLDER, TEST_FILE)

# Load each split, capped at the configured number of examples.
df_train = ya_dataset.read_data(train_path, nrows=NUM_ROWS_TRAIN)
df_test = ya_dataset.read_data(test_path, nrows=NUM_ROWS_TEST)

# Training split: labels and raw text.
labels_train = ya_dataset.get_labels(df_train)
text_train = ya_dataset.get_text(df_train)

# Test split: labels and raw text.
labels_test = ya_dataset.get_labels(df_test)
text_test = ya_dataset.get_text(df_test)

# The number of classes is taken from the distinct labels seen in training.
distinct_train_labels = np.unique(labels_train)
num_labels = len(distinct_train_labels)

## Tokenize and Preprocess

In [None]:
# Build the BERT tokenizer for English, caching its files in BERT_CACHE_DIR.
# NOTE(review): `to_lower=False` keeps the original casing; if
# `Language.ENGLISH` maps to an uncased BERT model this should probably be
# True -- confirm against utils_nlp.bert.common.
tokenizer = Tokenizer(
    Language.ENGLISH,
    to_lower=False,
    cache_dir=BERT_CACHE_DIR,
)

# Step 1: split the raw text of each split into tokens.
raw_tokens_train = tokenizer.tokenize(text_train)
raw_tokens_test = tokenizer.tokenize(text_test)

# Step 2: convert to BERT-format tokens (padded and truncated to MAX_LEN),
# together with the matching input masks.
tokens_train, mask_train = tokenizer.preprocess_classification_tokens(
    raw_tokens_train, MAX_LEN
)
tokens_test, mask_test = tokenizer.preprocess_classification_tokens(
    raw_tokens_test, MAX_LEN
)

## Create Model


In [None]:
# Instantiate the BERT sequence classifier with one output per class found in
# the training labels; model files are cached in BERT_CACHE_DIR.
classifier = SequenceClassifier(
    language=Language.ENGLISH,
    num_labels=num_labels,
    cache_dir=BERT_CACHE_DIR,
)

## Train

In [None]:
# Everything the training call needs, gathered in one place.
fit_kwargs = {
    "token_ids": tokens_train,
    "input_mask": mask_train,
    "labels": labels_train,
    "use_gpu": USE_GPU,
    "num_epochs": NUM_EPOCHS,
    "batch_size": BATCH_SIZE,
    "verbose": True,
}

# Fine-tune the classifier and measure how long the run takes.
with Timer() as t:
    classifier.fit(**fit_kwargs)

# `t.interval` is divided by 3600 to report the duration in hours.
training_hours = t.interval / 3600
print("[Training time: {:.3f} hrs]".format(training_hours))

## Score Test Set

In [None]:
# Predict labels for the test split with the fine-tuned classifier.
# The device choice follows the notebook-level USE_GPU switch (the same flag
# used for training) instead of a hard-coded `False`, so scoring is not
# forced onto the CPU when a GPU is enabled in the configuration.
preds = classifier.predict(
    token_ids=tokens_test,
    input_mask=mask_test,
    use_gpu=USE_GPU,
    batch_size=BATCH_SIZE,
)

## Evaluate Results

In [None]:
# Overall accuracy across all classes.
accuracy = accuracy_score(labels_test, preds)

# Per-class precision / recall / F1: `average=None` returns one score per
# label rather than a single aggregated number.
precision, recall, f1 = (
    scorer(labels_test, preds, average=None)
    for scorer in (precision_score, recall_score, f1_score)
)

print("\n accuracy: {}".format(accuracy))

# Final expression of the cell, so the notebook renders the table.
per_class_metrics = {"precision": precision, "recall": recall, "f1": f1}
pd.DataFrame(per_class_metrics)