*Copyright (c) Microsoft Corporation. All rights reserved.*


*Licensed under the MIT License.*

# Text Classification of Yahoo Answers using BERT


In [1]:
import sys
sys.path.append("../../")
import os
import pandas as pd
import utils_nlp.dataset.yahoo_answers as ya_dataset
from utils_nlp.eval.classification import eval_classification
from utils_nlp.classification.bert import BERTSequenceClassifier, Language
import torch
import torch.nn as nn
import numpy as np

In [2]:
# --- Notebook configuration -------------------------------------------------
# Folder the dataset is downloaded into and read from. The previous value
# contained a "..." path segment, which is not a parent-directory reference;
# it now points at the same temp folder as BERT_CACHE_DIR.
DATA_FOLDER = "../../../temp"
# CSV locations, relative to DATA_FOLDER.
TRAIN_FILE = "yahoo_answers_csv/train.csv"
TEST_FILE = "yahoo_answers_csv/test.csv"
# Passed as `cache_dir` when the BERT classifier is constructed.
BERT_CACHE_DIR = "../../../temp"
# Passed as `max_len` to classifier.fit (presumably the maximum sequence
# length -- TODO confirm against BERTSequenceClassifier).
MAX_LEN = 100
# Batch size used for both training and prediction.
BATCH_SIZE = 32
# Device passed to classifier.fit.
DEVICE = "gpu"
# NOTE(review): not referenced by any cell visible in this notebook.
UPDATE_EMBEDDINGS = False
# Number of training epochs passed to classifier.fit.
NUM_EPOCHS = 1
NUM_ROWS_TRAIN = 10000  # number of training examples to read
NUM_ROWS_TEST = 10000  # number of test examples to read

## Download Dataset

In [3]:
# Fetch the dataset unless both CSV splits are already present on disk.
required_files = (TRAIN_FILE, TEST_FILE)
if not all(
    os.path.isfile(os.path.join(DATA_FOLDER, file_name))
    for file_name in required_files
):
    ya_dataset.download(DATA_FOLDER)

## Read and Preprocess Dataset

In [4]:
# Load the first NUM_ROWS_* rows of each split.
train_path = os.path.join(DATA_FOLDER, TRAIN_FILE)
test_path = os.path.join(DATA_FOLDER, TEST_FILE)
df_train = ya_dataset.read_data(train_path, nrows=NUM_ROWS_TRAIN)
df_test = ya_dataset.read_data(test_path, nrows=NUM_ROWS_TEST)

# Extract the labels; the class count is the number of distinct labels
# observed in the training rows that were read.
labels_train, labels_test = (
    ya_dataset.get_labels(df_train),
    ya_dataset.get_labels(df_test),
)
num_labels = len(np.unique(labels_train))

# Extract the text of each split.
text_train, text_test = (
    ya_dataset.get_text(df_train),
    ya_dataset.get_text(df_test),
)

## Create Model


In [5]:
# Build the classifier for English text with one output per observed label;
# BERT_CACHE_DIR is handed over as the cache directory.
classifier = BERTSequenceClassifier(
    language=Language.ENGLISH,
    num_labels=num_labels,
    cache_dir=BERT_CACHE_DIR,
)

In [6]:
# Train the classifier on the training split using the notebook settings.
fit_options = dict(
    text=text_train,
    labels=labels_train,
    max_len=MAX_LEN,
    device=DEVICE,
    use_multiple_gpus=True,
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=True,  # print the loss while training
)
classifier.fit(**fit_options)

epoch:1/1; batch:1/312; loss:2.334407091140747
epoch:1/1; batch:97/312; loss:1.4464187622070312
epoch:1/1; batch:193/312; loss:0.4434894919395447
epoch:1/1; batch:289/312; loss:0.7371340394020081


## Score Test Set

In [None]:
# Score the test split. Use the DEVICE constant from the configuration cell
# (as the training cell does) instead of a hard-coded "gpu", so changing
# DEVICE switches both training and prediction.
preds = classifier.predict(text=text_test, device=DEVICE, batch_size=BATCH_SIZE)

## Evaluate Results

In [8]:
# Compare predictions with the true test labels.
eval_results = eval_classification(labels_test, preds)

# Report overall accuracy, then display the remaining metrics as a table.
accuracy = eval_results["accuracy"]
print("\n accuracy: {}".format(accuracy))

metric_columns = ["precision", "recall", "f1"]
pd.DataFrame(eval_results)[metric_columns]


 accuracy: 0.6608


Unnamed: 0,precision,recall,f1
0,0.5724,0.3835,0.4593
1,0.668,0.7676,0.7143
2,0.6569,0.8488,0.7406
3,0.4837,0.4521,0.4673
4,0.7724,0.8596,0.8136
5,0.8457,0.8317,0.8387
6,0.5804,0.5214,0.5493
7,0.6691,0.7032,0.6858
8,0.6826,0.713,0.6974
9,0.7459,0.6761,0.7093
