*Copyright (c) Microsoft Corporation. All rights reserved.*


*Licensed under the MIT License.*

# Text Classification of Yahoo Answers using BERT


In [None]:
import sys
sys.path.append("../../")
import os
import pandas as pd
import utils_nlp.dataset.yahoo_answers as ya_dataset
from utils_nlp.eval.classification import eval_classification
from utils_nlp.classification.bert import BERTSequenceClassifier, Language
import torch
import torch.nn as nn
import numpy as np
from sklearn.metrics import f1_score

In [None]:
# --- Filesystem locations -------------------------------------------------
# Folder holding the downloaded dataset; the CSV paths below are relative to it.
DATA_FOLDER = "../../../temp"
BERT_CACHE_DIR = "../../../temp"  # passed to the classifier as cache_dir
TRAIN_FILE = "yahoo_answers_csv/train.csv"
TEST_FILE = "yahoo_answers_csv/test.csv"

# --- Data volume ----------------------------------------------------------
NUM_ROWS_TRAIN = 10000  # number of training examples to read
NUM_ROWS_TEST = 10000  # number of test examples to read

# --- Training / inference settings ----------------------------------------
DEVICE = "gpu"
NUM_EPOCHS = 1
BATCH_SIZE = 32
MAX_LEN = 100  # forwarded to fit() as max_len
# NOTE(review): not referenced by any cell visible in this notebook.
UPDATE_EMBEDDINGS = False

## Download Dataset

In [None]:
# Fetch the dataset only when at least one of the expected CSV files is
# missing from DATA_FOLDER; the checks run train-first and stop at the
# first missing file.
if not all(
    os.path.isfile(os.path.join(DATA_FOLDER, csv_name))
    for csv_name in (TRAIN_FILE, TEST_FILE)
):
    ya_dataset.download(DATA_FOLDER)

## Read and Preprocess Dataset

In [None]:
# Read each CSV split; nrows limits how many examples are read.
df_train = ya_dataset.read_data(
    os.path.join(DATA_FOLDER, TRAIN_FILE),
    nrows=NUM_ROWS_TRAIN,
)
df_test = ya_dataset.read_data(
    os.path.join(DATA_FOLDER, TEST_FILE),
    nrows=NUM_ROWS_TEST,
)

# Pull the labels out of both frames (train first, then test).
labels_train, labels_test = (
    ya_dataset.get_labels(df_train),
    ya_dataset.get_labels(df_test),
)

# Count of distinct label values present in the training subset.
num_labels = np.unique(labels_train).size

# Pull the text out of both frames (train first, then test).
text_train, text_test = (
    ya_dataset.get_text(df_train),
    ya_dataset.get_text(df_test),
)

## Create Model


In [None]:
# Build the sequence classifier with one output per distinct training label;
# cache_dir points at BERT_CACHE_DIR.
classifier = BERTSequenceClassifier(
    pretrained_model=Language.ENGLISH,
    num_labels=num_labels,
    cache_dir=BERT_CACHE_DIR,
)

In [None]:
# Train the classifier on the training split using the notebook-level settings.
classifier.fit(
    text=text_train,
    labels=labels_train,
    max_len=MAX_LEN,
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    device=DEVICE,
    use_multiple_gpus=True,
    verbose=True,
)

## Score Test Set

In [None]:
# Score the test split. The device comes from the shared DEVICE setting (the
# same one passed to fit) rather than a hard-coded string, so changing
# DEVICE in the configuration cell affects training and scoring together.
preds = classifier.predict(
    text=text_test,
    device=DEVICE,
    batch_size=BATCH_SIZE,
)

## Evaluate Results

In [44]:
# Compare predictions against the test labels.
eval_results = eval_classification(labels_test, preds)

# Report the overall accuracy.
print("\n accuracy: {}".format(eval_results["accuracy"]))

# Final expression of the cell: render the precision/recall/f1 columns as a table.
pd.DataFrame(eval_results).loc[:, ["precision", "recall", "f1"]]


 accuracy: 0.6671


Unnamed: 0,precision,recall,f1
0,0.5631,0.4327,0.4894
1,0.7239,0.7278,0.7259
2,0.7084,0.7652,0.7357
3,0.5782,0.3747,0.4547
4,0.8288,0.8178,0.8233
5,0.8082,0.8717,0.8388
6,0.5457,0.597,0.5702
7,0.6166,0.7558,0.6792
8,0.6701,0.7693,0.7163
9,0.7593,0.6527,0.702
