# BERT Binary Text Classification

In [0]:
#!pip install transformers -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install git+https://github.com/huggingface/transformers

In [0]:
import csv, os, sys, logging, time, pickle, torch
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data.sampler import RandomSampler, SequentialSampler
from torch.nn import CrossEntropyLoss
from torch.nn.utils import clip_grad_norm_
from transformers import BertTokenizer, BertForSequenceClassification
from transformers.optimization import AdamW
from transformers.optimization import get_linear_schedule_with_warmup as WarmupLinearSchedule
from tqdm import tqdm
from sklearn.metrics import matthews_corrcoef, confusion_matrix

In [0]:
!wget http://120.79.8.250:8080/dataset/train.csv  #-O datasets/yelp_review_polarity/train.csv
!wget http://120.79.8.250:8080/dataset/test.csv #-O datasets/yelp_review_polarity/test.csv

In [0]:
# The input data dir. Should contain the .tsv files (or other data files) for the task.
DATA_DIR = "/content/" # "datasets/yelp_review_polarity/"
# This is where BERT will look for pre-trained models to load parameters from.
CACHE_DIR = "/content/cache/"
# The name of the task to train. I'm going to name this 'yelp'.
TASK_NAME = '/content/yelp'
# The output directory where the fine-tuned model and checkpoints will be written.
OUTPUT_DIR = f'/content/outputs/{TASK_NAME}/'
# The directory where the evaluation reports will be written to.
REPORTS_DIR = f'/content/reports/{TASK_NAME}_evaluation_reports/'
WEIGHTS_NAME = "/content/pytorch_model.bin"
CONFIG_NAME = "/content/config.json"

# The following variables are for training.
TRAIN_BATCH_SIZE = 16
GRADIENT_ACCUMULATION_STEPS = 1 
NUM_TRAIN_EPOCHS = 1
# The maximum total input sequence length after WordPiece tokenization.
# Sequences longer than this will be truncated, and sequences shorter than this will be padded.
MAX_SEQ_LENGTH = 128
OUTPUT_MODE = 'classification'
# Bert pre-trained model selected in the list: bert-base-uncased, 
# bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased,
# bert-base-multilingual-cased, bert-base-chinese.
BERT_MODEL = "bert-base-cased"
LEARNING_RATE = 2e-5
# Deprecated for `BertAdam`
# WARMUP_PROPORTION = 0.1 
NUM_WARMUP_STEPS = 100
MAX_GRAD_NORM = 1.0

# More variables for evaluation.
EVAL_BATCH_SIZE = 8
RANDOM_SEED = 42

## Data Preparation
We are going to use [the Yelp Review Polarity dataset](https://s3.amazonaws.com/fast-ai-nlp/yelp_review_polarity_csv.tgz) to train our binary text classification model. First load the dataset in with `pandas` and take a look at it.

In [0]:
train_df = pd.read_csv(DATA_DIR + "train.csv", header=None)
train_df.head()

In [0]:
test_df = pd.read_csv(DATA_DIR + "test.csv", header=None)
test_df.head()

BERT, however, wants data to be in a `tsv` file with a specific format as given below (Four columns, and no header row).

* *Column 0:* An ID for the row
* *Column 1:* The label for the row (should be an int)
* *Column 2:* A column of the same letter for all rows. BERT wants this so we’ll give it, but we don’t have a use for it.
* *Column 3:* The text for the row

Let's make things a little BERT-friendly.

In [0]:
train_df_bert = pd.DataFrame({
    "id": range(len(train_df)),
    "label": train_df[3],
    "alpha": ["a"] * train_df.shape[0],
    "text": train_df[2].replace(r'\n', ' ', regex=True)
})

train_df_bert.head()

In [0]:
dev_df_bert = pd.DataFrame({
    "id": range(len(test_df)),
    "label": test_df[3],
    "alpha": ["a"] * test_df.shape[0],
    "text": test_df[2].replace(r'\n', ' ', regex=True)
})

dev_df_bert.head()

For convenience, I've named the test data as dev data. The convenience stems from the fact that BERT comes with data loading classes that expects **train** and **dev** files in the above format. We can use the train data to train our model, and the dev data to evaluate its performance. 

BERT's data loading classes can also use a **test** file but it expects the test file to be unlabelled.

Now that we have the data in the correct form, all we need to do is to save the train and dev data as `.tsv` files.

In [0]:
train_df_bert.to_csv(DATA_DIR + 'train.tsv', sep='\t', index=False, header=False)

In [0]:
dev_df_bert.to_csv(DATA_DIR + 'dev.tsv', sep='\t', index=False, header=False)

## Data to Features
The final step before fine-tuning is to convert the data into features that BERT uses.

The first class, `InputExample`, is the format that a single example of our dataset should be in. We won't be using the `text_b` attribute since that is not necessary for our binary classification task.

The other two classes, `DataProcessor` and `BinaryClassificationProcessor`, are helper classes that can be used to read in `.tsv` files and prepare them to be converted into features that will ultimately be fed into the actual BERT model.

In [0]:
# Increase CSV reader's field limit in case we have long text
csv.field_size_limit(2147483647)

class InputExample(object):
    """
    A single training/test example for simple sequence classification.
    """
    
    def __init__(self, guid, text_a, text_b=None, label=None):
        """
        Construct an `InputExample`.
        
        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence. Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
        
class DataProcessor(object):
    """
    Base class for data converters for sequence classification data sets.
    """
    
    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines

class BinaryClassificationProcessor(DataProcessor):
    """
    Processor for binary classification dataset.
    """

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[3]
            label = line[1]
            examples.append(InputExample(guid=guid, 
                                         text_a=text_a, 
                                         text_b=None, 
                                         label=label))
        return examples

So far, we have the capability to read in `.tsv` datasets and convert them into `InputExample` objects. BERT, being a neural network, cannot directly deal with text as we have in `InputExample` objects. The next step is to convert them into `InputFeatures`.

BERT has a constraint on the maximum length of a sequence after tokenizing. For any BERT model, the maximum sequence length after tokenization is $512$. But we can set any sequence length equal to or below this value. For faster training, I'll be using $128$ as the maximum sequence length. A bigger number may give better results if there are sequences longer than this value.

An `InputFeatures` consists of purely numerical data (with the proper sequence lengths) that can then be fed into the BERT model. This is prepared by tokenizing the text of each example and truncating the longer sequence while padding the shorter sequences to the given maximum sequence length ($128$).

In [0]:
class InputFeatures(object):
    """
    A single set of features of data.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """
    Truncates a sequence pair in place to the maximum length.
    """

    # This is a simple heuristic which will always truncate the longer sequence one token at a time. 
    # This makes more sense than truncating an equal percent of tokens from each, since if one sequence 
    # is very short then each token that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

def convert_example_to_feature(example_row):
    # Input `example_row`
    example, label_map, max_seq_length, tokenizer, output_mode = example_row

    tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)
        # Modify `tokens_a` and `tokens_b` in place so that the total length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]

    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    segment_ids = [0] * len(tokens)

    if tokens_b:
        tokens += tokens_b + ["[SEP]"]
        segment_ids += [1] * (len(tokens_b) + 1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids += padding

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    if output_mode == "classification":
        label_id = label_map[example.label]
    elif output_mode == "regression":
        label_id = float(example.label)
    else:
        raise KeyError(output_mode)

    return InputFeatures(input_ids=input_ids,
                         input_mask=input_mask,
                         segment_ids=segment_ids,
                         label_id=label_id)

## Pickling
In the following cells, we are going to use our `BinaryClassificationProcessor` to load in the data, and get everything ready for the tokenization step. Our goal is to create a list of tuples, `train_examples_for_processing`, and then run `convert_example_to_feature` for each item of it.

In [0]:
processor = BinaryClassificationProcessor()
train_examples = processor.get_train_examples(DATA_DIR)
# print(train_examples[:8])
# [<__main__.InputExample object at 0x000001568D222FD0>, <__main__.InputExample object at 0x00000156BF287D68>, <__main__.InputExample object at 0x00000156BF579630>, <__main__.InputExample object at 0x00000156BF5796A0>, <__main__.InputExample object at 0x00000156BF579710>, <__main__.InputExample object at 0x00000156BF579780>, <__main__.InputExample object at 0x00000156BF5797F0>, <__main__.InputExample object at 0x00000156BF579860>]

train_examples_len = len(train_examples)

In [0]:
# [0, 1] for binary classification
label_list = processor.get_labels() 
num_labels = len(label_list)

print(num_labels)

In [0]:
num_train_optimization_steps = int(train_examples_len / TRAIN_BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS

In [0]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=False)

In [0]:
label_map = {label: i for i, label in enumerate(label_list)}
train_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH, tokenizer, OUTPUT_MODE) for example in train_examples]

# train_examples_for_processing = train_examples_for_processing[:8]
# print(train_examples_for_processing)
# [(<__main__.InputExample object at 0x000001568D1E0C50>, {'0': 0, '1': 1}, 128, <transformers.tokenization_bert.BertTokenizer object at 0x00000156FF5976D8>, 'classification'), (<__main__.InputExample object at 0x000001568D1E0DA0>, {'0': 0, '1': 1}, 128, <transformers.tokenization_bert.BertTokenizer object at 0x00000156FF5976D8>, 'classification'), (<__main__.InputExample object at 0x00000156CC75EAC8>, {'0': 0, '1': 1}, 128, <transformers.tokenization_bert.BertTokenizer object at 0x00000156FF5976D8>, 'classification'), (<__main__.InputExample object at 0x00000156CC75EB38>, {'0': 0, '1': 1}, 128, <transformers.tokenization_bert.BertTokenizer object at 0x00000156FF5976D8>, 'classification'), (<__main__.InputExample object at 0x00000156CC75EBA8>, {'0': 0, '1': 1}, 128, <transformers.tokenization_bert.BertTokenizer object at 0x00000156FF5976D8>, 'classification'), (<__main__.InputExample object at 0x00000156CC75EC18>, {'0': 0, '1': 1}, 128, <transformers.tokenization_bert.BertTokenizer object at 0x00000156FF5976D8>, 'classification'), (<__main__.InputExample object at 0x00000156CC75EC88>, {'0': 0, '1': 1}, 128, <transformers.tokenization_bert.BertTokenizer object at 0x00000156FF5976D8>, 'classification'), (<__main__.InputExample object at 0x00000156CC75ECF8>, {'0': 0, '1': 1}, 128, <transformers.tokenization_bert.BertTokenizer object at 0x00000156FF5976D8>, 'classification')]

We set some variables that we'll use while training the model. Next, we loaded the pretrained tokenizer by BERT. In this case, we'll be using the `bert-base-cased` model.


The `convert_example_to_feature` function expects a tuple containing an example, the label map, the maximum sequence length, a tokenizer, and the output mode. So lastly, we will create an examples list ready to be processed (tokenized, truncated/padded, and turned into `InputFeatures`) by the `convert_example_to_feature` function.

In [0]:
start_time = time.time()

print(f'Converting {train_examples_len} examples: \n')

train_features = []
num_examples_processed = 0

for example in train_examples_for_processing:
    train_features.append(convert_example_to_feature(example))
    num_examples_processed += 1
    if (num_examples_processed % 10000 == 0):
        print(f"{num_examples_processed} examples have been processed.")

print("")
print("Finish in %s seconds." % (time.time() - start_time))
# print(train_features)
# [<__main__.InputFeatures object at 0x00000156C7ED0E48>, <__main__.InputFeatures object at 0x00000156BF2FD668>, <__main__.InputFeatures object at 0x00000156BF2D2860>, <__main__.InputFeatures object at 0x00000156C267F588>, <__main__.InputFeatures object at 0x00000156C7ED0A58>, <__main__.InputFeatures object at 0x00000156BF29E978>, <__main__.InputFeatures object at 0x00000156BF2D2D30>, <__main__.InputFeatures object at 0x00000156AAAD3A20>]

In [0]:
with open(DATA_DIR + "train_features.pkl", "wb") as f:
    pickle.dump(train_features, f)

Once all the examples are converted into features, we can pickle them to disk for safekeeping. Next time, you can just unpickle the file to get the list of features.

## Finetuning

In [0]:
# Load pre-trained model (weights)
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, cache_dir=CACHE_DIR, num_labels=num_labels)

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

HuggingFace's PyTorch implementation of BERT comes with a function that automatically downloads the BERT model for us. The model will be downloaded into a cache folder.

We just need to do a tiny bit more configuration for the training.

In [0]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

print(optimizer_grouped_parameters)

In [0]:
# `BertAdam` has been deprecated
# optimizer = BertAdam(optimizer_grouped_parameters,
#                      lr=LEARNING_RATE,
#                      warmup=WARMUP_PROPORTION,
#                      t_total=num_train_optimization_steps)
optimizer = AdamW(optimizer_grouped_parameters, 
                  lr=LEARNING_RATE,
                  correct_bias=False)
scheduler = WarmupLinearSchedule(optimizer, num_warmup_steps= NUM_WARMUP_STEPS,num_training_steps =num_train_optimization_steps)

print(optimizer)
print()
print(scheduler)

In [0]:
global_step = 0
nb_tr_steps = 0
tr_loss = 0

In [0]:
# train_features = pickle.load(open("datasets/yelp_review_polarity/train_features.pkl", "rb"))
with open(DATA_DIR + "train_features.pkl", "rb") as f:
    train_features = pickle.load(f)

print("***** Running training *****")
print("  Num examples = %d", train_examples_len)
print("  Batch size = %d", TRAIN_BATCH_SIZE)
print("  Num steps = %d", num_train_optimization_steps)

all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
print("\nInput ids:")
print(all_input_ids)

all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
print("\nInput mask:")
print(all_input_mask)

all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
print("\nSegment ids:")
print(all_segment_ids)

if OUTPUT_MODE == "classification":
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
elif OUTPUT_MODE == "regression":
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
print("\nLabel ids:")
print(all_label_ids)

Setup our `DataLoader` for training.

In [0]:
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)

Train the model.

In [0]:
model.train()
for _ in tqdm(range(NUM_TRAIN_EPOCHS), desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        
        logits = model(input_ids, segment_ids, input_mask, labels=None)
        # print(logits)
        # (tensor([[-0.2749,  0.3726],
        #         [-0.0907,  0.1621],
        #         [-0.0432,  0.1179],
        #         [ 0.3068,  0.2338],
        #         [-0.0388,  0.1583],
        #         [ 0.8389,  0.0382],
        #         [-0.2859,  0.1343],
        #         [ 0.5700, -0.0209],
        #         [ 0.0744,  0.1734],
        #         [-0.0615,  0.1675],
        #         [ 0.2898,  0.2939],
        #         [ 0.2406,  0.1009]], device='cuda:0', grad_fn=<AddmmBackward>),)
        
        # print(logits[0])
        # tensor([[ 0.1675,  0.0322],
        #        [-0.3478,  0.2567],
        #        [ 0.1478, -0.0364],
        #        [-0.1464,  0.2940],
        #        [-0.0275,  0.2472],
        #        [ 0.1564,  0.2518],
        #        [ 0.1667,  0.0801],
        #        [ 0.7791,  0.0669],
        #        [-0.2284,  0.1228],
        #        [-0.1749,  0.1484],
        #        [-0.0431,  0.1213],
        #        [-0.0634, -0.0488]], device='cuda:0', grad_fn=<AddmmBackward>)
        
        if OUTPUT_MODE == "classification":
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits[0].view(-1, num_labels), label_ids.view(-1))
        elif OUTPUT_MODE == "regression":
            loss_fct = MSELoss()
            loss = loss_fct(logits[0].view(-1), label_ids.view(-1))

        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS

        loss.backward()
        print("\r%f" % loss, end='')
        
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
            clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)  
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1

In [0]:
# Only save the model itself
model_to_save = model.module if hasattr(model, 'module') else model

# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)

torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(OUTPUT_DIR)

Now we've trained the BERT model for one epoch, we can evaluate the results. Of course, more training will likely yield better results but even one epoch should be sufficient for proof of concept.

Save the model, configuration file, and vocabulary.

## Evaluation
Most of the code for the evaluation is very similar to the training process, so I won’t go into too much detail but I’ll list some important points.

* `BERT_MODEL` parameter should be the name of your fine-tuned model.
* The tokenizer should be loaded from the vocabulary file created in the training stage. In our case, that would `outputs/yelp/vocab.txt`.
* This time, we'll be using the `BinaryClassificationProcessor` to load in the `dev.tsv` file by calling the `get_dev_examples` method.
* Double check to make sure you are loading the fine-tuned model and not the original BERT model. Frankly speaking, I did it and got surprised with a really bad result.

In [0]:
# Create the report directory; in this case, `yelp_evaluation_reports`
if os.path.exists(REPORTS_DIR) and os.listdir(REPORTS_DIR):
    REPORTS_DIR += f'/report_{len(os.listdir(REPORTS_DIR))}'
    os.makedirs(REPORTS_DIR)

if not os.path.exists(REPORTS_DIR):
    os.makedirs(REPORTS_DIR)
    REPORTS_DIR += f'/report_{len(os.listdir(REPORTS_DIR))}'
    os.makedirs(REPORTS_DIR)

In [0]:
def get_eval_report(task_name, labels, preds):
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    return {
        "task": task_name,
        "mcc": mcc,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn
    }

def compute_metrics(task_name, labels, preds):
    assert len(preds) == len(labels)
    return get_eval_report(task_name, labels, preds)

In [0]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained(OUTPUT_DIR + 'vocab.txt', do_lower_case=False)

In [0]:
processor = BinaryClassificationProcessor()
eval_examples = processor.get_dev_examples(DATA_DIR)
# [0, 1] for binary classification
label_list = processor.get_labels() 
num_labels = len(label_list)
eval_examples_len = len(eval_examples)

In [0]:
label_map = {label: i for i, label in enumerate(label_list)}
eval_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH, tokenizer, OUTPUT_MODE) for example in eval_examples]

In [0]:
start_time = time.time()

print(f'Converting {eval_examples_len} examples. \n')

eval_features = []

for example in eval_examples_for_processing[:10000]:
    eval_features.append(convert_example_to_feature(example))

print("Finish in %s seconds." % (time.time() - start_time))

In [0]:
all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

if OUTPUT_MODE == "classification":
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
elif OUTPUT_MODE == "regression":
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

In [0]:
eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=EVAL_BATCH_SIZE)

In [0]:
# Load pre-trained model (weights)
model = BertForSequenceClassification.from_pretrained(OUTPUT_DIR, num_labels=len(label_list))

In [0]:
model.to(device)

In [0]:
model.eval()
eval_loss = 0
nb_eval_steps = 0
preds = []

for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask, labels=None)

    # create eval loss and other metric required by the task
    if OUTPUT_MODE == "classification":
        loss_fct = CrossEntropyLoss()
        tmp_eval_loss = loss_fct(logits[0].view(-1, num_labels), label_ids.view(-1))
    elif OUTPUT_MODE == "regression":
        loss_fct = MSELoss()
        tmp_eval_loss = loss_fct(logits[0].view(-1), label_ids.view(-1))

    eval_loss += tmp_eval_loss.mean().item()
    nb_eval_steps += 1
    if len(preds) == 0:
        preds.append(logits[0].detach().cpu().numpy())
    else:
        preds[0] = np.append(preds[0], logits[0].detach().cpu().numpy(), axis=0)
        
eval_loss = eval_loss / nb_eval_steps

# print("preds:", preds)
# preds: [array([[-0.07383163,  0.07358287],
#        [ 0.26556033, -0.2577362 ],
#        [-0.04184964, -0.05931795],
#        ...,
#        [ 0.2624964 , -0.11659782],
#        [ 0.6032646 , -0.3702591 ],
#        [ 0.6068794 , -0.34930488]], dtype=float32)]
preds = preds[0]
# print("preds:", preds)
# preds: [[-0.07383163  0.07358287]
#        [ 0.26556033 -0.2577362 ]
#        [-0.04184964 -0.05931795]
#        ...
#        [ 0.2624964  -0.11659782]
#        [ 0.6032646  -0.3702591 ]
#        [ 0.6068794  -0.34930488]]

if OUTPUT_MODE == "classification":
    preds = np.argmax(preds, axis=1)
elif OUTPUT_MODE == "regression":
    preds = np.squeeze(preds)
    
# print(preds)
# [1 0 0 ... 0 0 0]

# print(all_label_ids)
# tensor([1, 0, 1,  ..., 0, 1, 0])

# print(all_label_ids.numpy())
# [1 0 1 ... 0 1 0]

result = compute_metrics(TASK_NAME, all_label_ids.numpy(), preds)

result['eval_loss'] = eval_loss

output_eval_file = os.path.join(REPORTS_DIR, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    for key in (result.keys()):
        print("  %s = %s" % (key, str(result[key])))
        writer.write("%s = %s\n" % (key, str(result[key])))

With just one single epoch of training, our BERT model achieves a $0.913$ Matthews correlation coefficient (good measure for evaluating unbalanced datasets according to `sklearn` doc [here](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)). With more training, and perhaps some hyperparameter tuning, we can almost certainly improve upon what is already an impressive score.

BERT is an incredibly powerful language representation model that shows great promise in a wide variety of NLP tasks.