*Copyright (c) Microsoft Corporation. All rights reserved.*

*Licensed under the MIT License.*

# Sentiment analysis using BERT

In [1]:
# This code is obtained from the Text classification notebook
import sys
sys.path.append("../../")
import os
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from utils_nlp.dataset.multinli import load_pandas_df
from utils_nlp.eval.classification import eval_classification
from utils_nlp.models.bert.sequence_classification import BERTSequenceClassifier
from utils_nlp.models.bert.common import Language, Tokenizer
from utils_nlp.common.timer import Timer
import torch
import torch.nn as nn
import numpy as np

In this notebook, we follow along with the [text-classification notebook](tc_mnli_bert.ipynb) to fine-tune BERT for sentiment analysis on the [IMDB Large Movie Review dataset](http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz) hosted by Stanford. The data pre-processing below is adapted from Google Research's example notebook for fine-tuning BERT.

In [2]:
# --- Paths ---
DATA_FOLDER = "../../../temp"  # NOTE(review): not referenced by any cell visible in this notebook
BERT_CACHE_DIR = "../../../temp"  # passed as cache_dir to the tokenizer and the classifier

# --- Tokenization ---
LANGUAGE = Language.ENGLISH
TO_LOWER = True
MAX_LEN = 150  # length each token list is padded or truncated to

# --- Training / scoring ---
NUM_GPUS = 2
NUM_EPOCHS = 1
BATCH_SIZE = 32
TRAIN_SIZE = 0.6  # NOTE(review): not referenced by any cell visible in this notebook

# --- DataFrame column names ---
TEXT_COL = "sentence"
LABEL_COL = "polarity"

## Data processing

In [14]:
from utils_nlp.dataset.url_utils import maybe_download
import tarfile
from tqdm import tqdm_notebook as tqdm
import re


In [25]:
# Load the data from a directory

# Download the dataset archive (if needed) and extract it
def download_or_find(url, directory=".", filename="aclImdb.tar.gz"):
    """
    Fetch the dataset archive if needed, extract it, and return the extracted folder.

    The download itself is delegated to ``maybe_download`` (presumably a no-op when
    the archive is already present -- confirm against its implementation). The archive
    is only extracted when the ``aclImdb`` folder does not already exist under
    ``directory``.

    Args:
        url (string): The URL of the dataset
        directory (string): Where to look for or store the dataset, default to current directory
        filename (string): What filename to use for retrieve or store the dataset

    Return:
        data_path (string): Path of the extracted ``aclImdb`` folder
    """
    print("=====> Begin downloading")
    file_path = maybe_download(url, filename, directory)
    print("=====> Done downloading")

    # Folder the archive unpacks into; its presence is used as the "already extracted" marker
    data_path = os.path.join(os.getcwd(), directory, "aclImdb")

    if not os.path.exists(data_path):
        # Context manager guarantees the archive handle is closed even if extraction fails.
        # NOTE(review): extractall trusts the member paths stored in the archive; only use
        # this with archives from a trusted source.
        with tarfile.open(file_path) as tar:
            tar.extractall(directory)

    # Return the path of dataset when done
    print("=====> Finish extracting")
    return data_path
    

# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
    """
    Read every review file in ``directory`` into a DataFrame.

    Review files are expected to be named ``<index>_<rating>.txt``. Directory entries
    whose name does not match that pattern are skipped instead of raising.

    Args:
        directory (string): Folder containing one review per text file

    Return:
        pandas.DataFrame with two columns: "sentence" (the file content) and
        "sentiment" (the rating part of the file name, kept as a string)
    """
    # Raw string so that "\d" is a regex class rather than an invalid string escape
    name_pattern = re.compile(r"\d+_(\d+)\.txt")

    # Column-oriented accumulator; turned into a DataFrame once at the end
    data = {"sentence": [], "sentiment": []}

    for file_name in tqdm(os.listdir(directory)):
        matched = name_pattern.match(file_name)
        if matched is None:
            # Not a review file (no "<index>_<rating>.txt" prefix): ignore it
            continue
        with open(os.path.join(directory, file_name), "r", encoding="utf8") as f:
            data["sentence"].append(f.read())
        # Only the rating (group 1) is kept; the index part of the name is discarded
        data["sentiment"].append(matched.group(1))

    return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory, random_state=None):
    """
    Load the "pos" and "neg" sub-folders of ``directory`` into one shuffled DataFrame.

    Args:
        directory (string): Folder that contains the "pos" and "neg" sub-folders
        random_state (int or None): Seed forwarded to ``DataFrame.sample`` for the
            shuffle. The default (None) keeps the shuffle unseeded; pass an int to
            get the same row order on every run.

    Return:
        pandas.DataFrame holding the rows of both sub-folders plus a "polarity"
        column (1 for rows from "pos", 0 for rows from "neg"), shuffled and
        re-indexed from 0
    """
    print("===> Directory: {}".format(directory))
    # Load the positive and negative data to pandas Dataframe
    pos_df = load_directory_data(os.path.join(directory, "pos"))
    neg_df = load_directory_data(os.path.join(directory, "neg"))

    # Denoted positive to be 1 and negative be 0 for classification label
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0

    merged = pd.concat([pos_df, neg_df])
    # frac=1 returns every row in random order, i.e. a shuffle
    return merged.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False, directory="data"):
    """
    Download (if needed), extract and load the IMDB train and test sets.

    Args:
        force_download (bool): When True, remove any cached archive and extracted
            folder under ``directory`` first so that the data is fetched and
            extracted again. Defaults to False (reuse whatever is already there).
        directory (string): Folder used to store the archive and the extracted
            data. Defaults to "data".

    Return:
        (train_df, test_df): the DataFrames returned by ``load_dataset`` for the
        "train" and "test" sub-folders of the extracted dataset
    """
    import shutil  # local import: only needed for the forced re-download path

    URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    ARCHIVE_NAME = "aclImdb.tar.gz"

    if force_download:
        # Drop cached copies so download_or_find cannot short-circuit on them.
        # NOTE(review): assumes maybe_download stores the archive at
        # <directory>/<filename> -- confirm against its implementation.
        archive_path = os.path.join(directory, ARCHIVE_NAME)
        if os.path.isfile(archive_path):
            os.remove(archive_path)
        shutil.rmtree(os.path.join(directory, "aclImdb"), ignore_errors=True)

    dataset_path = download_or_find(URL, directory=directory, filename=ARCHIVE_NAME)

    print("=============> Complete downloading")
    print("**** Dataset path: {}".format(dataset_path))

    train_df = load_dataset(os.path.join(dataset_path, "train"))

    print("===> Complete train df")

    test_df = load_dataset(os.path.join(dataset_path, "test"))
    print("===> Complete test df")

    return train_df, test_df


In [26]:
# Download (if needed), extract and load the reviews into the train and test DataFrames
train, test = download_and_load_datasets()

=====> Begin downloading
=====> Done downloading
=====> Finish extracting
**** Dataset path: C:\Users\ducl\Documents\GitHub\nlp-2\scenarios\text_classification\data\aclImdb
===> Directory: C:\Users\ducl\Documents\GitHub\nlp-2\scenarios\text_classification\data\aclImdb\train


HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))


===> Complete train df
===> Directory: C:\Users\ducl\Documents\GitHub\nlp-2\scenarios\text_classification\data\aclImdb\test


HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12500), HTML(value='')))


===> Complete test df


Keep in mind that this dataset contains whole movie reviews rather than individual sentences. A single BERT input here is therefore a paragraph that may contain multiple sentences, compared to the original notebook where each input is a single sentence.

In [None]:
# Preview the first rows: review text ("sentence"), rating ("sentiment") and label ("polarity")
train.head(5)

In [None]:
# Column dtypes and non-null counts of the training DataFrame
train.info()

The dataset is divided equally into two polarity groups, where 1 is a positive review and 0 is a negative review. The sentiment value gives a more detailed rating of the review, but we will experiment with it later.

In [None]:
# Class balance of the label column (LABEL_COL is "polarity")
train[LABEL_COL].value_counts()

In [None]:
# Take a random subset of each split.
# A fixed seed makes the subset, and therefore every metric computed downstream,
# reproducible across kernel restarts.
SAMPLE_SIZE = 5000
SAMPLE_SEED = 42
df_train = train.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)
df_test = test.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)

We encode the class labels with a `LabelEncoder` so that the mapping between class labels and integer ids is explicit.

In [None]:
# Fit the encoder on the training labels only, then reuse the fitted mapping for both splits
label_encoder = LabelEncoder()
label_encoder.fit(df_train[LABEL_COL])
labels_train = label_encoder.transform(df_train[LABEL_COL])
labels_test = label_encoder.transform(df_test[LABEL_COL])

# Number of distinct encoded labels present in the training subset
num_labels = np.unique(labels_train).size

In [None]:
# One summary line per quantity: label count, then the size of each subset
for template, value in (
    ("Number of unique labels: {}", num_labels),
    ("Number of training examples: {}", df_train.shape[0]),
    ("Number of testing examples: {}", df_test.shape[0]),
):
    print(template.format(value))

## Tokenize and Preprocess

Before training, we need to transform the text data into a format that BERT understands. This process involves two steps. First, we instantiate a BERT tokenizer with a given language and then tokenize the text of the training and testing sets.

In [None]:
# Build the tokenizer once and reuse it for both splits
tokenizer = Tokenizer(LANGUAGE, to_lower=TO_LOWER, cache_dir=BERT_CACHE_DIR)

# The tokenizer is given plain Python lists of review texts
train_texts = df_train[TEXT_COL].tolist()
test_texts = df_test[TEXT_COL].tolist()

tokens_train = tokenizer.tokenize(train_texts)
tokens_test = tokenizer.tokenize(test_texts)

Second, we perform the following preprocessing steps in the cell below:
- Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary
- Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence
- Pad or truncate the token lists to the specified max length
- Return mask lists that indicate paddings' positions
- Return token type id lists that indicate which sentence the tokens belong to (not needed for one-sequence classification)

*See the original [implementation](https://github.com/google-research/bert/blob/master/run_classifier.py) for more information on BERT's input format.*

In [79]:
# Turn the token lists into model inputs of length MAX_LEN and get the matching
# padding masks. The third return value is discarded (presumably the token type
# ids, which are not needed for one-sequence classification -- confirm).
# NOTE(review): tokens_train / tokens_test are overwritten with the processed
# result, so this cell is not idempotent: re-running it without re-running the
# tokenization cell feeds the already-processed output back in.
tokens_train, mask_train, _ = tokenizer.preprocess_classification_tokens(
    tokens_train, MAX_LEN
)
tokens_test, mask_test, _ = tokenizer.preprocess_classification_tokens(
    tokens_test, MAX_LEN
)

## Create Model
Next, we create a sequence classifier that loads a pre-trained BERT model, given the language and number of labels.

In [80]:
# Sequence classifier for LANGUAGE with num_labels output classes;
# BERT_CACHE_DIR is passed as the cache directory
classifier = BERTSequenceClassifier(
    language=LANGUAGE, num_labels=num_labels, cache_dir=BERT_CACHE_DIR
)

## Train
We train the classifier using the training examples. This involves fine-tuning Hugging Face's PyTorch implementation of a pre-trained BERT transformer with a linear classifier layer attached to perform sequence classification tasks. The arguments for fitting the classifier are:
- token_ids: list of token indices
- input_mask: mask lists that indicate paddings' position
- labels: list of training labels
- num_gpus: number of GPUs. If none specified, all available GPUs will be used
- num_epochs: number of training epochs (default 1)
- batch_size: training batch size (default 32)
- verbose: displays training progress and loss values

In [81]:
# Collect the fit arguments up front so the timed block contains only the call itself
fit_args = dict(
    token_ids=tokens_train,
    input_mask=mask_train,
    labels=labels_train,
    num_gpus=NUM_GPUS,
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=True,
)

with Timer() as t:
    classifier.fit(**fit_args)

# t.interval is divided by 3600 to report hours
training_hours = t.interval / 3600
print("[Training time: {:.3f} hrs]".format(training_hours))

t_total value of -1 results in schedule not being applied


epoch:1/1; batch:1->16/156; loss:0.696118
epoch:1/1; batch:17->32/156; loss:0.362434
epoch:1/1; batch:33->48/156; loss:0.358708
epoch:1/1; batch:49->64/156; loss:0.350601
epoch:1/1; batch:65->80/156; loss:0.397891
epoch:1/1; batch:81->96/156; loss:0.483194
epoch:1/1; batch:97->112/156; loss:0.236887
epoch:1/1; batch:113->128/156; loss:0.300213
epoch:1/1; batch:129->144/156; loss:0.511190
epoch:1/1; batch:145->156/156; loss:0.113811
[Training time: 1.165 hrs]


## Score
We score the test set using the trained classifier:

In [82]:
# Predict labels for the preprocessed test subset, using the same GPU count and
# batch size as training
preds = classifier.predict(
    token_ids=tokens_test, input_mask=mask_test, num_gpus=NUM_GPUS, batch_size=BATCH_SIZE
)






  0%|                                                                                         | 0/5000 [00:00<?, ?it/s]




  1%|▌                                                                               | 32/5000 [00:09<23:24,  3.54it/s]




  1%|█                                                                               | 64/5000 [00:17<23:05,  3.56it/s]




  2%|█▌                                                                              | 96/5000 [00:26<22:52,  3.57it/s]




  3%|██                                                                             | 128/5000 [00:35<22:47,  3.56it/s]




  3%|██▌                                                                            | 160/5000 [00:44<22:45,  3.55it/s]




  4%|███                                                                            | 192/5000 [00:53<22:30,  3.56it/s]




  4%|███▌                                                                           | 224/5000 [01:02<22:25,  3.55it/s]




  5

 84%|█████████████████████████████████████████████████████████████████▉            | 4224/5000 [19:48<03:38,  3.55it/s]




 85%|██████████████████████████████████████████████████████████████████▍           | 4256/5000 [19:57<03:30,  3.53it/s]




 86%|██████████████████████████████████████████████████████████████████▉           | 4288/5000 [20:06<03:22,  3.52it/s]




 86%|███████████████████████████████████████████████████████████████████▍          | 4320/5000 [20:15<03:13,  3.51it/s]




 87%|███████████████████████████████████████████████████████████████████▉          | 4352/5000 [20:24<03:05,  3.49it/s]




 88%|████████████████████████████████████████████████████████████████████▍         | 4384/5000 [20:33<02:55,  3.50it/s]




 88%|████████████████████████████████████████████████████████████████████▉         | 4416/5000 [20:42<02:44,  3.54it/s]




 89%|█████████████████████████████████████████████████████████████████████▍        | 4448/5000 [20:51<02:35,  3.56it/s]




 90%|███

## Evaluate Results
Finally, we compute the accuracy, precision, recall, and F1 metrics of the evaluation on the test set.

In [None]:
# Names are listed in label-id order: polarity 0 is negative, polarity 1 is positive
report = classification_report(
    labels_test,
    preds,
    target_names=["negative", "positive"],
)
print(report)