*Copyright (c) Microsoft Corporation. All rights reserved.*

*Licensed under the MIT License.*

# Sentiment analysis using BERT

In [5]:
# This code is obtained from the Text classification notebook
import sys
sys.path.append("../../")
import os
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from utils_nlp.dataset.multinli import load_pandas_df
from utils_nlp.eval.classification import eval_classification
from utils_nlp.models.bert.sequence_classification import BERTSequenceClassifier
from utils_nlp.models.bert.common import Language, Tokenizer
from utils_nlp.common.timer import Timer
import torch
import torch.nn as nn
import numpy as np

In this notebook, we follow the [text-classification notebook](tc_mnli_bert.ipynb) to fine-tune a BERT model for sentiment analysis. The input is the [IMDB Large Movie Review dataset](http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz) hosted by Stanford. The data pre-processing below is adapted from a Google Research Jupyter notebook that performs the same sentiment-analysis task on Google Colab.

In [39]:
DATA_FOLDER = "../../../temp"  # NOTE(review): not referenced by the cells visible here
BERT_CACHE_DIR = "../../../temp"  # cache_dir handed to Tokenizer and BERTSequenceClassifier
LANGUAGE = Language.ENGLISH  # language used to pick the BERT tokenizer and model
TO_LOWER = True  # forwarded to Tokenizer as to_lower
MAX_LEN = 150  # length the token lists are padded or truncated to
BATCH_SIZE = 32  # batch_size for both classifier.fit and classifier.predict
NUM_GPUS = 2  # num_gpus for both classifier.fit and classifier.predict
NUM_EPOCHS = 1  # num_epochs for classifier.fit
TRAIN_SIZE = 0.6  # NOTE(review): not referenced by the cells visible here
LABEL_COL = "polarity"  # DataFrame column passed to the label encoder
TEXT_COL = "sentence"  # DataFrame column passed to the tokenizer

## Data processing

In [7]:
import tensorflow as tf
from tensorflow import keras
import os
import re

# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
  """Read every review file in `directory` into a DataFrame.

  Only entries whose name starts with ``<digits>_<digits>.txt`` are loaded;
  anything else in the directory (hidden files, stray artifacts) is skipped
  instead of crashing the whole load.

  Args:
    directory: Path of a folder containing the review text files.

  Returns:
    A DataFrame with two columns: "sentence" (the full file content) and
    "sentiment" (the second number of the file name, kept as a string).
    Rows follow the sorted file-name order; the frame is empty, with both
    columns present, when no file matches.
  """
  # Raw string: "\d" and "\." are regex escapes, not Python string escapes.
  name_pattern = re.compile(r"\d+_(\d+)\.txt")
  sentences = []
  sentiments = []
  # Sorting removes the dependency on the arbitrary os.listdir order.
  for file_name in sorted(os.listdir(directory)):
    matched = name_pattern.match(file_name)
    if matched is None:
      continue
    # Built-in open replaces the TF 1.x-only tf.gfile.GFile; newline="" keeps
    # the content untouched (no line-ending translation).
    file_path = os.path.join(directory, file_name)
    with open(file_path, "r", encoding="utf-8", newline="") as f:
      sentences.append(f.read())
    sentiments.append(matched.group(1))
  return pd.DataFrame({"sentence": sentences, "sentiment": sentiments})

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory, random_state=None):
  """Load the "pos" and "neg" review folders of one split into a shuffled DataFrame.

  Args:
    directory: Folder that contains a "pos" and a "neg" sub-folder.
    random_state: Optional seed forwarded to DataFrame.sample so the shuffle
      can be reproduced. The default (None) keeps the unseeded shuffle.

  Returns:
    A DataFrame holding the columns produced by load_directory_data plus
    "polarity" (1 for rows from "pos", 0 for rows from "neg"), with the rows
    shuffled and the index reset to 0..n-1.
  """
  # assign() returns new frames, so the helper's results are not mutated.
  pos_df = load_directory_data(os.path.join(directory, "pos")).assign(polarity=1)
  neg_df = load_directory_data(os.path.join(directory, "neg")).assign(polarity=0)
  combined = pd.concat([pos_df, neg_df])
  # frac=1 draws every row exactly once, i.e. a full shuffle.
  shuffled = combined.sample(frac=1, random_state=random_state)
  return shuffled.reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
  """Fetch the IMDB archive and return the (train, test) DataFrames.

  Args:
    force_download: NOTE(review): this flag is never read by the body, so it
      currently has no effect.

  Returns:
    A (train_df, test_df) tuple built by load_dataset from the "train" and
    "test" folders found under "aclImdb", next to the downloaded archive.
  """
  archive_path = tf.keras.utils.get_file(
      fname="aclImdb.tar.gz",
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
      extract=True)
  print("====> Complete downloading")

  # The split folders are looked up next to the archive file.
  corpus_root = os.path.join(os.path.dirname(archive_path), "aclImdb")

  train_df = load_dataset(os.path.join(corpus_root, "train"))
  print("===> Complete train df")

  test_df = load_dataset(os.path.join(corpus_root, "test"))
  print("===> Complete test df")

  return train_df, test_df


In [8]:
# Fetch the IMDB reviews and load the train and test splits as pandas DataFrames.
train, test = download_and_load_datasets()

====> Complete downloading
===> Complete train df
===> Complete test df


Keep in mind that this dataset contains whole movie reviews rather than individual sentences. Therefore, a single BERT input here is a paragraph that may contain multiple sentences, compared to the original version, where each input is a single sentence.

In [37]:
train.head(5)

Unnamed: 0,sentence,sentiment,polarity
0,"I am not a very good writer, so I'll keep this...",10,1
1,That is the only thing I can positive to say a...,2,0
2,Writer-director Tony Piccirillo adapted his ow...,4,0
3,A question for all you girls out there : If a ...,3,0
4,Sorry. Someone has to say it. This really is/w...,1,0


In [30]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
sentence     25000 non-null object
sentiment    25000 non-null object
polarity     25000 non-null int64
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


The dataset is divided equally into two polarity groups, where 1 is a positive review and 0 is a negative review. The sentiment value gives a more detailed rating of the reviewer's reaction, but we will experiment with it later.

In [36]:
train.polarity.value_counts()

1    12500
0    12500
Name: polarity, dtype: int64

In [75]:
# Draw SAMPLE_SIZE random reviews from each split; the cells below only use
# these samples.
# NOTE: the draw is unseeded, so every run selects different rows.
SAMPLE_SIZE = 5000
df_train, df_test = (split.sample(SAMPLE_SIZE) for split in (train, test))

We encode the class labels as integers so that the mapping between each class and its index is known.

In [76]:
# Fit the encoder on the training labels only, then reuse the fitted mapping
# for both the training and the test labels.
label_encoder = LabelEncoder()
label_encoder.fit(df_train[LABEL_COL])
labels_train = label_encoder.transform(df_train[LABEL_COL])
labels_test = label_encoder.transform(df_test[LABEL_COL])

# One encoder class per distinct training label.
num_labels = len(label_encoder.classes_)

In [77]:
# Report the label count and the size of each sampled split.
summary_rows = (
    ("Number of unique labels: {}", num_labels),
    ("Number of training examples: {}", df_train.shape[0]),
    ("Number of testing examples: {}", df_test.shape[0]),
)
for template, value in summary_rows:
    print(template.format(value))

Number of unique labels: 2
Number of training examples: 5000
Number of testing examples: 5000


## Tokenize and Preprocess

Before training, we tokenize the text documents and convert them to lists of tokens. The following steps instantiate a BERT tokenizer given the language, and tokenize the text of the training and testing sets.

In [78]:
# Instantiate the BERT tokenizer for the configured language.
tokenizer = Tokenizer(LANGUAGE, to_lower=TO_LOWER, cache_dir=BERT_CACHE_DIR)

# Tokenize the text column of each sampled split.
train_texts = df_train[TEXT_COL].tolist()
test_texts = df_test[TEXT_COL].tolist()
tokens_train = tokenizer.tokenize(train_texts)
tokens_test = tokenizer.tokenize(test_texts)






  0%|                                                                                         | 0/5000 [00:00<?, ?it/s]




  1%|▍                                                                              | 28/5000 [00:00<00:18, 272.47it/s]




  1%|█                                                                              | 68/5000 [00:00<00:16, 292.49it/s]




  2%|█▌                                                                             | 97/5000 [00:00<00:16, 291.25it/s]




  3%|█▉                                                                            | 126/5000 [00:00<00:17, 278.42it/s]




  3%|██▌                                                                           | 164/5000 [00:00<00:16, 297.84it/s]




  4%|██▉                                                                           | 190/5000 [00:00<00:16, 285.34it/s]




  4%|███▎                                                                          | 216/5000 [00:00<00:17, 277.23it/s]




  5

 42%|████████████████████████████████                                             | 2079/5000 [00:07<00:10, 284.52it/s]




 42%|████████████████████████████████▍                                            | 2109/5000 [00:07<00:10, 288.73it/s]




 43%|████████████████████████████████▉                                            | 2140/5000 [00:07<00:09, 294.73it/s]




 43%|█████████████████████████████████▍                                           | 2170/5000 [00:07<00:10, 281.15it/s]




 44%|██████████████████████████████████                                           | 2210/5000 [00:07<00:09, 299.41it/s]




 45%|██████████████████████████████████▌                                          | 2243/5000 [00:07<00:09, 302.17it/s]




 45%|███████████████████████████████████                                          | 2274/5000 [00:07<00:09, 284.09it/s]




 46%|███████████████████████████████████▍                                         | 2305/5000 [00:07<00:09, 286.02it/s]




 47%|███

 81%|██████████████████████████████████████████████████████████████▋              | 4074/5000 [00:14<00:03, 261.79it/s]




 82%|███████████████████████████████████████████████████████████████▏             | 4103/5000 [00:14<00:03, 258.40it/s]




 83%|███████████████████████████████████████████████████████████████▌             | 4129/5000 [00:14<00:03, 238.62it/s]




 83%|████████████████████████████████████████████████████████████████             | 4159/5000 [00:14<00:03, 246.63it/s]




 84%|████████████████████████████████████████████████████████████████▍            | 4184/5000 [00:14<00:03, 242.95it/s]




 84%|████████████████████████████████████████████████████████████████▊            | 4209/5000 [00:14<00:03, 244.79it/s]




 85%|█████████████████████████████████████████████████████████████████▎           | 4240/5000 [00:15<00:03, 251.34it/s]




 85%|█████████████████████████████████████████████████████████████████▋           | 4266/5000 [00:15<00:02, 253.62it/s]




 86%|███

 20%|███████████████▌                                                             | 1008/5000 [00:03<00:13, 305.19it/s]




 21%|████████████████                                                             | 1040/5000 [00:03<00:12, 304.76it/s]




 21%|████████████████▌                                                            | 1074/5000 [00:03<00:12, 314.19it/s]




 22%|█████████████████                                                            | 1111/5000 [00:03<00:11, 327.23it/s]




 23%|█████████████████▋                                                           | 1145/5000 [00:03<00:11, 328.64it/s]




 24%|██████████████████▏                                                          | 1179/5000 [00:04<00:11, 327.86it/s]




 24%|██████████████████▋                                                          | 1212/5000 [00:04<00:11, 316.59it/s]




 25%|███████████████████▏                                                         | 1244/5000 [00:04<00:12, 296.42it/s]




 26%|███

 64%|█████████████████████████████████████████████████▏                           | 3197/5000 [00:10<00:06, 299.32it/s]




 65%|█████████████████████████████████████████████████▋                           | 3228/5000 [00:10<00:06, 294.05it/s]




 65%|██████████████████████████████████████████████████▏                          | 3258/5000 [00:10<00:05, 290.51it/s]




 66%|██████████████████████████████████████████████████▋                          | 3289/5000 [00:11<00:05, 296.03it/s]




 66%|███████████████████████████████████████████████████▏                         | 3321/5000 [00:11<00:05, 299.65it/s]




 67%|███████████████████████████████████████████████████▋                         | 3354/5000 [00:11<00:05, 303.02it/s]




 68%|████████████████████████████████████████████████████▏                        | 3386/5000 [00:11<00:05, 304.53it/s]




 68%|████████████████████████████████████████████████████▋                        | 3422/5000 [00:11<00:05, 311.67it/s]




 69%|███

In addition, we perform the following preprocessing steps in the cell below:
- Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary
- Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence
- Pad or truncate the token lists to the specified max length
- Return mask lists that indicate paddings' positions
- Return token type id lists that indicate which sentence the tokens belong to (not needed for one-sequence classification)

*See the original [implementation](https://github.com/google-research/bert/blob/master/run_classifier.py) for more information on BERT's input format.*

In [79]:
# Turn each token list into its preprocessed form (limited to MAX_LEN) together
# with the matching mask list; the third returned value is discarded.
# NOTE(review): tokens_train / tokens_test are rebound to the preprocessed
# output, so re-running this cell alone would pass already-preprocessed data
# back in. Re-run the tokenize cell first.
tokens_train, mask_train, _ = tokenizer.preprocess_classification_tokens(
    tokens_train, MAX_LEN
)
tokens_test, mask_test, _ = tokenizer.preprocess_classification_tokens(
    tokens_test, MAX_LEN
)

## Create Model
Next, we create a sequence classifier that loads a pre-trained BERT model, given the language and number of labels.

In [80]:
# Build the sequence classifier for LANGUAGE with num_labels classes;
# cache_dir points at BERT_CACHE_DIR.
classifier = BERTSequenceClassifier(
    language=LANGUAGE, num_labels=num_labels, cache_dir=BERT_CACHE_DIR
)

## Train
We train the classifier using the training examples. This involves fine-tuning the BERT Transformer and learning a linear classification layer on top of that:

In [81]:
# Arguments for fine-tuning on the sampled training set.
fit_kwargs = dict(
    token_ids=tokens_train,
    input_mask=mask_train,
    labels=labels_train,
    num_gpus=NUM_GPUS,
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=True,
)

# Time the training call and report the elapsed time in hours.
with Timer() as t:
    classifier.fit(**fit_kwargs)

elapsed_hours = t.interval / 3600
print("[Training time: {:.3f} hrs]".format(elapsed_hours))

t_total value of -1 results in schedule not being applied


epoch:1/1; batch:1->16/156; loss:0.696118
epoch:1/1; batch:17->32/156; loss:0.362434
epoch:1/1; batch:33->48/156; loss:0.358708
epoch:1/1; batch:49->64/156; loss:0.350601
epoch:1/1; batch:65->80/156; loss:0.397891
epoch:1/1; batch:81->96/156; loss:0.483194
epoch:1/1; batch:97->112/156; loss:0.236887
epoch:1/1; batch:113->128/156; loss:0.300213
epoch:1/1; batch:129->144/156; loss:0.511190
epoch:1/1; batch:145->156/156; loss:0.113811
[Training time: 1.165 hrs]


## Score
We score the test set using the trained classifier:

In [82]:
# Score the preprocessed test sample with the trained classifier.
preds = classifier.predict(
    token_ids=tokens_test, input_mask=mask_test, num_gpus=NUM_GPUS, batch_size=BATCH_SIZE
)






  0%|                                                                                         | 0/5000 [00:00<?, ?it/s]




  1%|▌                                                                               | 32/5000 [00:09<23:24,  3.54it/s]




  1%|█                                                                               | 64/5000 [00:17<23:05,  3.56it/s]




  2%|█▌                                                                              | 96/5000 [00:26<22:52,  3.57it/s]




  3%|██                                                                             | 128/5000 [00:35<22:47,  3.56it/s]




  3%|██▌                                                                            | 160/5000 [00:44<22:45,  3.55it/s]




  4%|███                                                                            | 192/5000 [00:53<22:30,  3.56it/s]




  4%|███▌                                                                           | 224/5000 [01:02<22:25,  3.55it/s]




  5

 42%|████████████████████████████████▉                                             | 2112/5000 [09:52<13:47,  3.49it/s]




 43%|█████████████████████████████████▍                                            | 2144/5000 [10:01<13:36,  3.50it/s]




 44%|█████████████████████████████████▉                                            | 2176/5000 [10:10<13:22,  3.52it/s]




 44%|██████████████████████████████████▍                                           | 2208/5000 [10:20<13:23,  3.48it/s]




 45%|██████████████████████████████████▉                                           | 2240/5000 [10:29<13:06,  3.51it/s]




 45%|███████████████████████████████████▍                                          | 2272/5000 [10:38<12:57,  3.51it/s]




 46%|███████████████████████████████████▉                                          | 2304/5000 [10:47<12:46,  3.52it/s]




 47%|████████████████████████████████████▍                                         | 2336/5000 [10:56<12:40,  3.50it/s]




 47%|███

 84%|█████████████████████████████████████████████████████████████████▉            | 4224/5000 [19:48<03:38,  3.55it/s]




 85%|██████████████████████████████████████████████████████████████████▍           | 4256/5000 [19:57<03:30,  3.53it/s]




 86%|██████████████████████████████████████████████████████████████████▉           | 4288/5000 [20:06<03:22,  3.52it/s]




 86%|███████████████████████████████████████████████████████████████████▍          | 4320/5000 [20:15<03:13,  3.51it/s]




 87%|███████████████████████████████████████████████████████████████████▉          | 4352/5000 [20:24<03:05,  3.49it/s]




 88%|████████████████████████████████████████████████████████████████████▍         | 4384/5000 [20:33<02:55,  3.50it/s]




 88%|████████████████████████████████████████████████████████████████████▉         | 4416/5000 [20:42<02:44,  3.54it/s]




 89%|█████████████████████████████████████████████████████████████████████▍        | 4448/5000 [20:51<02:35,  3.56it/s]




 90%|███

## Evaluate Results
Finally, we compute the accuracy, precision, recall, and F1 metrics of the evaluation on the test set.

In [83]:
# Compare predictions with the encoded test labels.
# NOTE(review): target_names assumes encoded label 0 is the negative class and
# 1 the positive class (polarity 0/1 as assigned in load_dataset) — confirm.
print(classification_report(labels_test, preds, target_names=["negative", "positive"]))

              precision    recall  f1-score   support

    negative       0.82      0.92      0.87      2547
    positive       0.90      0.80      0.85      2453

   micro avg       0.86      0.86      0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000

