# AMAZON REVIEW CLASSIFICATION

In this notebook we implement a review classification model based on BERT. Pretrained BERT models are available
as TF Hub modules and can also be loaded through the Hugging Face library.
In this notebook we use the Hugging Face library.

The dataset can be downloaded using this link https://www.kaggle.com/bittlingmayer/amazonreviews

In [None]:
#importing relevant libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import *
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import  Model
from tensorflow.keras.layers import Dense, Input, Dropout
from sklearn.utils import shuffle
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler

In [None]:
# Length of every encoded (head, body) pair; used as the tokenizer's max_length
# and as the size of the model's input layers.
MAX_SEQUENCE_LENGTH = 300
# Batch sizes handed to the train and test data generators.
TRAIN_BATCH_SIZE = 16
TEST_BATCH_SIZE = 64
truncation_strategy = 'longest_first' #the tokenizer truncates the longer sentence first
# Tokenizer loaded from the pretrained 'bert-base-uncased' checkpoint.
Tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The dataset consists of 4 million reviews: 90% of them are used for training and the rest for testing.

In [None]:
# Read both raw review files into lists of lines (line endings are kept).
with open('./train', mode='r', encoding="utf8") as in_file:
    train = list(in_file)

with open('./test', mode='r', encoding="utf8") as in_file:
    test = list(in_file)

In [None]:
#helper functions to prepare the data

def split_from_label(line):
    """Return the part of ``line`` that follows its first space.

    The first space-delimited token (the label prefix) is dropped; a line
    without any space yields an empty string.
    """
    _label, _separator, remainder = line.partition(' ')
    return remainder

def map_label(row):
    """Map a raw label token to a binary class id.

    ``'__label__1'`` becomes 0; every other value becomes 1.
    """
    return 0 if row == '__label__1' else 1

Each record in the dataset is composed of three parts.

1 - The label, in the form `__label__1` or `__label__2` (1 for negative reviews, 2 for positive reviews)

2 - the head of the review ending with :

3 - the body of the review

The following function takes the list of lines read from the text file and returns a data frame with a separate column for each part

In [None]:
def prepare_dataset(text_file):
    """Turn raw review lines into a shuffled DataFrame of head, body and label.

    Each line is expected to look like ``__label__N <head>: <body>``.

    Args:
        text_file: List of raw lines, e.g. the result of ``readlines()``.

    Returns:
        A row-shuffled DataFrame with the columns ``head``, ``body`` and
        ``label``, in that order. ``label`` holds the value returned by
        ``map_label`` for the ``__label__N`` token found before the first
        colon. ``body`` is an empty string for lines that contain no colon.
    """
    df = pd.DataFrame(text_file, columns=['original_text'])

    # Split once, and only at the FIRST colon: splitting at every colon and
    # taking element 1 would cut off any body that itself contains a ':'.
    parts = df['original_text'].str.split(':', n=1)
    labelled_head = parts.str.get(0)

    df['head'] = labelled_head.map(split_from_label).astype('str')
    # Lines without a colon have no body; use '' rather than the string 'nan'.
    df['body'] = parts.str.get(1).fillna('').astype('str')
    df['label'] = (
        labelled_head.str.extract(r'(__label__[1-2])', expand=False)
        .astype('str')
        .map(map_label)
    )

    # `columns=` instead of a positional axis, which pandas 2.0 no longer accepts.
    df = df.drop(columns='original_text')
    return shuffle(df)

# Parse the raw train/test lines into shuffled DataFrames (head, body, label).
train_dataset = prepare_dataset(train)
test_dataset = prepare_dataset(test)

The BERT tokenizer has its own way of tokenizing: it takes a sentence, and optionally a second sentence, and converts them into an input suitable for the BERT model.

The output of the tokenizer's encoding method is a dictionary that contains the following:

1 - the indices of the tokens of the two sentences, separated by the [SEP] token and preceded by the index of the [CLS] token (when the 'add_special_tokens' argument is set to True)

2 - binary array indicating the non padded part of the sequence needed for attention (1 for non padded and 0 for padded)

3 - another binary array indicating which part of the total sequence belongs to sentence 1 and which part belongs to sentence 2

In [None]:
#function to convert text into suitable Bert input form
def get_model_inputs(str1, str2, _truncation_strategy, length, tokenizer, pad_seq = True):
    """Encode a pair of texts with ``tokenizer.encode_plus``.

    Args:
        str1: First text of the pair.
        str2: Second text of the pair.
        _truncation_strategy: Forwarded as ``truncation_strategy``.
        length: Forwarded as ``max_length``.
        tokenizer: Object providing an ``encode_plus`` method.
        pad_seq: Forwarded as ``pad_to_max_length``.

    Returns:
        A list ``[input_ids, attention_mask, token_type_ids]`` taken from the
        dictionary returned by ``encode_plus``.
    """
    encoded = tokenizer.encode_plus(
        str1,
        str2,
        add_special_tokens=True,
        max_length=length,
        truncation_strategy=_truncation_strategy,
        pad_to_max_length=pad_seq,
    )
    wanted_keys = ("input_ids", "attention_mask", "token_type_ids")
    return [encoded[key] for key in wanted_keys]

In [None]:
#custom data generator to feed the data to the model batch by batch
def datagen(dataset, batch_size):
    """Endlessly yield tokenized batches from ``dataset``.

    Every pass walks over all full batches of the dataset in order; rows left
    over after the last full batch are skipped, which matches a
    ``len(dataset) // batch_size`` step count.

    Args:
        dataset: DataFrame whose first two columns hold the two texts to
            encode and whose third column holds the integer label.
        batch_size: Number of rows per yielded batch.

    Yields:
        ``([input_ids, attention_masks, segment_ids], labels)`` where each
        element is an ``int32`` NumPy array with ``batch_size`` rows.

    Raises:
        ValueError: If ``dataset`` has fewer than ``batch_size`` rows, since
            no full batch could ever be produced.
    """
    data = dataset.copy()
    n_batches = len(data) // batch_size
    if n_batches == 0:
        raise ValueError(
            'dataset has %d rows, fewer than batch_size=%d' % (len(data), batch_size)
        )

    while True:
        # Walk over every batch; a fixed range(1) would replay only the first
        # batch forever.
        for batch_idx in range(n_batches):
            start = batch_idx * batch_size
            end = start + batch_size
            batch_x = data.iloc[start:end, [0, 1]].values
            batch_y = data.iloc[start:end, 2].values

            inputs_x_id, inputs_x_mask, inputs_x_segment = [], [], []
            for first_text, second_text in batch_x:
                input_ids, input_masks, input_segments = get_model_inputs(
                    first_text,
                    second_text,
                    truncation_strategy,
                    MAX_SEQUENCE_LENGTH,
                    Tokenizer,
                )
                inputs_x_id.append(input_ids)
                inputs_x_mask.append(input_masks)
                inputs_x_segment.append(input_segments)

            yield ([np.array(inputs_x_id, dtype=np.int32),
                    np.array(inputs_x_mask, dtype=np.int32),
                    np.array(inputs_x_segment, dtype=np.int32)],
                   np.array(batch_y, dtype=np.int32))

# Clear the Keras backend session before the generators and the model are created.
K.clear_session()
# One batch generator per split, each with its own batch size.
train_data_generator = datagen(train_dataset, TRAIN_BATCH_SIZE)
test_data_generator = datagen(test_dataset, TEST_BATCH_SIZE)

The model takes the last layer's output hidden states for all tokens, average-pools them, and feeds the result to a dense layer.

You can edit the configuration and then pass the edited configuration object to the TFBertModel.

In [None]:
def create_model():
    """Build the BERT-based binary review classifier.

    The model has three integer inputs of length ``MAX_SEQUENCE_LENGTH``
    (token ids, attention mask, token type ids). They are passed through the
    pretrained ``bert-base-uncased`` model, whose first output is averaged
    over the sequence axis and fed through Dense(128, relu), Dropout(0.2) and
    a single sigmoid unit.

    Returns:
        An uncompiled ``tf.keras`` Model with inputs
        ``[input_id, input_mask, input_atn]`` and one sigmoid output.
    """
    input_id = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    input_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    # Despite its name, this input is fed to BERT as `token_type_ids`.
    input_atn = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32)

    config = BertConfig()
    config.output_hidden_states = False # Set to True to obtain hidden states

    bert_model = TFBertModel.from_pretrained('bert-base-uncased', config=config)

    # if config.output_hidden_states = True, obtain hidden states via bert_model(...)[-1]
    input_embedding = bert_model(input_id, attention_mask=input_mask, token_type_ids=input_atn)[0]

    # Get average tokens output
    # NOTE(review): no mask is passed here, so padded positions appear to be
    # included in the average - confirm this is intended.
    tokens_embedding = tf.keras.layers.GlobalAveragePooling1D()(input_embedding)

    # Use the pooled tensor defined above (it was referenced under a misspelled name).
    x = tf.keras.layers.Dense(128, activation='relu')(tokens_embedding)

    x = tf.keras.layers.Dropout(0.2)(x)

    x = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=[input_id, input_mask, input_atn], outputs=x)

    return model

# Build the classifier once; it is compiled and trained under the name `bert`.
bert = create_model()

In [None]:
#loss function (modified binary cross entropy loss function which gives higher attention to misclassified examples)
def focal_loss(y_true, y_pred, gamma=2., alpha=.25):
    """Binary focal loss, summed over all elements.

    Args:
        y_true: Tensor of 0/1 targets.
        y_pred: Tensor of predicted probabilities.
        gamma: Exponent of the modulating factor; larger values shrink the
            contribution of well-classified examples more strongly.
        alpha: Weight of the positive-class term; the negative-class term is
            weighted by ``1 - alpha``.

    Returns:
        Scalar tensor: the sum of the positive-class and negative-class terms.
    """
    # Keep probabilities strictly inside (0, 1). A saturated sigmoid output of
    # exactly 0 or 1 would otherwise reach log(0) and make the loss inf/NaN.
    eps = K.epsilon()
    y_pred = K.clip(y_pred, eps, 1. - eps)

    # Where the target is not the class of interest, substitute the value that
    # makes that term vanish (log(1) == 0 and 0 ** gamma == 0).
    pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
    pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

    positive_term = K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))
    negative_term = K.sum((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
    return -positive_term - negative_term

In [None]:
# Stop training once validation accuracy has not improved for 30 epochs.
early_stopping = EarlyStopping(monitor= 'val_acc', 
                               mode = 'max',
                               patience=30, 
                               verbose=1)

# Save the model only when validation accuracy reaches a new best value.
model_checkpoint = ModelCheckpoint('BERT_MODEL_AMAZON_REVIEW_CLASSIFIER',
                                   monitor = 'val_acc', 
                                   mode = 'max', 
                                   save_best_only=True, 
                                   verbose=1)

# Multiply the learning rate by 0.2 after 4 epochs without improvement in
# validation accuracy, down to a floor of 1e-7.
reduce_lr = ReduceLROnPlateau(monitor='val_acc', 
                              mode = 'max',
                              factor=0.2, 
                              patience=4, 
                              min_lr=0.0000001, 
                              verbose=1)
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
opt = Adam(learning_rate = 0.0005)
# The metric is registered as 'acc', so the callbacks above monitor 'val_acc'.
bert.compile(loss = focal_loss, optimizer= opt, metrics=['acc'])

In [None]:
#train the model
# Model.fit accepts Python generators directly; fit_generator is deprecated.
# Both generators are endless, so the step counts define one epoch / one
# validation pass.
history = bert.fit(train_data_generator,
                   validation_data=test_data_generator,
                   steps_per_epoch = len(train_dataset)//TRAIN_BATCH_SIZE,
                   validation_steps = len(test_dataset)//TEST_BATCH_SIZE,
                   epochs = 200,
                   callbacks = [early_stopping, model_checkpoint, reduce_lr])