## Sentiment analysis with BERT

In [2]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

In [5]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

In [7]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv("d:\\Temp\\training.1600000.processed.noemoticon.csv", 
                   header=None, names=cols, engine="python", encoding="latin1"
)

In [8]:
data.drop(["id", "date", "query", "user"], axis=1, inplace=True)

In [9]:
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [15]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text() # clear LXML
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet) # clear all refs
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    tweet = re.sub(r"[^a-zA-Z.!?]", ' ', tweet)
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [16]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [17]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

### Tokenization

In [19]:
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [72]:
def encode_sentence(sent):
    return ["[CLS]"] + tokenizer.tokenize(sent) + ["[SEP]"]

In [73]:
data_inputs = [encode_sentence(sent) for sent in data_clean]

In [21]:
tokenizer.tokenize("My dog loves strawberries.")

['my', 'dog', 'loves', 'straw', '##berries', '.']

In [23]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("My dog loves strawberries."))

[2026, 3899, 7459, 13137, 20968, 1012]

In [24]:
def encode_sentence(sent):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [26]:
data_inputs = [encode_sentence(s) for s in data_clean]

### Dataset creation

In [30]:
data_with_len = [[sent, data_labels[i], len(sent)] for i, sent in enumerate(data_inputs)]

In [31]:
random.shuffle(data_with_len)

In [32]:
data_with_len.sort(key = lambda x : x[2]) # sort by length

In [34]:
sorted_all = [(sent_lab[0], sent_lab[1]) for sent_lab in data_with_len if sent_lab[2] > 7]

In [74]:
def get_ids(tokens):
    return tokenizer.convert_tokens_to_idskens_to_ids(tokens)

def get_mask(tokens):
    return np.char.not_equal(tokens, "[PAD]").astype(int)

def get_segments(tokens):
    seg_ids = []
    current_seg_id = 0
    for t in tokens:
        seg_ids.append(current_seg_id)
        if tok == "[SEP]":
            current_current_seg_idid = 1-current_seg_id
    return seg_ids

In [36]:
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all, output_types=(tf.int32, tf.int32))

In [37]:
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=array([2339, 2515, 2002, 2191, 2033, 2061, 3407, 1029])>,
 <tf.Tensor: shape=(), dtype=int32, numpy=1>)

In [39]:
BATCH_SIZE=32
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [41]:
next(iter(all_batched))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 2339,  2515,  2002,  2191,  2033,  2061,  3407,  1029],
        [26478,  8609,  2015,  2006,  2115, 16568,  2663,   999],
        [ 4658,  1012,  5580,  2057,  2071,  2022,  1997,  2393],
        [ 1053, 28765, 14841,  3211,  3347,  2059, 14175,  2252],
        [ 2975,  2005,  2147,  2574,  1012,  1012,  1012,  1012],
        [ 2821,  2158,  1045,  2514,  2066,  5996,  2157,  2085],
        [ 1045,  2031,  1059, 16584, 27571,  2035,  7840,  2033],
        [ 2383,  1037,  2645,  2007,  1996, 15041,  2919,  2335],
        [ 2667,  2000,  8980,  1012,  2067,  2000,  2147,  4826],
        [10166,  2008,  2015,  2428,  1012,  1012,  1012,  8680],
        [27571,  7716,  7630,  3600,  2115, 12171,  6429, 13871],
        [ 2067,  2013,  1996,  2009, 11360,  1012, 26304,  2009],
        [ 2292,  2026,  2166,  3385,  2290,  6170,  2000,  2017],
        [11082,  3246,  1052,  7274,  2097,  2393,  2149,  2574],
        [ 2003,  3110,  2074

In [42]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

## Model Building

In [60]:
class DCNN(tf.keras.Model):
    def __init__(self, vocab_size, emb_dim=128, nb_filters=50,
                 FFN_units=512, nb_classes=2, dropout_rate=0.1, training=False, name="dcnn"):
        super(DCNN, self).__init__(name=name)
        self.embedding = layers.Embedding(vocab_size, emb_dim)
        
        # here we are shifting filter in just 1D, as it makes no sense to do 2D for text
        self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, padding="valid", activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters, kernel_size=3, padding="valid", activation="relu")  
        self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size=4, padding="valid", activation="relu")  
        
        self.pool = layers.GlobalAveragePooling1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1, activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes, activation="softmax")
            
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool(x_3)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3*nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output
        

## Training

In [46]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [61]:
dcnn = DCNN(vocab_size=VOCAB_SIZE, emb_dim=EMB_DIM, nb_filters=NB_FILTERS, 
            FFN_units=FFN_UNITS, nb_classes=NB_CLASSES, dropout_rate=DROPOUT_RATE)

In [62]:
if NB_CLASSES == 2:
    dcnn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
else:
    dcnn.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["sparse_categorical_accuracy"])

In [63]:
checkpoint_path = "ckpt_bert_tok"
ckpt = tf.train.Checkpoint(DCNN=dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint has been resored")

In [64]:
class SaveCheckpointOnEpochEndCallback(tf.keras.callbacks.Callback):
    
    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("\nCheckpoint saved at {}".format(checkpoint_path))

In [65]:
dcnn.fit(train_dataset, epochs=NB_EPOCHS, callbacks=[SaveCheckpointOnEpochEndCallback()])

Epoch 1/5
  36913/Unknown - 426s 11ms/step - loss: 0.4277 - accuracy: 0.8018Checkpoint saved at ckpt_bert_tok
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2b35f1961d0>

## Evaluation

In [67]:
results = dcnn.evaluate(test_dataset)
print(results)

[0.93369060754776, 0.8184893727302551]


In [69]:
def get_prediction(sentense):
    tokens = encode_sentence(sentense)
    inputs = tf.expand_dims(tokens, 0)
    output = dcnn(inputs, training=False)
    sentiment = math.floor(output*2)
    
    if sentiment == 0:
        print("Output of the model: {}\nPredicted sentiment: negative".format(output))
    else:
        print("Output of the model: {}\nPredicted sentiment: positive".format(output))

In [70]:
get_prediction("I would rather not do it again!")

Output of the model: [[0.00115073]]
Predicted sentiment: negative


In [71]:
get_prediction("This movie was pretty interesting")

Output of the model: [[0.9999993]]
Predicted sentiment: positive
