<a href="https://colab.research.google.com/github/ordovas/test_and_learn/blob/main/Bert_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stage 1: Importing dependencies

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

#from google.colab import drive

In [2]:
#!pip install bert-for-tf2
#!pip install sentencepiece

In [3]:
!pip install --upgrade tensorflow-hub
!pip install --upgrade tensorflow-estimator==2.1.0

Requirement already up-to-date: tensorflow-hub in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (0.12.0)
Requirement already up-to-date: tensorflow-estimator==2.1.0 in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (2.1.0)


In [4]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf



In [5]:

import tensorflow_hub as hub

In [6]:

from tensorflow.keras import layers
import bert

# Stage 2: Data preprocessing

## Loading files

We import files from our personal Google drive.

In [7]:
#drive.mount("/content/drive")

In [8]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv(
    "training.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)

In [9]:
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

In [10]:
data.head(5)

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## Preprocessing

### Cleaning

In [11]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Delete the @
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    # Delete URL links
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Just keep letters and important punctuation
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Remove additional spaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [12]:
!pip install lxml



In [14]:
len(data.text)

1600000

In [18]:
data_clean = [ ]
for i,tweet in enumerate(data.text):
    data_clean.append(clean_tweet(tweet))
    if i%5000==0:
        print(i,"out of 1600000 - ", round(100*(i/1600000),2),"%")

0 out of 1600000 -  0.0 %
5000 out of 1600000 -  0.31 %
10000 out of 1600000 -  0.62 %
15000 out of 1600000 -  0.94 %
20000 out of 1600000 -  1.25 %
25000 out of 1600000 -  1.56 %
30000 out of 1600000 -  1.88 %
35000 out of 1600000 -  2.19 %
40000 out of 1600000 -  2.5 %
45000 out of 1600000 -  2.81 %
50000 out of 1600000 -  3.12 %
55000 out of 1600000 -  3.44 %
60000 out of 1600000 -  3.75 %
65000 out of 1600000 -  4.06 %
70000 out of 1600000 -  4.38 %
75000 out of 1600000 -  4.69 %
80000 out of 1600000 -  5.0 %
85000 out of 1600000 -  5.31 %
90000 out of 1600000 -  5.62 %
95000 out of 1600000 -  5.94 %
100000 out of 1600000 -  6.25 %
105000 out of 1600000 -  6.56 %
110000 out of 1600000 -  6.88 %
115000 out of 1600000 -  7.19 %
120000 out of 1600000 -  7.5 %
125000 out of 1600000 -  7.81 %
130000 out of 1600000 -  8.12 %
135000 out of 1600000 -  8.44 %
140000 out of 1600000 -  8.75 %
145000 out of 1600000 -  9.06 %
150000 out of 1600000 -  9.38 %
155000 out of 1600000 -  9.69 %
16000

In [19]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

### Tokenization

We need to create a BERT layer to have access to meta data for the tokenizer (like vocab size).

In [20]:
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [21]:
def encode_sentence(sent):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [22]:
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

### Dataset creation

We will create padded batches (so we pad sentences for each batch inpedendently), this way we add the minimum of padding tokens possible. For that, we sort sentences by length, apply padded_batches and then shuffle.

In [23]:
data_with_len = [[sent, data_labels[i], len(sent)]
                 for i, sent in enumerate(data_inputs)]
random.shuffle(data_with_len)
data_with_len.sort(key=lambda x: x[2])
sorted_all = [(sent_lab[0], sent_lab[1])
              for sent_lab in data_with_len if sent_lab[2] > 7]  #in vid 18 it uses >2

In [24]:
# A list is a type of iterator so it can be used as generator for a dataset
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [57]:
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=
 array([ 1045,  4299,  2009,  2347,  1005,  1056, 16373,  1012],
       dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=0>)

In [26]:
BATCH_SIZE = 32
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()) )

In [27]:
next(iter(all_batched))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 1045,  4299,  2009,  2347,  1005,  1056, 16373,  1012],
        [10364,  6861,  2035,  2058,  2010,  9019,  1012, 26316],
        [ 2040,  1029,  2129,  1029,  2009,  2035,  2047,   999],
        [ 2205,  2919,  2057,  2031,  2000,  3524,  2019,  3178],
        [ 6842,  2003, 15729,  2033,  2039,  2023,  2851,  1012],
        [ 1045,  1005,  1049,  2182,  2049,  2980,  2004,  3109],
        [ 1045,  4060,  2026,  4451,  2205,  2172,  1012,  1012],
        [ 3398,  2074,  2318,  1996,  8065,  2023,  2095,  1012],
        [10506,  5292,  3270,  3270,  3270,  1045,  2097,  2085],
        [ 1048,  3398,  2034,  2048,  2086,  2053, 23605,  2395],
        [ 2293,  2035,  1996,  8403,  2111,  2012,  2147,   999],
        [ 2178,  5353,  2985,  2012,  2188,  2894,  4718,  2140],
        [12392,  2906,  1045,  2572,  7653,  1012,  4067,  2017],
        [ 1045,  2018,  1996,  5409,  3637,  2412,   999,   999],
        [ 1045,  2633,  2288

In [28]:
NB_BATCHES = math.ceil(len(sorted_all) / BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES // 10
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

In [58]:
NB_BATCHES, NB_BATCHES_TEST

(41328, 4132)

TypeError: 'PaddedBatchDataset' object is not subscriptable

# Stage 3: Model building

In [29]:
class DCNN(tf.keras.Model):
    
    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x) # (batch_size, nb_filters, seq_len-1)
        x_1 = self.pool(x_1) # (batch_size, nb_filters)
        x_2 = self.trigram(x) # (batch_size, nb_filters, seq_len-2)
        x_2 = self.pool(x_2) # (batch_size, nb_filters)
        x_3 = self.fourgram(x) # (batch_size, nb_filters, seq_len-3)
        x_3 = self.pool(x_3) # (batch_size, nb_filters)
        
        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)
        
        return output

# Stage 4: Training

In [30]:
VOCAB_SIZE = len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [31]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [32]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [33]:
checkpoint_path = "./drive/MyDrive/projects/BERT/ckpt_bert_tok/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest Checkpoint restored!")

In [34]:
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [35]:
Dcnn.fit(train_dataset,
         epochs=NB_EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
  37196/Unknown - 2172s 58ms/step - loss: 0.4297 - accuracy: 0.8021Checkpoint saved at ./drive/MyDrive/projects/BERT/ckpt_bert_tok/.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f0e4a8044a8>

# Stage 5: Evaluation

In [36]:
results = Dcnn.evaluate(test_dataset)
print(results)

   4132/Unknown - 37s 9ms/step - loss: 0.4357 - accuracy: 0.8340[0.4356998347344611, 0.83400136]


In [40]:
def get_prediction(sentence):
    tokens = encode_sentence(sentence)
    inputs = tf.expand_dims(tokens, 0)
    print(inputs)

    output = Dcnn(inputs, training=False)
    print(output)

    sentiment = math.floor(output*2)

    if sentiment == 0:
        print("Output of the model: {}\nPredicted sentiment: negative.".format(
            output))
    elif sentiment == 1:
        print("Output of the model: {}\nPredicted sentiment: positive.".format(
            output))

In [41]:
get_prediction("This movie was pretty interesting.")

tf.Tensor([[2023 3185 2001 3492 5875 1012]], shape=(1, 6), dtype=int32)
tf.Tensor([[0.9992982]], shape=(1, 1), dtype=float32)
Output of the model: [[0.9992982]]
Predicted sentiment: positive.


In [44]:
get_prediction("I'd rather not do that again.")

tf.Tensor([[1045 1005 1040 2738 2025 2079 2008 2153 1012]], shape=(1, 9), dtype=int32)
tf.Tensor([[0.1349157]], shape=(1, 1), dtype=float32)
Output of the model: [[0.1349157]]
Predicted sentiment: negative.


In [54]:
text_sample="""
How many roads must a man walk down
Before you call him a man?
How many seas must a white dove sail
Before she sleeps in the sand?
Yes, and how many times must the cannonballs fly
Before they're forever banned?
The answer, my friend, is blowin' in the wind
The answer is blowin' in the wind
"""

get_prediction(text_sample)

tf.Tensor(
[[ 2129  2116  4925  2442  1037  2158  3328  2091  2077  2017  2655  2032
   1037  2158  1029  2129  2116 11915  2442  1037  2317 10855  9498  2077
   2016 25126  1999  1996  5472  1029  2748  1010  1998  2129  2116  2335
   2442  1996  8854 18510  4875  2077  2027  1005  2128  5091  7917  1029
   1996  3437  1010  2026  2767  1010  2003  6271  2378  1005  1999  1996
   3612  1996  3437  2003  6271  2378  1005  1999  1996  3612]], shape=(1, 70), dtype=int32)
tf.Tensor([[0.10389104]], shape=(1, 1), dtype=float32)
Output of the model: [[0.10389104]]
Predicted sentiment: negative.


In [55]:
text_sample="""
Here comes the sun do, do, do
Here comes the sun
And I say it's all right
Little darling, it's been a long cold lonely winter
Little darling, it seems like years since it's been here
Here comes the sun do, do, do
Here comes the sun
And I say it's all right
Little darling, the smiles returning to the faces
Little darling, it feels like years since it's been here
Here comes the sun do, do, do
Here comes the sun
And I say it's all right
Little darling, I feel that ice is slowly melting
Little darling, it seems like years since it's been clear
Here comes the sun do, do, do
Here comes the sun
And I say it's all right
Here comes the sun do, do, do
Here comes the sun
And I say it's all right
"""

get_prediction(text_sample)

tf.Tensor(
[[ 2182  3310  1996  3103  2079  1010  2079  1010  2079  2182  3310  1996
   3103  1998  1045  2360  2009  1005  1055  2035  2157  2210  9548  1010
   2009  1005  1055  2042  1037  2146  3147  9479  3467  2210  9548  1010
   2009  3849  2066  2086  2144  2009  1005  1055  2042  2182  2182  3310
   1996  3103  2079  1010  2079  1010  2079  2182  3310  1996  3103  1998
   1045  2360  2009  1005  1055  2035  2157  2210  9548  1010  1996  8451
   4192  2000  1996  5344  2210  9548  1010  2009  5683  2066  2086  2144
   2009  1005  1055  2042  2182  2182  3310  1996  3103  2079  1010  2079
   1010  2079  2182  3310  1996  3103  1998  1045  2360  2009  1005  1055
   2035  2157  2210  9548  1010  1045  2514  2008  3256  2003  3254 13721
   2210  9548  1010  2009  3849  2066  2086  2144  2009  1005  1055  2042
   3154  2182  3310  1996  3103  2079  1010  2079  1010  2079  2182  3310
   1996  3103  1998  1045  2360  2009  1005  1055  2035  2157  2182  3310
   1996  3103  2079  1010  