<a href="https://colab.research.google.com/github/mathluva/Udemy-BERT/blob/main/Udemy_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import dependencies
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
import random

from google.colab import drive

In [2]:
#use ! for terminal commands
!pip install bert-for-tf2 #tensorflow2 light version
!pip install sentencepiece #required for BERT-tf2

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/a5/a1/acb891630749c56901e770a34d6bac8a509a367dd74a05daf7306952e910/bert-for-tf2-0.14.9.tar.gz (41kB)
[K     |████████                        | 10kB 21.3MB/s eta 0:00:01[K     |████████████████                | 20kB 27.3MB/s eta 0:00:01[K     |███████████████████████▉        | 30kB 20.2MB/s eta 0:00:01[K     |███████████████████████████████▉| 40kB 23.5MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 6.6MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/aa/e0/4f663d8abf83c8084b75b995bd2ab3a9512ebc5b97206fde38cef906ab07/py-params-0.10.2.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... 

In [3]:
try:
    %tensorflow_version 2.x #only available in Google colab
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub #used to import the weights from BERT

from tensorflow.keras import layers
import bert #installed in previous step

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `2.x #only available in Google colab`. This will be interpreted as: `2.x`.


TensorFlow 2.x selected.


# Data preprocessing


In [4]:
#load files, data preprocessing
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
#label columns
#latin1 is common for western languages
cols = ["sentiment", "id", "date", "query", "user", "text"]
data = pd.read_csv(
    "/content/drive/MyDrive/trainingandtestdata.zip (Unzipped Files)/training.1600000.processed.noemoticon.csv", 
    header = None,
    names = cols,
    engine = "python",
    encoding = "latin1")

In [6]:
#axis1 column data
#without inplace=True, it would be required to write data = data.drop("...")
data.drop(["id", "date","query", "user"], axis = 1, inplace = True)

In [7]:
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [8]:
#cleaning
#r is regEX (regexr.com for more documentation)
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text() #tweets are not usuable as standard string, need BS to extract string
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ',tweet)#anything behind @symbol with empty space, apply to tweet
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)#? means the s can be there or not
    tweet = re.sub(r"[^a-zA-Z.!?]", ' ', tweet) #keep only standard characters
    tweet = re.sub(r" +", ' ', tweet) #replace multiple sequences of white space with only one white space
    return tweet

In [9]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [10]:
#process sentiment
data_labels = data.sentiment.values
data_labels[data_labels ==4] =1 #data is using 0 and 4, replace 4 with standard 1

# Tokenization


In [11]:
#create BERT layer to have access to metadata for the tokenizer(like vocab size).
#call BERT as a layer, hub is where all pretrained models are located
#trainable = False bc we are not fine-tuning the weights
FullTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable = False) 
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy() #way to have acces to vocab
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [12]:
def encode_sentence(sent):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))

In [13]:
data_inputs = [encode_sentence(sentence) for sentence in data_clean]

# Dataset Creation

In [14]:
#create padding for each batches versus entire corpus, add token at end of each sentence
#all input to have the same length in each batch
#shuffle bc data the tweets are organized by sentiment

data_with_len = [[sent, data_labels[i] ,len(sent)] 
                 for i, sent in enumerate(data_inputs)]
random.shuffle(data_with_len) 

#lamba, anonymous function
#data sorted by length
data_with_len.sort(key=lambda x: x[2]) 

#keep only longer sentences bc shorter sentences may not 
#convey a lot a meaning after data cleanup
sorted_all = [(sent_lab[0], sent_lab[1])
                    for sent_lab in data_with_len if sent_lab[2]>7] 

In [15]:
#need from_generator bc sentences are different lengths
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all, 
                                                output_types= (tf.int32, tf.int32))

In [16]:
#print first element
next(iter(all_dataset))

(<tf.Tensor: shape=(8,), dtype=int32, numpy=array([2204, 2021, 2025, 2004, 2204, 2004, 1045, 5113], dtype=int32)>,
 <tf.Tensor: shape=(), dtype=int32, numpy=0>)

In [17]:
#  padding
BATCH_SIZE =32
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None,  ), ()))

In [18]:
next(iter(all_batched))

(<tf.Tensor: shape=(32, 8), dtype=int32, numpy=
 array([[ 2204,  2021,  2025,  2004,  2204,  2004,  1045,  5113],
        [ 2128, 10474,  2041,  4270,  1012, 26968,  3383,  1029],
        [ 4931,  2158,  1042,  1057,   999,  1045,  5223,  2023],
        [ 4394,  2032,  2748,  1045,  2113,  2017,  2170,  2009],
        [ 2017,  2288,  1996,  9573,  2365,  1012,  1012,  1012],
        [ 2012,  1996,  3448, 11784,  2782,  2007,  2026,  3057],
        [ 3666,  2694, 12954,  2904,  1996,  9212, 20554,  2063],
        [ 2204,  2851, 13871, 13871, 13871, 13871, 13871,  2111],
        [ 2045,  2003,  2498,  2000,  2079,  2012,  2188,  1012],
        [ 2009,  1055,  1996,  9003,  3179,  1997,  5291,  6386],
        [ 3407, 10680,  2154,   999,   999,   999,   999,   999],
        [ 2748,  5003,  2572,  2009,  1055, 19461,  2051,   999],
        [ 2103,  2053,  2293,  2852,  3980,  3892,  1029,   999],
        [ 2003,  3241,  2002,  4152,  2000,  2543,  2619,  2651],
        [13442,  1045,  3335

In [19]:
#get number of batches
NB_BATCHES = math.ceil(len(sorted_all)/BATCH_SIZE)
NB_BATCHES_TEST = NB_BATCHES//10
all_batched.shuffle(NB_BATCHES)
test_dataset = all_batched.take(NB_BATCHES_TEST)
train_dataset = all_batched.skip(NB_BATCHES_TEST)

In [20]:
class DCNN(tf.keras.Model):

    def __init__(self, 
                 vocab_size,
                 emb_dim = 128, 
                 nb_filters = 50,
                 FFN_units = 512,
                 nb_classes = 2, 
                 dropout_rate = 0.1,
                 training = False,
                 name = 'dcnn'):
        
        super(DCNN, self).__init__(name = name)

        #creating layers used in the model, embedded
        self.embedding = layers.Embedding(vocab_size, emb_dim)

        self.bigram = layers.Conv1D(filters = nb_filters, 
                                               kernel_size =2, 
                                                padding = 'valid', 
                                                activation = 'relu')
        
        self.trigram = layers.Conv1D(filters = nb_filters, 
                                               kernel_size =3, 
                                                padding = 'valid', 
                                                activation = 'relu')

        self.fourgram = layers.Conv1D(filters = nb_filters, 
                                               kernel_size =4, 
                                                padding = 'valid', 
                                                activation = 'relu')
        
        self.pool = layers.GlobalMaxPooling1D()

        self.dense_1= layers.Dense(units = FFN_units,
                                    activation = 'relu')
        
        self.dropout = layers.Dropout(rate = dropout_rate)
        if nb_classes:
            self.last_dense = layers.Dense(units = 1, activation = 'sigmoid')

        else: 
            self.last_dense = layers.Dense(units = nb_classes, activation = 'softmax')

    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool(x_1)
        x_2 = self.bigram(x)
        x_2 = self.pool(x_2) 
        x_3 = self.bigram(x)
        x_3 = self.pool(x_3) #(batch_size, nb_filters)

        merged = tf.concat([x_1, x_2, x_3], axis =-1)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)

        return output

# Training


In [21]:
VOCAB_SIZE =    len(tokenizer.vocab)
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [22]:
Dcnn = DCNN(vocab_size = VOCAB_SIZE,
                    emb_dim =   EMB_DIM,
            nb_filters = NB_FILTERS, 
            FFN_units = FFN_UNITS,
            nb_classes = NB_CLASSES,
            dropout_rate = DROPOUT_RATE)
            

In [23]:
if NB_CLASSES ==2:
    Dcnn.compile(loss = 'binary_crossentropy',
                    optimizer = 'adam',
                 metrics = ['accuracy'])

else:
    Dcnn.compile(loss = "sparse_categorical_crossentropy",
                        optimizer = 'adam',
                  metrics = ['sparse_categorical_crossentropy'])

In [24]:
checkpoint_path = "/content/drive/MyDrive/Udemy"

ckpt = tf.train.Checkpoint(Dcnn = Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep =1)

if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print('Latest checkpoint has been restored')

In [25]:
#custom callback, execute during training
class MyCustomCallback(tf.keras.callbacks.Callback):

    def on_epoch_end(self, epoch, logs = None):
        ckpt_manager.save()
        print("Checkpoint saved at {}.".format(checkpoint_path))

In [26]:
Dcnn.fit(train_dataset,
         epochs = NB_EPOCHS,
         callbacks = [MyCustomCallback()])

Epoch 1/5
Checkpoint saved at /content/drive/MyDrive/Udemy.
Epoch 2/5
Checkpoint saved at /content/drive/MyDrive/Udemy.
Epoch 3/5
Checkpoint saved at /content/drive/MyDrive/Udemy.
Epoch 4/5
Checkpoint saved at /content/drive/MyDrive/Udemy.
Epoch 5/5
Checkpoint saved at /content/drive/MyDrive/Udemy.


<tensorflow.python.keras.callbacks.History at 0x7f6a6f532f10>

Evaluation


In [27]:
results = Dcnn.evaluate(test_dataset)
print(results)

[0.41371220350265503, 0.8236710429191589]


In [28]:
#training accuracy .8718
#testing accuracy .8237

def get_prediction(sentence):
    tokens = encode_sentence(sentence)
    inputs = tf.expand_dims(tokens, 0)
    output = Dcnn(inputs, training = False)
    sentiment = math.floor(output*2)
    if sentiment==0:
        print("Output of model: {} \nPredicted sentiment: negative.".format(output))
    elif sentiment==1:
        print("Output of model: {} \nPredicted sentiment: positive.".format(output))

In [29]:
get_prediction("I love this course.")

Output of model: [[0.8760564]] 
Predicted sentiment: positive.


In [30]:
get_prediction("The training takes entirely too long!")

Output of model: [[0.11303671]] 
Predicted sentiment: negative.
