## Data preparation

In [2]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
import numpy as np

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [2]:
# Download the IMDB review archive and unpack it (untar=True). cache_dir='.'
# with an empty cache_subdir places everything in the working directory.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

# The unpacked data is looked up in an 'aclImdb' folder located next to the
# path returned by get_file.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


['README', 'test', 'imdb.vocab', 'train', 'imdbEr.txt']

In [3]:
# Training split lives in <dataset_dir>/train; list it to inspect its contents.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['unsup',
 'labeledBow.feat',
 'urls_pos.txt',
 'neg',
 'pos',
 'urls_unsup.txt',
 'urls_neg.txt',
 'unsupBow.feat']

In [4]:
# Remove the 'unsup' folder so that only the 'neg' and 'pos' class folders are
# picked up when the labeled dataset is built from train_dir.
# The existence check keeps this cell re-runnable: a bare rmtree raises
# FileNotFoundError once the folder has already been deleted.
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [5]:
batch_size = 1024
seed = 12345

# Build the 80/20 train/validation split from the labeled training folder.
# Both calls use the same seed and validation_split so the two subsets are
# complementary. train_dir is reused instead of repeating the path literal so
# the cell keeps working if the download location changes.
train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size,
    validation_split=0.2,
    subset='training', seed=seed)
val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size,
    validation_split=0.2,
    subset='validation', seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [6]:
vocab_size   = 20000  # passed as max_tokens to the TextVectorization layer
sequence_len = 200    # passed as output_sequence_length (ids per review)

def custom_standardization(input_data):
    """Normalize raw review text before tokenization.

    Lowercases the input, replaces every ``<br />`` tag with a single space
    and deletes all characters listed in ``string.punctuation``.

    Args:
        input_data: string tensor holding the raw text.

    Returns:
        String tensor with the cleaned text.
    """
    # Character class matching any single ASCII punctuation mark.
    punctuation_class = f"[{re.escape(string.punctuation)}]"

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    return tf.strings.regex_replace(text, punctuation_class, "")

# Text -> integer-id layer: applies custom_standardization, keeps at most
# vocab_size tokens and emits exactly sequence_len ids per input string.
vectorization = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_len,
)

# Learn the vocabulary from the training texts only (labels are dropped).
vectorization.adapt(train_ds.map(lambda text, label: text))

In [7]:
def vectorize_text(text, label):
    """Turn a batch of raw strings into token ids, passing the label through.

    Args:
        text: string tensor of reviews.
        label: label tensor belonging to ``text``; returned unchanged.

    Returns:
        Tuple ``(token_ids, label)``.
    """
    # Append a trailing axis before handing the strings to the layer.
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorization(text_column)
    return token_ids, label

# Replace the raw-text datasets by their vectorized versions.
# NOTE: train_ds / val_ds are rebound here, so re-running this cell would apply
# vectorize_text to already vectorized data; re-create the datasets first.
train_ds = train_ds.map(vectorize_text)
val_ds = val_ds.map(vectorize_text)

In [8]:
# Show the label and the token ids of the first review in the first batch.
for text_batch, label_batch in train_ds.take(1):
    first_label = label_batch[0].numpy()
    first_tokens = text_batch.numpy()[0]
    print(first_label)
    print(first_tokens)

1
[ 2340  4688   447     1     2     1  7756  4688   674     1     2     1
   167     1     5  5953     3  2730  2050  3250    14 16623    14  7824
 17408   510  2886     5 16708     1   967 15850 10635    31     2  1413
     5    29     1 15602  2177   293    33 12974  3768     5   656    35
     2  1947  3275  1753  1719     6   938    11  7175   111   619     6
    65 14406     2  3958  9106     5     2 10635    24  1264   203  1305
     8  4970  7573  3367     2   676     5   253   230     6  7948   154
 17408   379  1440   350     5  1041 16824     3  5459  1619   450     2
   278  2482    16  4863  3678   982  2289     3  6520  9790   742  3863
   881   300     6     2  7856 15594     5   619   846     3 11474   230
  5838     5     1 18551   450     2  1988  9394     2  1674     5     2
   223     6    29    85  5080 15602 17921    30     1    20    23   904
    35     2   295  9632     5  3604  9614  3795     8  1831  2411     6
   467     1  1152    16    65 18858     0     0 

In [9]:
# Cache the vectorized batches after their first pass and prefetch up to 10
# batches so input preparation can overlap with training.
train_ds = train_ds.cache().prefetch(buffer_size=10)
val_ds = val_ds.cache().prefetch(buffer_size=10)

## Training

In [10]:
from tensorflow.keras import layers
from tensorflow import keras

class TransformerBlock(layers.Layer):
    """Transformer encoder block.

    Applies multi-head self-attention followed by a two-layer feed-forward
    network. Each sub-layer is followed by dropout, a residual connection and
    layer normalization.

    Args:
        embed_dim: size of the last axis of the input and output; also passed
            as ``key_dim`` to the attention layer.
        num_heads: number of attention heads.
        ff_dim: width of the hidden feed-forward layer.
        rate: dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: forwarded to ``layers.Layer`` (e.g. ``name``, ``dtype``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``; ``training`` toggles the dropout layers."""
        # Self-attention: the same tensor is used as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and cloned."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [11]:
# Two separate embedding layers: one for tokens, one for token positions.

class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: number of positions the position embedding table holds.
        vocab_size: number of rows in the token embedding table.
        embed_dim: size of each embedding vector.
        **kwargs: forwarded to ``layers.Layer`` (e.g. ``name``, ``dtype``).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce the layer.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position index."""
        # Positions 0..seq_len-1 are derived from the last axis of the input.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be saved and cloned."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [12]:
# Hyper-parameters of the Transformer classifier.
embed_dim = 128  # Embedding size for each token
num_heads = 6    # Number of attention heads
ff_dim = 128     # Hidden layer size in feed forward network inside transformer

# Layers are created once up front and then wired together below.
embedding_layer = TokenAndPositionEmbedding(sequence_len, vocab_size, embed_dim)
transformer_block1 = TransformerBlock(embed_dim, num_heads, ff_dim)
transformer_block2 = TransformerBlock(embed_dim, num_heads, ff_dim)

# Functional model: token ids -> embeddings -> two Transformer blocks ->
# average over the sequence axis -> small dense head with dropout.
inputs = layers.Input(shape=(sequence_len,))
x = embedding_layer(inputs)
x = transformer_block1(x)
x = transformer_block2(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(0.1)(x)
# Two-way softmax: one probability per class.
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [13]:
# Integer labels with a softmax output, hence the sparse categorical loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# train_ds is a tf.data.Dataset that was already batched when it was created,
# so batch_size must not be passed to fit(); the batch size used for training
# is the one the dataset was built with.
history = model.fit(train_ds, epochs=20, validation_data=val_ds)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Pre-trained BERT

In [3]:
# Download and unpack the IMDB archive into the working directory
# (same call as in the first section of the notebook).
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [4]:
# Locate the unpacked data and its training split.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
train_dir = os.path.join(dataset_dir, 'train')

# Remove the 'unsup' folder so only the labeled class folders remain.
# The existence check keeps the cell re-runnable and lets it succeed when the
# folder was already deleted earlier (a bare rmtree would raise then).
remove_dir = os.path.join(train_dir, 'unsup')
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [5]:
# One review per element: the tokenization step further down reads the raw
# text element by element.
batch_size = 1
seed = 12345

# Same seed and validation_split for both subsets so they are complementary.
# train_dir is reused instead of repeating the path literal.
train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size,
    validation_split=0.2,
    subset='training', seed=seed)
val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size,
    validation_split=0.2,
    subset='validation', seed=seed)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [6]:
# Show the label and the decoded raw text of the first review.
for text_batch, label_batch in train_ds.take(1):
    first_label = label_batch[0].numpy()
    first_review = text_batch[0].numpy().decode()
    print(first_label)
    print(first_review)

1
When we talk Hollywood Hotel we could be talking about one of three things, the actual hotel, the radio program, and this film which was partially inspired by the first two. Dick Powell was the host of the Hollywood Hotel program on CBS radio network in which Louella Parsons dished out the weekly scoop on the stars.<br /><br />Powell and Parsons debuted the Hollywood Hotel program in 1934 so by 1937 it had its fair share of the radio audience. Powell hosted, sang, and kibitzed with Louella and her movie star guests. With the power she had with her column, she was able to get the various stars to go on and plug their latest films for nothing.<br /><br />Then the American Federation of Radio Artists stepped in and demanded she pay wages accordingly and they won the case. That ended the Hollywood Hotel program in 1938. Of course both Powell and Louella went on to other radio venues. The whole story is covered in the Tony Thomas book, The Films Of Dick Powell.<br /><br />But before the p

In [7]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the release this notebook was executed with.
%pip install transformers==4.29.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m96.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [8]:
from transformers import BertTokenizer, TFBertForSequenceClassification

def convert_text_to_feature(review, tokenizer, max_length):
    """Encode one review with ``tokenizer.encode_plus``.

    Args:
        review: text to encode.
        tokenizer: object exposing ``encode_plus``.
        max_length: length every encoding is padded or truncated to.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for the given options.
    """
    encode_options = {
        "add_special_tokens": True,
        "max_length": max_length,
        "padding": "max_length",
        "truncation": True,
        "return_attention_mask": True,
    }
    return tokenizer.encode_plus(review, **encode_options)

def map_feature_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Pack the three encoder inputs into a feature dict next to the label.

    Returns:
        Tuple ``(features, label)`` where ``features`` has the keys
        ``input_ids``, ``token_type_ids`` and ``attention_mask``.
    """
    features = dict(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_masks,
    )
    return features, label

def encode_text(ds, tokenizer, max_length):
    """Tokenize every review of a batched text dataset.

    Every element of every batch is encoded. Reading only index 0 of each
    batch would silently drop data for any batch size larger than one.

    Args:
        ds: iterable of ``(reviews, labels)`` batches; ``reviews`` holds
            UTF-8 encoded byte strings and ``labels`` the matching labels.
        tokenizer: tokenizer handed to ``convert_text_to_feature``.
        max_length: length every encoding is padded or truncated to.

    Returns:
        Unbatched ``tf.data.Dataset`` yielding ``(features, label)`` pairs,
        where ``features`` is the dict built by ``map_feature_to_dict`` and
        ``label`` has shape ``(1,)``.
    """
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

    for reviews, labels in ds:
        # Walk over the whole batch, not just its first element.
        for review, label in zip(reviews.numpy(), labels.numpy()):
            bert_input = convert_text_to_feature(review.decode(), tokenizer, max_length)

            input_ids_list.append(bert_input['input_ids'])
            token_type_ids_list.append(bert_input['token_type_ids'])
            attention_mask_list.append(bert_input['attention_mask'])
            label_list.append([label])

    # Tuple order must match the parameter order of map_feature_to_dict.
    return tf.data.Dataset.from_tensor_slices(
                (input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_feature_to_dict)

In [14]:
# Settings for the BERT fine-tuning run.
max_length = 512       # every review is padded/truncated to this many tokens
learning_rate = 2e-5   # passed to the Adam optimizer
epochs = 5             # passed to model.fit

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

In [10]:
# Inspect the encoding of the sample review printed above. The helper is used
# so the demo shows exactly the options applied to the full dataset.
convert_text_to_feature(text_batch[0].numpy().decode(), tokenizer, max_length)

{'input_ids': [101, 2043, 2057, 2831, 5365, 3309, 2057, 2071, 2022, 3331, 2055, 2028, 1997, 2093, 2477, 1010, 1996, 5025, 3309, 1010, 1996, 2557, 2565, 1010, 1998, 2023, 2143, 2029, 2001, 6822, 4427, 2011, 1996, 2034, 2048, 1012, 5980, 8997, 2001, 1996, 3677, 1997, 1996, 5365, 3309, 2565, 2006, 6568, 2557, 2897, 1999, 2029, 10223, 8411, 13505, 9841, 2098, 2041, 1996, 4882, 23348, 2006, 1996, 3340, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 8997, 1998, 13505, 6006, 1996, 5365, 3309, 2565, 1999, 4579, 2061, 2011, 4347, 2009, 2018, 2049, 4189, 3745, 1997, 1996, 2557, 4378, 1012, 8997, 4354, 1010, 6369, 1010, 1998, 11382, 16313, 5422, 2007, 10223, 8411, 1998, 2014, 3185, 2732, 6368, 1012, 2007, 1996, 2373, 2016, 2018, 2007, 2014, 5930, 1010, 2016, 2001, 2583, 2000, 2131, 1996, 2536, 3340, 2000, 2175, 2006, 1998, 13354, 2037, 6745, 3152, 2005, 2498, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2059, 1996, 2137, 4657, 1997, 2557, 3324, 3706, 1999, 1998, 6303, 2016, 3477, 

In [11]:
# Batch size used for fine-tuning (overrides the value used to read the files).
batch_size = 8

# Tokenize both splits, then batch them; only the training split is shuffled
# (with a 32-element buffer).
# NOTE: train_ds / val_ds are rebound from raw-text to tokenized datasets, so
# this cell cannot be re-run without re-creating the raw datasets first.
train_ds = encode_text(train_ds, tokenizer, max_length).shuffle(32).batch(batch_size)
val_ds = encode_text(val_ds, tokenizer, max_length).batch(batch_size)

In [12]:
# Load the checkpoint that matches the tokenizer (model_name) rather than
# repeating the name literal; the two-class head is requested explicitly.
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.summary()

Downloading tf_model.h5:   0%|          | 0.00/536M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Compile for fine-tuning with integer labels.
# from_logits=True: presumably the classification head returns unnormalized
# scores rather than probabilities - confirm against the model documentation.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08), 
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]
)

# The datasets are already batched, so no batch_size is given here.
history = model.fit(train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5