<a href="https://colab.research.google.com/github/mpre5ley/BERT-IMDB-Transfer-Learning-Classification/blob/main/BERT%2BIMDB%2BTransfer_Learning%2BClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

In [None]:
# Fetch both IMDB review splits as (text, label) pairs, together with the
# dataset metadata object.
(imdb_train, imdb_test), ds_info = tfds.load(
    name="imdb_reviews",
    split=["train", "test"],
    with_info=True,
    as_supervised=True,
)

In [None]:
# Display the dataset metadata (description, features, split sizes).
ds_info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset. This is a dataset for binary sentiment
    classification containing substantially more data than previous benchmark
    datasets. We provide a set of 25,000 highly polar movie reviews for training,
    and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='/root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'text': Text(shape=(), dtype=string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
   

In [None]:
# Walk the training reviews once to collect every distinct token and to
# record the token count of the longest review.
tokenizer = tfds.deprecated.text.Tokenizer()
vocabulary_set = set()
MAX_TOKENS = 0
for example, label in imdb_train:
    some_tokens = tokenizer.tokenize(example.numpy())
    # Track the running maximum review length, in tokens.
    MAX_TOKENS = max(MAX_TOKENS, len(some_tokens))
    vocabulary_set.update(some_tokens)

In [None]:
# Number of distinct tokens collected from the training reviews.
len(vocabulary_set)

93929

In [None]:
# Build a token -> integer-id encoder over the collected vocabulary, reusing
# the same tokenizer so encoding splits text the way the vocabulary was built.
# lowercase=True asks the encoder to lower-case text.
imdb_encoder = tfds.deprecated.text.TokenTextEncoder(
    vocabulary_set,
    tokenizer=tokenizer,
    lowercase=True,
)
vocab_size = imdb_encoder.vocab_size
# Report the encoder's vocabulary size and the longest review length in tokens.
print(vocab_size, MAX_TOKENS)

93931 2525


In [None]:
from tensorflow.keras.preprocessing import sequence
def encode_pad_transform(sample):
    """Encode one review into a fixed-length vector of token ids.

    Args:
        sample: An object exposing ``.numpy()`` that yields the review text
            (in this file it is called through ``tf.py_function``).

    Returns:
        A 1-D ``np.int64`` array of token ids produced by ``imdb_encoder``,
        post-padded by ``pad_sequences`` with ``maxlen=150``.
    """
    token_ids = imdb_encoder.encode(sample.numpy())
    # pad_sequences works on a batch, so wrap the single sequence in a list
    # and take the only row back out afterwards.
    padded_batch = sequence.pad_sequences([token_ids], maxlen=150,
                                          padding='post')
    only_row = padded_batch[0]
    return np.array(only_row, dtype=np.int64)

In [None]:
def encode_tf_fn(sample, label):
    """Dataset-map wrapper around ``encode_pad_transform``.

    Runs the Python-side encoder through ``tf.py_function`` so it can be used
    in ``Dataset.map``, then declares static shapes on both outputs.

    Args:
        sample: Tensor holding the review text.
        label: Tensor holding the review's label.

    Returns:
        A ``(token_ids, label)`` pair; ``token_ids`` is an int64 tensor.
    """
    token_ids = tf.py_function(
        func=encode_pad_transform,
        inp=[sample],
        Tout=tf.int64,
    )
    # Declare the static shapes explicitly: a rank-1 id vector of unspecified
    # length and a scalar label.
    token_ids.set_shape([None])
    label.set_shape([])
    return token_ids, label


In [None]:
# Apply the encode-and-pad transform to both splits, letting tf.data choose
# the level of parallelism.
encoded_train, encoded_test = (
    split.map(encode_tf_fn,
              num_parallel_calls=tf.data.experimental.AUTOTUNE)
    for split in (imdb_train, imdb_test)
)

In [None]:
# Inspect the mapped dataset's element spec (int64 id vector, int64 scalar label).
encoded_train

<ParallelMapDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.int64, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [None]:
# Location of the 50-dimensional GloVe 6B vectors file.
# NOTE(review): the /content/drive prefix suggests a mounted Google Drive, and
# the recorded run of the loading cell raised FileNotFoundError — verify that
# the file exists at this path before running the next cell.
path="/content/drive/MyDrive/Chatbots/glove.6B/glove.6B.50d.txt"

In [None]:
# Load the pre-trained GloVe vectors into a word -> vector dictionary.
# Each non-empty line is parsed as a word followed by its vector components.
dict_w2v = {}
# Read the file as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open(path, "r", encoding="utf-8") as file:
    for line in file:
        tokens = line.split()
        if not tokens:
            # Blank line (e.g. a trailing newline): nothing to parse.
            continue
        word = tokens[0]
        vector = np.array(tokens[1:], dtype=np.float32)
        # Keep only vectors with the expected 50 components; report the rest.
        if vector.shape[0] == 50:
            dict_w2v[word] = vector
        else:
            print("There was an issue with " + word)
# let's check the vocabulary size
print("Dictionary Size: ", len(dict_w2v))

FileNotFoundError: ignored

In [None]:
# Spot-check the loaded vectors: shows the entry for "good", or None if absent.
dict_w2v.get("good")

In [None]:
# Width of each embedding row; matches the 50-component GloVe vectors.
embedding_dim = 50
# One row per encoder id, zero-initialised so ids without a pre-trained
# vector keep an all-zero embedding.
embedding_matrix = np.zeros(shape=(imdb_encoder.vocab_size, embedding_dim))

In [None]:
# Spot-check the encoder: first id produced when encoding the word "good".
imdb_encoder.encode("good")[0]

In [None]:
# Number of tokens held by the encoder.
len(imdb_encoder.tokens)

In [None]:
# Copy the GloVe vector of every encoder token into its row of the embedding
# matrix; tokens without a GloVe vector are counted and remembered.
unk_cnt = 0
unk_set = set()
for word in imdb_encoder.tokens:
    embedding_vector = dict_w2v.get(word)
    if embedding_vector is None:
        # No pre-trained vector: the row stays as initialised.
        unk_cnt += 1
        unk_set.add(word)
        continue
    # Row index is the id the encoder assigns to this token.
    tkn_id = imdb_encoder.encode(word)[0]
    embedding_matrix[tkn_id] = embedding_vector

In [None]:
# Show the tokens for which no GloVe vector was found.
print(unk_set)

During the data loading step, we saw that the total number of tokens was 93,931. Out of these, 14,553 words could not be found in the GloVe vocabulary, which is approximately 15% of the tokens. For these words, the embedding matrix will have zeros. This is the first step in transfer learning. Now that the setup is complete, we will use TensorFlow to put these pre-trained embeddings to work. Two different models will be tried – the first based on feature extraction and the second on fine-tuning.

In [None]:
# Size of the token vocabulary, as reported by the encoder
vocab_size = imdb_encoder.vocab_size # token-level vocabulary (not characters)
# Number of RNN units
rnn_units = 64
# Batch size used when batching the encoded datasets
BATCH_SIZE=100

In [None]:
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense

In [None]:
def build_model_bilstm(vocab_size, embedding_dim,rnn_units, batch_size, train_emb=False,
                       pretrained_weights=None):
    """Build a two-layer bidirectional LSTM binary classifier.

    The network is an embedding layer initialised from a pre-trained matrix,
    followed by two stacked bidirectional LSTMs and a single sigmoid unit.

    Args:
        vocab_size: Number of rows of the embedding layer.
        embedding_dim: Width of each embedding vector.
        rnn_units: Units in each LSTM direction.
        batch_size: Unused; retained so existing call sites keep working.
        train_emb: Whether the embedding weights are trainable. ``False``
            keeps the pre-trained vectors frozen.
        pretrained_weights: Matrix used to initialise the embedding layer.
            Defaults to the notebook-level ``embedding_matrix`` so callers
            that do not pass it behave exactly as before.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    if pretrained_weights is None:
        # Backward-compatible default: fall back to the module-level matrix
        # this function originally read implicitly.
        pretrained_weights = embedding_matrix
    model = tf.keras.Sequential([
        # mask_zero=True treats id 0 as padding to be masked downstream.
        Embedding(vocab_size, embedding_dim, mask_zero=True,
                  weights=[pretrained_weights], trainable=train_emb),
        # First BiLSTM returns the full sequence so a second one can be stacked.
        Bidirectional(LSTM(rnn_units, return_sequences=True,dropout=0.5)),
        Bidirectional(LSTM(rnn_units,dropout=0.25)),
        Dense(1, activation="sigmoid")
    ])
    return model

In [None]:
# Build the BiLSTM classifier on top of the GloVe-initialised embedding layer.
# NOTE(review): train_emb=True makes the embedding weights trainable, which is
# the fine-tuning setup; the name `model_fe` and the text above suggest feature
# extraction (frozen embeddings, i.e. train_emb=False) — confirm which is
# intended.
model_fe = build_model_bilstm(
  vocab_size = vocab_size,
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE,
  train_emb=True)

In [None]:
# Print the model's layer-by-layer summary.
model_fe.summary()

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss for the
# single sigmoid output, and accuracy / precision / recall as metrics.
model_fe.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', 'Precision', 'Recall'],
)

In [None]:
# Group the encoded training examples into batches of BATCH_SIZE and
# prefetch with a buffer of 100.
encoded_train_batched = (
    encoded_train
    .batch(BATCH_SIZE)
    .prefetch(100)
)

In [None]:
# Train for 10 epochs on the batched training set.
model_fe.fit(encoded_train_batched, epochs=10)

In [None]:
# Evaluate on the encoded test split, batched with the same batch size as training.
model_fe.evaluate(encoded_test.batch(BATCH_SIZE))