### Importing required packages

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os, pathlib, shutil, random

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization, Embedding, Dense
from tensorflow.keras.utils import text_dataset_from_directory

from sklearn.decomposition import PCA

## TextVectorization

### Dummy dataset and a test sentence


In [None]:
# Sentence used to probe the layer; "still" does not occur in the corpus,
# so it is expected to map to the out-of-vocabulary token.
test_sentence = "I write, rewrite, and still rewrite again"

# Toy three-line corpus from which the demonstration vocabulary is built
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]

### TextVectorization layer and adapt to dummy dataset

Demonstrate the working of a TextVectorization layer.

In [None]:
# Build a TextVectorization layer that emits integer token ids.
# Custom callables can also be supplied for standardizing and splitting the
# text (see the book by Chollet), e.g.:
#     standardize=custom_standardization_fn,
#     split=custom_split_fn,
text_vectorization = TextVectorization(
    max_tokens=15,               # upper bound on the vocabulary size
    output_sequence_length=10,   # every output sequence has 10 ids (zero-padded if shorter)
    output_mode="int",           # "int" is the default; other output modes exist
)

# Learn the vocabulary of string terms from the tokens in the toy dataset
text_vectorization.adapt(dataset)


In [None]:
# Inspect the vocabulary learned by adapt(): a list of tokens whose list
# position is the integer id the layer emits for that token.
# The list can be shorter than max_tokens when the corpus has few distinct words.
vocabulary = text_vectorization.get_vocabulary()
print(f"vocabulary = {vocabulary}")
print(f"len(vocabulary) = {len(vocabulary)}")

vocabulary = ['', '[UNK]', 'erase', 'write', 'then', 'rewrite', 'poppy', 'i', 'blooms', 'and', 'again', 'a']
len(vocabulary) = 12


In [None]:
# See how the TextVectorization layer transforms/vectorizes the raw text:
# the result is a sequence of output_sequence_length integer token ids.
encoded_sentence = text_vectorization(test_sentence)
print(f"encoded sentence = {encoded_sentence}")
print(f"len(encoded sentence) = {len(encoded_sentence)}")

encoded sentence = [ 7  3  5  9  1  5 10  0  0  0]
len(encoded sentence) = 10


In [None]:
# Decode the token ids back to text for comparison with test_sentence.
# The lookup table maps each integer id to its vocabulary token.
inverse_vocab = {token_id: token for token_id, token in enumerate(vocabulary)}
print(f"inverse_vocab = {inverse_vocab}")

# Padding ids decode to the empty string, hence the trailing spaces in the result
decoded_tokens = [inverse_vocab[int(token_id)] for token_id in encoded_sentence]
decoded_sentence = " ".join(decoded_tokens)
print(f"decoded sentence = {decoded_sentence}")

inverse_vocab = {0: '', 1: '[UNK]', 2: 'erase', 3: 'write', 4: 'then', 5: 'rewrite', 6: 'poppy', 7: 'i', 8: 'blooms', 9: 'and', 10: 'again', 11: 'a'}
decoded sentence = i write rewrite and [UNK] rewrite again   


In [None]:
# Print the original sentence so it can be compared with the decoded one
print(f"test_sentence = {test_sentence}")

test_sentence = I write, rewrite, and still rewrite again


## Processing the dataset using TextVectorization layer of keras

In [None]:
# List the top-level subdirectories of the dataset folder (aclImdb)
!cd aclImdb && ls -d */

test/  train/


In [None]:
# Remove the unnecessary train/unsup folder so that (presumably) only the
# labeled neg/pos class folders remain under train/.
# Guarded so that re-running the notebook after the folder is already gone
# does not raise an error.
unsup_dir = pathlib.Path("aclImdb") / "train" / "unsup"
if unsup_dir.exists():
    shutil.rmtree(unsup_dir)

In [None]:
# Visualise a sample: print one raw review from the positive class
# (the raw text can contain HTML line breaks such as <br />)
!cat aclImdb/train/pos/4077_10.txt

I first saw this back in the early 90s on UK TV, i did like it then but i missed the chance to tape it, many years passed but the film always stuck with me and i lost hope of seeing it TV again, the main thing that stuck with me was the end, the hole castle part really touched me, its easy to watch, has a great story, great music, the list goes on and on, its OK me saying how good it is but everyone will take there own best bits away with them once they have seen it, yes the animation is top notch and beautiful to watch, it does show its age in a very few parts but that has now become part of it beauty, i am so glad it has came out on DVD as it is one of my top 10 films of all time. Buy it or rent it just see it, best viewing is at night alone with drink and food in reach so you don't have to stop the film.<br /><br />Enjoy

### Create a validation directory and move 20% of the train data to it

In [None]:
# Move 20% of the training data of each class to a validation folder.
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
for category in ("neg", "pos"):
    # Idempotence guard: if this class was already split (e.g. the notebook is
    # re-run), skip it instead of failing on makedirs or moving a further 20%.
    if (val_dir / category).exists():
        continue
    # os.listdir returns entries in arbitrary order; sort for a reproducible split
    files = sorted(os.listdir(train_dir / category))
    # random.Random(1337).shuffle(files) # We should shuffle. Only commenting for demonstration
    num_val_samples = int(0.2 * len(files))
    # Slice from an explicit start index: files[-0:] would select *every* file
    # when num_val_samples is 0.
    val_files = files[len(files) - num_val_samples:]
    os.makedirs(val_dir / category)
    for fname in val_files:
        shutil.move(train_dir / category / fname,
                    val_dir / category / fname)

### Create batches of data using `text_dataset_from_directory`

In [None]:
# Build batched (text, label) datasets, one per split directory
batch_size = 32

train_ds = text_dataset_from_directory("aclImdb/train", batch_size=batch_size)
val_ds = text_dataset_from_directory("aclImdb/val", batch_size=batch_size)
test_ds = text_dataset_from_directory("aclImdb/test", batch_size=batch_size)

# Text-only view of the training set (labels dropped); used later to adapt
# the TextVectorization layer.
text_only_train_ds = train_ds.map(lambda text, label: text)



Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


The train, validation, and test directories contain 20,000, 5,000, and 25,000 reviews respectively, each belonging to one of two classes: positive or negative.

In [None]:
# Check shapes and dtypes on a single batch of the training set
for inputs, targets in train_ds.take(1):
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)

    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)

    # Peek at one (review, label) pair from the batch
    print("inputs[2]:", inputs[2])
    print("targets[2]:", targets[2])

inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[2]: tf.Tensor(b'*SPOILERS INCLUDED*<br /><br />With a title like "Bleed", you know the creative juices weren\'t running on high when this puppy was conceived. The movie is your basic run-of-the-mill low-budget slasher movie. Oh sure, it tries to be creative with the premise of the "murder club", but we learn that was just a joke anyways. Okay, for those who really care about these things, the basic plot is that new girl in town starts dating her co-worker. He invites her into his circle of friends, and at a party, they tell her how they have a "murder club" and they murder people, blah blah blah. Well, we learn that it was all a joke, but not before our heroine kills a lady in a parking garage. Now, the "members" of the Murder Club are being killed one by one. Oh, and the bad guys wins and the movie ends on a downer. By that time, you won\'t really care though.<br /><br />In 

### Create TextVectorization layer and adapt to dataset

In [None]:
# Vectorizing the data
max_tokens = 20000    # vocabulary size cap
max_length = 600      # number of token ids produced per review

text_vectorization = TextVectorization(
    output_mode="int",
    max_tokens=max_tokens,
    output_sequence_length=max_length,
)

# Learn the vocabulary from the training text only
text_vectorization.adapt(text_only_train_ds)


def _vectorize_batch(text, label):
    """Map a (text, label) batch to (token ids, labels reshaped to a column)."""
    return text_vectorization(text), tf.reshape(label, (-1, 1))


# Apply the TextVectorization layer to the train, val and test sets
int_train_ds = train_ds.map(_vectorize_batch, num_parallel_calls=4)
int_val_ds = val_ds.map(_vectorize_batch, num_parallel_calls=4)
int_test_ds = test_ds.map(_vectorize_batch, num_parallel_calls=4)



### Visualize and compare the raw and processed data

In [None]:
# Show the first element of one batch: raw text first, then token ids.
# NOTE(review): each loop starts a fresh pass over its dataset, so the two
# printed samples are not necessarily the same review -- confirm whether the
# datasets are shuffled if a like-for-like comparison is wanted.
for text, label in train_ds.take(1):
    print(text[0])
    print(label[0])

for int_of_text, label in int_train_ds.take(1):
    print(int_of_text[0])
    print(label[0])


Vector representation of the word 'movie'

In [None]:
# Vectorize a single string; the result has output_sequence_length token ids
text_vectorization("movie")

Vector representation of "great movie" and "a fine story"

In [None]:
# Vectorize a batch of strings: one row of token ids per input string
text_vectorization(["great movie", "a fine story"])

### NN architecture with a TextVectorization layer, an Embedding layer, and Dense layers

In [None]:
max_tokens = 20000  # vocabulary size; must match the TextVectorization layer's max_tokens

# The model consumes raw review strings: one string per example
inputs = keras.Input(shape=(1,), dtype=tf.string)

# The TextVectorization layer: strings -> integer token ids.
# Note that this layer is already adapted on the train dataset.
txt_vec_out = text_vectorization(inputs)

# The Embedding layer: input_dim is the vocabulary size (largest token id + 1);
# each token id is mapped to a trainable 256-dimensional vector.
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256, name='embedding')(txt_vec_out)

# Average the token vectors into one vector per review. Without this step the
# Dense layers act on the last axis of the 3-D (batch, tokens, 256) tensor and
# the model emits one prediction per *token* instead of one per review.
pooled = layers.GlobalAveragePooling1D()(embedded)

# Dense classifier head with dropout for regularization
x = layers.Dense(64, activation="relu")(pooled)
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(16, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# Single sigmoid unit: probability of the positive class, shape (batch, 1)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)

model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

model.summary()

In [None]:
# Get the embedding layer by the name it was given when the model was built
embedding_layer = model.get_layer('embedding')

# Get the embeddings: the first weight array of the layer.
# This snapshot is taken before model.fit is called, i.e. the values are untrained.
embeddings = embedding_layer.get_weights()[0]
embeddings.shape

In [None]:
# Get the vocabulary from the TextVectorization layer
# (a list of tokens; a token's position in the list is its integer id)
vocab = text_vectorization.get_vocabulary()
len(vocab)

In [None]:
# Sample words to visualize word embeddings for
test_words = ['good', 'bad', 'nice', 'poor', 'terrible', 'terrific', 'awesome', 'awful', 'best', 'worst']

# Print each word next to its position in the vocabulary list.
# NOTE(review): vocab.index raises ValueError for a word that is not in the vocabulary.
print(f"{'Word':<15} {'Index'}")
print("="*30)
for word in test_words:
    print(f"{word:<15} {vocab.index(word)}")

In [None]:
# Embedding dimension: shape of the row of the embedding matrix for the word 'good'
embeddings[vocab.index('good')].shape

In [None]:
# Create a 2-dimensional PCA model of the word vectors using the scikit-learn PCA class
# (PCA is already imported in the imports cell at the top of the notebook).
# n_components in PCA specifies the no. of dimensions
pca = PCA(n_components=2, random_state=42)

# Fit and transform the (not yet trained) embedding vectors using the PCA model
reduced_untrained_emb = pca.fit_transform(embeddings)
reduced_untrained_emb.shape

In [None]:
# Reduced embedding for word 'good': its 2-D coordinates after PCA
reduced_untrained_emb[vocab.index('good')]

In [None]:
# Visualize the 2-D projections of the sample words' embeddings
fig, ax = plt.subplots(figsize=(8, 6))
for word in test_words:
    if word == '':  # Skip the empty string token
        continue
    pc1, pc2 = reduced_untrained_emb[vocab.index(word)]
    ax.scatter(pc1, pc2)
    # Label each point with its word, nudged slightly off the marker
    ax.annotate(word, (pc1, pc2), xytext=(5, 2), textcoords='offset points')

ax.set_title("Word Embeddings Visualization (Before training)")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")
fig.tight_layout()
plt.show()

In [None]:
# Fit the model on the train set


def _labels_as_column(text, label):
    """Pass the text through and reshape labels from (None,) to (None, 1)."""
    return text, tf.reshape(label, (-1, 1))


train_dataset = train_ds.map(_labels_as_column)
val_dataset = val_ds.map(_labels_as_column)

# Checkpoint callback; save_best_only=True keeps only the best model on disk
callbacks = [
    keras.callbacks.ModelCheckpoint(filepath="one_hot_dense.keras", save_best_only=True),
]

model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=20,
    callbacks=callbacks,
)

In [None]:
# The in-memory model holds the weights from the last epoch. To evaluate the
# checkpointed model instead, load the saved model first:
# model = keras.models.load_model("one_hot_dense.keras")

# Check model performance on the test set (labels reshaped to a column, as in training)
test_dataset = test_ds.map(
    lambda text, label: (text, tf.reshape(label, (-1, 1)))
)
print(f"Test acc: {model.evaluate(test_dataset)[1]:.3f}")

In [None]:
# Get the embedding layer again, now that model.fit has been run
trained_embedding_layer = model.get_layer('embedding')

# Get the embeddings: the first weight array of the layer, after training
trained_embeddings = trained_embedding_layer.get_weights()[0]
trained_embeddings.shape

In [None]:
# Create a 2-dimensional PCA model of the word vectors using the scikit-learn PCA class
# n_components in PCA specifies the no. of dimensions
# Note: this is a new PCA fitted on the trained embeddings, so its two axes are
# not the same directions as in the before-training projection.
pca = PCA(n_components=2, random_state=42)

# Fit and transform the vectors using PCA model
reduced_trained_emb = pca.fit_transform(trained_embeddings)
reduced_trained_emb.shape

In [None]:
# Visualize the 2-D projections of the sample words' embeddings after training
fig, ax = plt.subplots(figsize=(10, 8))
for word in test_words:
    if word == '':  # Skip the empty string token
        continue
    pc1, pc2 = reduced_trained_emb[vocab.index(word)]
    ax.scatter(pc1, pc2)
    # Label each point with its word, nudged slightly off the marker
    ax.annotate(word, (pc1, pc2), xytext=(5, 2), textcoords='offset points')

ax.set_title("Word Embeddings Visualization (After training)")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")
fig.tight_layout()
plt.show()