## Part 3: Sentiment Analysis
The IMDB review dataset will be downloaded. We will classify whether each review is positive or negative.

## Download GloVe word vectors: LOAD THIS PART 
Other options are:  
glove.6B.zip: from Wikipedia + Gigaword, 6B tokens, 400K vocab, uncased, 50d, 100d, 200d and 300d vectors, 822MB download  
glove.42B.300d.zip: from Common Crawl, 42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download  
glove.840B.300d.zip: from Common Crawl, 840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download  
glove.twitter.27B.zip: from Twitter, 2B tweets, 27B tokens, 1.2M vocab, uncased, 25d, 50d, 100d, & 200d vectors, 1.42 GB download  

In [None]:
%%capture
%%bash
# Download and unpack the GloVe 6B word vectors into the current directory.
# -nc (no-clobber) skips the download when glove.6B.zip is already present,
# so re-running the cell does not fetch a second copy of the archive.
wget -nc http://nlp.stanford.edu/data/glove.6B.zip
# -n never overwrites files that were already extracted, so a re-run does not
# stop on unzip's "replace?" prompt; -q keeps the output quiet.
unzip -n -q glove.6B.zip

This might take a while to download.

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import os

In [None]:
# Location of the 100-dimensional GloVe vectors.
# The previous os.path.join(os.path.expanduser("~"), "/content/...") always
# evaluated to this absolute path on POSIX, because an absolute second
# component makes os.path.join discard the first one; spell it out directly.
path_to_glove_file = "/content/glove.6B.100d.txt"

# Maps each word to its embedding vector (a float32 NumPy array).
embeddings_index = {}
# Read explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ...": split off the word, parse the rest.
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# Show one (word, vector) entry without copying every item into a list first.
print(next(iter(embeddings_index.items())))

In [None]:
import tensorflow_datasets as tfds

# Load the IMDB reviews as (text, label) pairs, plus the dataset metadata.
datasets, info = tfds.load(
    "imdb_reviews",
    with_info=True,
    as_supervised=True,
)

In [None]:
# Number of examples in each split, as reported by the dataset metadata.
train_size, test_size = (
    info.splits[split_name].num_examples for split_name in ("train", "test")
)

In [None]:
# Display the train/test example counts as the cell output.
train_size, test_size

## Preprocess data

In [None]:
# Materialise the training split as plain Python lists:
# UTF-8 decoded review strings and their labels, read 64 examples at a time.
train_samples = []
train_labels = []
for x_batch, y_batch in datasets["train"].batch(64):
    train_samples.extend(raw.decode("utf-8") for raw in x_batch.numpy())
    train_labels.extend(y_batch.numpy())

In [None]:
# Materialise the test split as plain Python lists:
# UTF-8 decoded review strings and their labels, read 64 examples at a time.
test_samples = []
test_labels = []
for x_batch, y_batch in datasets["test"].batch(64):
    test_samples.extend(raw.decode("utf-8") for raw in x_batch.numpy())
    test_labels.extend(y_batch.numpy())

In [None]:
# Display how many reviews were collected for each split.
len(train_samples), len(test_samples)

In [None]:
# Every character listed here is replaced by a single space in preprocess().
_PUNCT = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
# Translation table built once, so each call is a single pass over the text
# instead of one str.replace() per punctuation character.
_PUNCT_TABLE = str.maketrans(dict.fromkeys(_PUNCT, ' '))


def preprocess(data):
    '''
    Replace punctuation and special symbols in a text with spaces.

    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution

    Args:
        data: the text to clean (a str).

    Returns:
        A new string of the same length in which every character listed in
        _PUNCT has been replaced by a space; all other characters, including
        existing whitespace, are left unchanged.
    '''
    return data.translate(_PUNCT_TABLE)

In [None]:
# Clean every review in both splits, then show the first cleaned training review.
processed_train_samples = list(map(preprocess, train_samples))
processed_test_samples = list(map(preprocess, test_samples))
processed_train_samples[0]

In [None]:
# Split the cleaned training data into training and validation sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for validation; the fixed seed keeps the split
# reproducible between runs.
train_samples, val_samples, train_labels, val_labels = train_test_split(
    processed_train_samples,
    train_labels,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# Word-count statistics over the cleaned training reviews
# (words are whitespace-separated tokens).
num_words = [len(text.split()) for text in processed_train_samples]
word_total = sum(num_words)
print('The total number of samples is', len(processed_train_samples))
print('The total number of words in the files is', word_total)
print('The average number of words in the files is', word_total/len(num_words))

In [None]:
import matplotlib.pyplot as plt
# Notebook magic: render figures inline in the cell output.
%matplotlib inline
# Histogram of the per-review word counts computed in num_words;
# bins="auto" leaves the choice of bin edges to the library.
plt.hist(num_words, bins="auto")
plt.xlabel('Num of words in sentences')
plt.ylabel('Frequency')
plt.show()

In [None]:
"""
TODO 1: Create Vocabulary index with TextVectorization
"""

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vocab_size = 20000  # upper bound on the vocabulary size (max_tokens)
max_sentence_length = 700  # length of every vectorized review (output_sequence_length)
batch_size = 64  # batch size used while adapting the vectorizer
vectorizer = TextVectorization(max_tokens=vocab_size, output_sequence_length=max_sentence_length)
# Learn the vocabulary from the training split only. processed_train_samples
# still contains the reviews held out for validation, so adapting on it would
# leak validation text into the vocabulary.
text_dataset = tf.data.Dataset.from_tensor_slices(train_samples).batch(batch_size)
vectorizer.adapt(text_dataset)

In [None]:
# Display the first ten entries of the learned vocabulary.
vectorizer.get_vocabulary()[:10]

In [None]:
voc = vectorizer.get_vocabulary()
print(len(voc))
# Map each vocabulary entry to the integer id the vectorizer emits for it.
# get_vocabulary() is ordered by token id (the padding and OOV entries come
# first), so a token's position in the list IS its id. The previous
# range(2, len(voc)) shifted every id by two and dropped the last two words,
# which misaligned the embedding matrix rows with the vectorizer output.
word_index = {word: index for index, word in enumerate(voc)}
print(list(word_index.items())[:10])

In [None]:
# Sanity check: vectorize one sentence and show its first ten token ids.
sample_output = vectorizer(np.array([["I am about to generate fake Shakespearian text!"]]))
sample_output.numpy()[0, :10]

In [None]:
# Convert into Numpy array
train_samples = np.asarray(train_samples)
train_labels = np.asarray(train_labels)
val_samples = np.asarray(val_samples)
val_labels = np.asarray(val_labels)
# Use the cleaned test reviews so the test set goes through the same
# preprocess() step as the training and validation sets; the raw
# test_samples list was previously used here, leaving
# processed_test_samples unused.
test_samples = np.asarray(processed_test_samples)
test_labels = np.asarray(test_labels)

In [None]:
embedding_dim = 100
num_tokens = len(voc) + 2

# Prepare embedding matrix
# Every row starts as zeros, so any row that is never assigned below
# (a word with no pre-trained vector) stays all-zeros.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
hits = 0
misses = 0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: leave its row at zero.
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras import Input

# Model input: one raw string per example.
input_layer = Input(shape=(1,), dtype=tf.string)

# Embedding layer initialised from the pre-trained matrix and frozen, so its
# weights are not updated during training.
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [None]:
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import GRU, Bidirectional, Dense, Embedding
from tensorflow.keras.initializers import Constant

# Sample model 1: normal RNN
def create_model():
    """Build and compile the sentiment classifier.

    The model takes raw strings (via the module-level ``input_layer``), turns
    them into token ids with ``vectorizer``, looks up frozen pre-trained
    embeddings, runs two stacked bidirectional GRU layers and two tanh dense
    layers, and ends in a single sigmoid unit.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    model = Sequential()
    model.add(input_layer)
    model.add(vectorizer)
    model.add(
        Embedding(
            num_tokens,
            embedding_dim,
            embeddings_initializer=Constant(embedding_matrix),
            trainable=False,
        )
    )
    # The first GRU returns the full sequence so the second GRU can consume it.
    model.add(Bidirectional(GRU(256, return_sequences=True)))
    model.add(Bidirectional(GRU(128)))
    model.add(Dense(128, activation="tanh"))
    model.add(Dense(64, activation="tanh"))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

# Build the classifier and print its layer-by-layer summary.
model = create_model()
model.summary()

In [None]:
# Train for 30 epochs, evaluating on the validation split after each epoch;
# the returned History object is kept for plotting.
history = model.fit(train_samples, train_labels, epochs=30, validation_data=(val_samples, val_labels))

In [None]:
# Plot accuracy vs epoch
# One curve per split; the legend entries come from the label of each curve.
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='val')
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='upper left')