# Assignment: use RNNs to do Text classification, Text generation, Text summarization and Machine translation

Author: Long M. Luu

## Part 1: text classification
The 20 Newsgroups dataset will be downloaded. The job is to create a model that can classify each post into one of its 20 categories.

## Download GloVe word vectors: LOAD THIS PART
Other options are:  
glove.6B.zip: from Wikipedia + Gigaword, 6B tokens, 400K vocab, uncased, 50d, 100d, 200d and 300d vectors, 822MB download  
glove.42B.300d.zip: from Common Crawl, 42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download  
glove.840B.300d.zip: from Common Crawl, 840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download  
glove.twitter.27B.zip: from Twitter, 2B tweets, 27B tokens, 1.2M vocab, uncased, 25d, 50d, 100d, & 200d vectors, 1.42 GB download  

In [None]:
%%capture
%%bash
# -c resumes a partial download and does nothing if the archive is complete;
# -n keeps files that were already extracted. Together they let this cell be
# re-run without fetching a duplicate archive or stopping at an overwrite prompt.
wget -c http://nlp.stanford.edu/data/glove.6B.zip
unzip -qn glove.6B.zip

This might take a while to download.

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import os

In [None]:
# Load the pre-trained 100-dimensional GloVe vectors into a {word: vector} dict.
# The path is absolute (presumably the Colab working directory). Joining it onto
# the home directory would be a no-op, because os.path.join drops every earlier
# component once a later one is absolute.
path_to_glove_file = "/content/glove.6B.100d.txt"

embeddings_index = {}
# Read as UTF-8 explicitly rather than relying on the platform default encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ...": split off the word, parse the rest.
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# Show the first (word, vector) pair stored in the embedding index
first_entry = list(embeddings_index.items())[0]
print(first_entry)

In [None]:
# Show another (word, vector) pair, taken from position 42 of the index
embedding_items = list(embeddings_index.items())
print(embedding_items[42])

In [None]:
# Download and unpack the 20 Newsgroups corpus:
# newsgroup posts sorted into 20 categories
news20_url = "http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz"
data_path = keras.utils.get_file("news20.tar.gz", news20_url, extract=True)

In [None]:
import os
import pathlib

# Locate the extracted dataset and take a first look at its layout.
# Nothing is created or moved here: the archive is already extracted, and the
# loading loop further down treats every sub-directory as one class.
download_dir = pathlib.Path(data_path).parent
data_dir = download_dir / "20_newsgroup"
dirnames = os.listdir(data_dir)
print("Number of directories:", len(dirnames))
print("Directory names:", dirnames)

# Peek into one category to see what the individual files look like
fnames = os.listdir(data_dir / "comp.graphics")
print("Number of files in comp.graphics:", len(fnames))
print("Some example filenames:", fnames[:5])

In [None]:
# Read an example post. read_text() closes the file once it has been read, and
# latin-1 matches the encoding used when the whole corpus is loaded below.
print((data_dir / "comp.graphics" / "38987").read_text(encoding="latin-1"))

## Preprocess data

In [None]:
# Load every post, strip its header, and record each post's class index.
# class_names[i] is the directory name of class i.
samples = []
labels = []
class_names = []
for class_index, dirname in enumerate(sorted(os.listdir(data_dir))):
    class_names.append(dirname)
    dirpath = data_dir / dirname
    # os.listdir returns entries in arbitrary order; sort so the sample order
    # is the same on every run and machine.
    fnames = sorted(os.listdir(dirpath))
    print("Processing %s, %d files found" % (dirname, len(fnames)))
    for fname in fnames:
        # Close each file as soon as it is read instead of leaving the handle
        # for the garbage collector.
        with open(dirpath / fname, encoding="latin-1") as f:
            content = f.read()
        # Drop the first 10 lines, which hold the post's header
        lines = content.split("\n")
        samples.append("\n".join(lines[10:]))
        labels.append(class_index)

print("Classes:", class_names)
print("Number of samples:", len(samples))

In [None]:
# Show one header-stripped sample together with its label index and class name
sample_idx = 42
print(samples[sample_idx], labels[sample_idx], class_names[labels[sample_idx]])

In [None]:
"""
TODO 1: Preprocess data
For each element in "samples", call "preprocess" function for that element
Append all results in a list called processed_samples
"""

# Preprocess data function
# Characters treated as "special": ASCII punctuation plus assorted typographic,
# currency and math symbols. The backslash is spelled "\\" so the literal
# contains no invalid escape sequence.
_SPECIAL_CHARS = (
    "/-'?!.,#$%'()*+-/:;<=>@[\\]^_`{|}~`"
    + '""“”’'
    + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
)
# Built once at import time: maps every special character to a single space.
_SPECIAL_CHARS_TABLE = str.maketrans(dict.fromkeys(_SPECIAL_CHARS, " "))


def preprocess(data):
    '''
    Preprocess data: every special character is replaced by a single space.
    Letter case is left untouched, and runs of spaces are not collapsed.
    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution

    Arguments:
        data: a string
    Returns:
        text: preprocessed version of "data", same length as the input
    '''
    # One translate() pass replaces all special characters at once, instead of
    # one full-string replace() per character.
    return data.translate(_SPECIAL_CHARS_TABLE)

### START CODE HERE
# Clean every raw sample; the result stays index-aligned with `samples` and `labels`
processed_samples = [preprocess(sample) for sample in samples]
### END CODE HERE

In [None]:
# Compare the first sample before and after preprocessing
separator = "\n------------------\n"
print(samples[0], end=separator)
print(processed_samples[0])

In [None]:
# Split the data into train and validation sets (80% / 20%, shuffled with a fixed seed)
from sklearn.model_selection import train_test_split

train_samples, val_samples, train_labels, val_labels = train_test_split(
    processed_samples,
    labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# Word-count statistics over all preprocessed samples:
# number of samples, total words, and mean words per sample
num_words = [len(text.split()) for text in processed_samples]
sample_count = len(processed_samples)
word_total = sum(num_words)
print('The total number of samples is', sample_count)
print('The total number of words in the files is', word_total)
print('The average number of words in the files is', word_total / len(num_words))

In [None]:
# Plot the histogram
import matplotlib.pyplot as plt
%matplotlib inline
# Distribution of per-sample word counts, drawn through the Axes interface
fig, ax = plt.subplots()
ax.hist(num_words, bins="auto")
ax.set_xlabel('Num of words in sentences')
ax.set_ylabel('Frequency')
plt.show()

Given the histogram plot, a max sentence length of a few hundred words is reasonable; the code below uses `max_sentence_length = 400`.

In [None]:
"""
TODO 2: Create vocabulary index with TextVectorization
https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/TextVectorization
Let: max_tokens = vocab_size, output_sequence_length = max_sentence_length

Then, create text_dataset by calling tf.data.Dataset.from_tensor_slices
https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensor_slices
Then call function batch() of "text_dataset", and pass in batch_size

"""

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vocab_size = 20000 # Only keep the top 20k words of the vocab
max_sentence_length = 400 # Output length: longer inputs are truncated, shorter ones are padded with 0
batch_size = 64

### START CODE HERE
# Text -> fixed-length sequence of integer token ids
vectorizer = TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_sentence_length,
)
# Batched dataset over the training texts only, so the vocabulary is learned
# without looking at the validation split
text_dataset = tf.data.Dataset.from_tensor_slices(train_samples).batch(batch_size)
### END CODE HERE

# Learn the vocabulary from the training texts
vectorizer.adapt(text_dataset)

In [None]:
# Peek at the first 10 entries of the learned vocabulary
vocabulary_preview = vectorizer.get_vocabulary()[:10]
vocabulary_preview

Expected output: `['', '[UNK]', 'the', 'to', 'of', 'a', 'and', 'i', 'in', 'is']`

In [None]:
# Vectorize one sample sentence and show its first 10 token ids
sample_sentence = "I am learning text vectorization"
sample_output = vectorizer(np.array([[sample_sentence]]))
sample_output.numpy()[0, :10]

Expected output: `array([   7,  115, 2888,  660,    1,    0,    0,    0,    0,    0])`

In [None]:
# Map each vocabulary word to the integer id the vectorizer emits for it.
# get_vocabulary() is ordered by token id (entry 0 is '' for padding, entry 1 is
# '[UNK]'), so ids must start at 0. Any offset would pair each word with the
# wrong embedding row and drop the last words of the vocabulary.
# The printed pairs therefore start at ('', 0), ('[UNK]', 1), ('the', 2), ...
voc = vectorizer.get_vocabulary()
print(len(voc))
word_index = {word: token_id for token_id, word in enumerate(voc)}
print(list(word_index.items())[:10])

Expected output: `20000
[('', 2), ('[UNK]', 3), ('the', 4), ('to', 5), ('of', 6), ('a', 7), ('and', 8), ('i', 9), ('in', 10), ('is', 11)]`

In [None]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare the embedding matrix: row i receives the pre-trained vector of the
# word whose index in word_index is i.
# Rows start as zeros, so words not found in the embedding index stay all-zeros.
# This includes the representation for "padding" and "OOV".
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
epochs = 50
# Checkpoint callback: tracks val_loss and only saves when it improves
model_cp = tf.keras.callbacks.ModelCheckpoint(
    "model_cp",
    monitor="val_loss",
    save_format="tf",
    save_best_only=True,
)
# Early stopping: give up after epochs // 10 epochs without a val_accuracy
# improvement and restore the best weights seen
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    patience=epochs // 10,
    restore_best_weights=True,
)

## Create a simple RNN model

In [None]:
"""
TODO 3: create a model
The layers of Sequential are as follows:

Input, has shape (1, ), and dtype is tf.string: https://www.tensorflow.org/api_docs/python/tf/keras/Input?hl=en
"vectorizer" variable (TextVectorization layer defined above)
Embedding layer: input_dim is "num_tokens", output_dim is "embedding_dim", embeddings_initializer is Constant(embedding_matrix), set trainable=False
SimpleRNN, 100 units, return_sequences is True: https://www.tensorflow.org/api_docs/python/tf/keras/layers/SimpleRNN?hl=en
SimpleRNN, 50 units
Dense, 128 units, activation tanh
Dense, 64 units, activation tanh
Dense, len(class_names), activation softmax
"""

from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding
from tensorflow.keras.initializers import Constant

# Sample model 1: normal RNN

def create_simple_rnn_model():
    """Build and compile the stacked SimpleRNN text classifier from TODO 3.

    The model takes raw strings, converts them to token ids with the shared
    ``vectorizer`` layer, looks the ids up in the frozen ``embedding_matrix``,
    and classifies the sequence with two SimpleRNN layers followed by dense
    layers.

    Returns:
        A compiled ``Sequential`` model whose softmax output has one unit per
        entry in ``class_names``.
    """
    model = Sequential([
        Input(shape=(1,), dtype=tf.string),
        vectorizer,
        Embedding(
            input_dim=num_tokens,
            output_dim=embedding_dim,
            embeddings_initializer=Constant(embedding_matrix),
            trainable=False,  # keep the pre-trained vectors fixed
        ),
        # return_sequences=True hands the whole sequence to the next RNN layer
        SimpleRNN(100, return_sequences=True),
        SimpleRNN(50),
        Dense(128, activation="tanh"),
        Dense(64, activation="tanh"),
        Dense(len(class_names), activation="softmax"),
    ])
    # Labels are integer class indices, hence the sparse categorical loss
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

# Build the simple RNN model and print its summary
simple_rnn_model = create_simple_rnn_model()
simple_rnn_model.summary()

In [None]:
# Train the simple RNN for 5 epochs, validating on the held-out split
history_1 = simple_rnn_model.fit(
    train_samples,
    train_labels,
    batch_size=128,
    epochs=5,
    validation_data=(val_samples, val_labels),
    callbacks=[model_cp, early_stop],
)

In [None]:
# Training vs. validation accuracy per epoch for the simple RNN
for metric_name in ('accuracy', 'val_accuracy'):
    plt.plot(history_1.history[metric_name])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['train', 'val'], loc='upper left')

## Create a bidirectional GRU model

In [None]:
"""
TODO 4: create a bidirectional model
The layers of Sequential are as follows:

Input, has shape (1, ), and dtype is tf.string: https://www.tensorflow.org/api_docs/python/tf/keras/Input?hl=en
"vectorizer" variable (TextVectorization layer defined above)
Embedding layer: input_dim is "num_tokens", output_dim is "embedding_dim", embeddings_initializer is Constant(embedding_matrix), set trainable=False
Bidirectional GRU, 128 units, return_sequences is True: https://www.tensorflow.org/api_docs/python/tf/keras/layers/Bidirectional
Bidirectional GRU, 64 units
Dense, 64 units, activation tanh
Dense, len(class_names), activation softmax
"""

from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Dense, GRU, Bidirectional

# Sample model 2: Bidirectional with GRU
def create_bidi_gru_model():
    """Build and compile the bidirectional GRU text classifier from TODO 4.

    The model takes raw strings, converts them to token ids with the shared
    ``vectorizer`` layer, looks the ids up in the frozen ``embedding_matrix``,
    and classifies the sequence with two bidirectional GRU layers followed by
    dense layers.

    Returns:
        A compiled ``Sequential`` model whose softmax output has one unit per
        entry in ``class_names``.
    """
    model = Sequential([
        Input(shape=(1,), dtype=tf.string),
        vectorizer,
        Embedding(
            input_dim=num_tokens,
            output_dim=embedding_dim,
            embeddings_initializer=Constant(embedding_matrix),
            trainable=False,  # keep the pre-trained vectors fixed
        ),
        # return_sequences=True hands the whole sequence to the next recurrent layer
        Bidirectional(GRU(128, return_sequences=True)),
        Bidirectional(GRU(64)),
        Dense(64, activation="tanh"),
        Dense(len(class_names), activation="softmax"),
    ])
    # Labels are integer class indices, hence the sparse categorical loss
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

# Build the bidirectional GRU model and print its summary
bidi_gru_model = create_bidi_gru_model()
bidi_gru_model.summary()

In [None]:
# Best-only checkpoint (by val_loss) for the bidirectional GRU run.
# NOTE(review): this uses the same "model_cp" path as the other checkpoint
# callbacks in this file, so the runs save to one location. Confirm that is intended.
model_cp2 = tf.keras.callbacks.ModelCheckpoint(
    "model_cp",
    monitor="val_loss",
    save_format="tf",
    save_best_only=True,
)

In [None]:
# Train the bidirectional GRU for 10 epochs, validating on the held-out split
history_2 = bidi_gru_model.fit(
    train_samples,
    train_labels,
    batch_size=128,
    epochs=10,
    validation_data=(val_samples, val_labels),
    callbacks=[model_cp2],
)

In [None]:
# Training vs. validation accuracy per epoch for the bidirectional GRU
for metric_name in ('accuracy', 'val_accuracy'):
    plt.plot(history_2.history[metric_name])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['train', 'val'], loc='upper left')

In [None]:
# Classify one sample sentence with the bidirectional GRU model and show the
# name of the highest-scoring class
query = np.array([["The PC performance is very bad. You should buy a laptop instead."]])
class_scores = bidi_gru_model.predict(query)
class_names[np.argmax(class_scores)]

In [None]:
# Create your custom model

def create_custom_model():
    """Build and compile a user-defined model (exercise placeholder).

    ``model`` is still ``None`` here, so the ``compile`` call below raises
    AttributeError until an actual Keras model is assigned to it.

    Returns:
        The compiled model.
    """
    # Placeholder: replace None with your own architecture
    model = None
    # Same loss / optimizer / metric setup as the other models in this file
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

# Build the custom model and print its summary
custom_model = create_custom_model()
custom_model.summary()

In [None]:
# Best-only checkpoint (by val_loss) for the custom model run.
# NOTE(review): this uses the same "model_cp" path as the other checkpoint
# callbacks in this file, so the runs save to one location. Confirm that is intended.
model_cp3 = tf.keras.callbacks.ModelCheckpoint(
    "model_cp",
    monitor="val_loss",
    save_format="tf",
    save_best_only=True,
)

In [None]:
# Train the custom model for 10 epochs with checkpointing and early stopping
history_3 = custom_model.fit(
    train_samples,
    train_labels,
    batch_size=128,
    epochs=10,
    validation_data=(val_samples, val_labels),
    callbacks=[model_cp3, early_stop],
)