# **File structure**

The first part of this file covers only the training of the model. The second part implements text-generation methods that are used to test the trained model.

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, SimpleRNN, TimeDistributed, Flatten
from tensorflow.keras.initializers import RandomUniform
from keras.initializers import RandomUniform
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
import pandas as pd
import sys
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import SGD

In [None]:
# Load the training split of the Amazon "Digital Software" reviews
# (downloaded on first use). batch_size=-1 requests the split as one batch.
data = tfds.load(
    'amazon_us_reviews/Digital_Software_v1_00',
    split='train',
    shuffle_files=True,
    download=True,
    batch_size=-1,
)

In [None]:
# preprocessing step 1: extract the review bodies as a NumPy array and
# replace every entry, in place, by its UTF-8 decoded string
reviews = data["data"]["review_body"].numpy()

for position, raw_review in enumerate(reviews):
    reviews[position] = raw_review.decode('utf-8')

In [None]:
# preprocessing: Tokenizer
# issue: https://github.com/keras-team/keras/issues/8092
num_words = 10000  # Max size of the dictionary
tokenizer = Tokenizer(num_words=num_words + 1, oov_token='UNK')
tokenizer.fit_on_texts(reviews)

# Keep only the entries whose index is at most num_words (the tokenizer is
# 1 indexed), then give the out-of-vocabulary token the index right after.
trimmed_index = {}
for token, rank in tokenizer.word_index.items():
    if rank <= num_words:
        trimmed_index[token] = rank
trimmed_index[tokenizer.oov_token] = num_words + 1
tokenizer.word_index = trimmed_index

sequences = tokenizer.texts_to_sequences(reviews)

# Concatenate the per-review token lists into one long token stream.
text = []
for review_tokens in sequences:
    text.extend(review_tokens)

vocab_size = len(tokenizer.word_index)

In [None]:
# make training sequences: one [current_token, next_token] pair for every
# position in the token stream. The last valid start index is len(text) - 2,
# hence range(len(text) - 1); range(len(text) - 2) would drop the final pair.
seq = [text[i:i + 2] for i in range(len(text) - 1)]

# token id -> word, used to turn generated ids back into text
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

# Inputs are one-element lists ([current_token]); targets are bare token ids.
trainX = [pair[:1] for pair in seq]
trainy = [pair[-1] for pair in seq]

In [None]:
# Custom Generator function
def format_data_as_generator(dataX, dataY, i, batch_size, size, num_classes=None):
    """Build the one-hot encoded input and target arrays for batch number ``i``.

    The batch consists of the ``batch_size`` consecutive samples starting at
    position ``i * batch_size``. Positions are taken modulo ``size``, so a
    batch that runs past the end of the data continues from the start. This
    also works when ``batch_size`` exceeds ``size`` and when the data are
    NumPy arrays rather than lists.

    Args:
        dataX: Indexable collection of samples; element 0 of each sample is
            the input token id.
        dataY: Indexable collection of target token ids, aligned with
            ``dataX``.
        i: Zero-based batch number.
        batch_size: Number of samples per batch.
        size: Number of leading samples of ``dataX``/``dataY`` to cycle
            through.
        num_classes: Width of the one-hot vectors. Defaults to the
            module-level ``vocab_size + 1``.

    Returns:
        A tuple ``(one_hot_X, one_hot_y)`` of float arrays, each of shape
        ``(1, batch_size, num_classes)``. Token id ``t`` sets column
        ``t - 1`` to 1.

    Raises:
        ValueError: If ``batch_size`` or ``size`` is not positive.
    """
    if batch_size <= 0 or size <= 0:
        raise ValueError("batch_size and size must both be positive")
    if num_classes is None:
        num_classes = vocab_size + 1

    # Sample positions of this batch, wrapping around the end of the data.
    positions = (i * batch_size + np.arange(batch_size)) % size
    x_tokens = np.array([dataX[p][0] for p in positions], dtype=np.intp)
    y_tokens = np.array([dataY[p] for p in positions], dtype=np.intp)

    # Fill both one-hot arrays with a single vectorised assignment each.
    rows = np.arange(batch_size)
    one_hot_X = np.zeros(shape=(1, batch_size, num_classes))
    one_hot_y = np.zeros(shape=(1, batch_size, num_classes))
    one_hot_X[0, rows, x_tokens - 1] = 1
    one_hot_y[0, rows, y_tokens - 1] = 1
    return one_hot_X, one_hot_y

def batch_generator(dataX, dataY, batch_size):
    """Yield one-hot encoded (input, target) batches forever.

    Batches are numbered 0, 1, 2, ... and each one is produced by
    ``format_data_as_generator`` over the full length of ``dataX``.
    """
    total_samples = len(dataX)
    batch_number = 0

    while True:
        yield format_data_as_generator(
            dataX, dataY, batch_number, batch_size, total_samples)
        batch_number += 1

# **Note: run the cell below only if you want to train the model**

If you only want to test the model by generating new texts, skip this block.

To start training from a previously saved set of weights, uncomment the `model.load_weights(...)` line in the middle of this block.

In [None]:
batch_size = 4000
epochs = 300
# Only whole batches count towards an epoch.
step_size_per_epoch = len(trainX) // batch_size
generator_for_training = batch_generator(trainX, trainy, batch_size)

# Every generated batch has shape (1, batch_size, vocab_size + 1), so the
# network receives it as one sequence of batch_size one-hot encoded words and
# predicts a distribution over the next word at every timestep.
model = Sequential([
    SimpleRNN(100, input_shape=(None, vocab_size + 1),
              return_sequences=True,
              activation='tanh',
              name='hidden'),
    TimeDistributed(Dense(vocab_size + 1, activation='softmax', name='output'))
])

# `learning_rate` replaces the deprecated `lr` argument of SGD.
model.compile(loss='mean_squared_error',
              optimizer=SGD(learning_rate=0.2, momentum=0.85),
              metrics=['categorical_accuracy', 'mean_squared_error'])

# uncomment line below to load earlier saved model weights
# model.load_weights("./model_rnn_weights.065.hdf5")

# save_best_only=False: a weights file is written after every epoch.
filepath = "./model_rnn_weights_new.{epoch:03d}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=False, mode='min')
callbacks_list = [checkpoint]

# Model.fit accepts Python generators directly; Model.fit_generator is
# deprecated in its favour.
history = model.fit(generator_for_training,
                    epochs=epochs,
                    steps_per_epoch=step_size_per_epoch,
                    callbacks=callbacks_list,
                    max_queue_size=10,
                    verbose=1)

# **Text Generation**

The remaining part of this file will take care of the testing of the model by generating new text sequences. If you only want to train the model you can stop here.

Note that all of the cells above, except the training cell, must have been run for the code below to work.

In [None]:
# generation of new text based on argmax method
def genArgMaxRNN(model, seq, max_len=20):
    """Continue a prompt by always appending the model's most probable word.

    Every prediction is made from the one-hot encoding of the single
    preceding word, passed to ``model.predict`` as an array of shape
    ``(1, 1, vocab_size + 1)``. Generation starts from the LAST word of the
    prompt.

    Args:
        model: Model whose ``predict`` maps that array to per-class scores,
            read from ``[0][0]`` of the result.
        seq: Prompt text; it is tokenized with the module-level ``tokenizer``.
        max_len: Number of words generated after the first generated word,
            so ``max_len + 1`` words are appended in total.

    Returns:
        The prompt tokens followed by the generated tokens, mapped back to
        words and joined with single spaces.

    Raises:
        ValueError: If the prompt contains no tokens.
    """
    num_classes = vocab_size + 1

    # texts_to_sequences returns one token list per input text; the prompt
    # is the only text, so its tokens are element 0.
    tokens = tokenizer.texts_to_sequences([seq])[0]
    if not tokens:
        raise ValueError("seq must contain at least one word")

    # Seed with the last prompt token so the continuation follows on from
    # the end of the prompt rather than from its first word.
    wordNum = tokens[-1]
    for _ in range(max(max_len, 0) + 1):
        one_hot = np.zeros(shape=(1, 1, num_classes))
        one_hot[0, 0, wordNum - 1] = 1
        last = model.predict(one_hot)

        # Class k stands for token id k + 1.
        wordNum = int(last[0][0].argmax()) + 1
        tokens.append(wordNum)

    # Fall back to the OOV token for a class id without a vocabulary entry.
    return " ".join(reverse_word_map.get(token, tokenizer.oov_token)
                    for token in tokens)

In [None]:
# generation of new text based on probability distribution sampling method
def genProbDistRNN(model, seq, max_len=20):
    """Continue a prompt by sampling each next word from the model's output.

    Every prediction is made from the one-hot encoding of the single
    preceding word, passed to ``model.predict`` as an array of shape
    ``(1, 1, vocab_size + 1)``. The next word is drawn at random using the
    predicted values as probabilities. Generation starts from the LAST word
    of the prompt.

    Args:
        model: Model whose ``predict`` maps that array to per-class
            probabilities, read from ``[0][0]`` of the result.
        seq: Prompt text; it is tokenized with the module-level ``tokenizer``.
        max_len: Number of words generated after the first generated word,
            so ``max_len + 1`` words are appended in total.

    Returns:
        The prompt tokens followed by the generated tokens, mapped back to
        words and joined with single spaces.

    Raises:
        ValueError: If the prompt contains no tokens.
    """
    num_classes = vocab_size + 1

    # texts_to_sequences returns one token list per input text; the prompt
    # is the only text, so its tokens are element 0.
    tokens = tokenizer.texts_to_sequences([seq])[0]
    if not tokens:
        raise ValueError("seq must contain at least one word")

    # Seed with the last prompt token so the continuation follows on from
    # the end of the prompt rather than from its first word.
    wordNum = tokens[-1]
    for _ in range(max(max_len, 0) + 1):
        one_hot = np.zeros(shape=(1, 1, num_classes))
        one_hot[0, 0, wordNum - 1] = 1
        last = model.predict(one_hot)

        # Class k stands for token id k + 1.
        wordNum = int(np.random.choice(num_classes, 1, p=last[0][0])[0]) + 1
        tokens.append(wordNum)

    # Sampling can pick any class, including one without a vocabulary entry;
    # render such ids as the OOV token instead of raising KeyError.
    return " ".join(reverse_word_map.get(token, tokenizer.oov_token)
                    for token in tokens)

In [None]:
# Test data is loaded and saved from:
# tfds.load('amazon_us_reviews/Digital_Video_Games_v1_00', split='train', shuffle_files=True, download=True, batch_size=-1)

# preprocess test data
# Collect numResults prompts: the first 19 whitespace-separated words of the
# first line of ./data/dat_0.txt, ./data/dat_1.txt, ... Files whose first
# line has fewer than 19 words (including empty files) are skipped. Running
# out of files raises FileNotFoundError.
numResults = 25
inputs = []
i = 0

while len(inputs) < numResults:
    # `with` closes every file again; only the first line is needed. The line
    # is kept in its own variable so the dataset held in `data` is not
    # overwritten.
    with open("./data/dat_" + str(i) + ".txt", "r") as text_file:
        first_line = text_file.readline()
    dataTok = first_line.split()
    if len(dataTok) >= 19:
        inputs.append(' '.join(dataTok[:19]))
    i += 1

In [None]:
# generate new texts
# Build a network with the same layers as the training model and load the
# weights saved after epoch 65 into it.
model_out_rnn = Sequential()
model_out_rnn.add(
    SimpleRNN(100,
              input_shape=(None, vocab_size + 1),
              return_sequences=True,
              activation='tanh',
              name='hidden'))
model_out_rnn.add(
    TimeDistributed(Dense(vocab_size + 1, activation='softmax', name='output')))

# Heading wrapped in the ANSI escape codes for red text.
print("\n\x1b[31mModel after training 65 epochs\x1b[0m\n")
model_out_rnn.load_weights("./model_rnn_weights.065.hdf5")

# For every prompt: the prompt itself, its greedy continuation, its sampled
# continuation, then an empty separator line.
for seq in inputs:
    print(seq)
    print(genArgMaxRNN(model_out_rnn, seq, max_len=20))
    print(genProbDistRNN(model_out_rnn, seq, max_len=25))
    print()