# RNN Model Training with Self-Trained GloVe Embedding

**Karina Huang**

**May 11, 2019**

This notebook records the training history of all models explored for the Title Generation project. 

---

In [1]:
#load packages
import json
import pickle
import string
import pandas as pd
import numpy as np
from numpy import argmax
from keras.preprocessing.sequence import pad_sequences
import re
import sys
import random
from collections import Counter, defaultdict
from tensorflow.contrib import keras
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import backend as K
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

#models 
from rnn_model import getBaseModel, getBidirectionalModel, getAttentionModel, getAttentionLSTMModel

Using TensorFlow backend.


In [2]:
#directory prefix for every artefact read below (empty string = current working directory)
histPath = ''

#each split file holds one pickled (inputs, targets) pair
#NOTE: pickle.load can execute arbitrary code -- only read files you produced yourself
with open(histPath+'train.txt', "rb") as trainFile, \
     open(histPath+'val.txt', "rb") as valFile, \
     open(histPath+'test.txt', "rb") as testFile:
    (trainX, trainY), (valX, valY), (testX, testY) = (
        pickle.load(handle) for handle in (trainFile, valFile, testFile)
    )

#vocabulary look-up tables (JSON, so the keys of idx2word are strings)
with open(histPath+'word2idx_master.json', 'r') as word2idxFile:
    word2idx = json.load(word2idxFile)
with open(histPath+'idx2word_master.json', 'r') as idx2wordFile:
    idx2word = json.load(idx2wordFile)

#embedding matrix saved as a NumPy array
embeddMatrix = np.load(histPath+'embeddMatrix.npy')

In [3]:
#---- general training settings ----
seed = 209
#dropout / regularisation strengths, all switched off
p_W = p_U = p_dense = p_emb = weight_decay = 0
#NOTE(review): LR is not referenced in the visible cells; the models receive `learning_rate` below
LR = 1e-4
batch_size = 32

#---- batch bookkeeping ----
num_train_batches = len(trainX) // batch_size
num_val_batches = len(valX) // batch_size
#validation samples plus the training samples left over after the last full training batch
num_val_samples = len(valX) + len(trainX) % batch_size
total_entries = batch_size*(num_train_batches + num_val_batches)

#---- sequence lengths ----
#maximum length for abstract
aMaxLen = 250
#maximum length for title
tMaxLen = 250
#total maximum length (abstract followed by title)
maxlen = tMaxLen + aMaxLen

batch_norm = False

#---- model dimensions, read off the embedding matrix ----
nUnique = embeddMatrix.shape[0]
embeddDim = embeddMatrix.shape[1]
hidden_units = embeddDim

#---- optimiser settings ----
learning_rate = 0.002
clip_norm = 1.0

---

## I. Data Generator

In [4]:
#padding function for abstracts
def padAbstract(x, maxL = aMaxLen, dictionary = word2idx):
    """Left-pad (or left-truncate) one tokenized abstract and append the '*' token.

    Parameters
    ----------
    x : list of token indices for one abstract
    maxL : number of abstract tokens to keep; longer inputs keep only their last maxL tokens
    dictionary : mapping that provides the indices of '_' (padding) and '*'

    Returns
    -------
    list of length maxL + 1: padding, then the abstract tokens, then the '*' index
    """
    kept = min(len(x), maxL)
    if len(x) > maxL:
        #keep the tail of an over-long abstract
        x = x[-maxL:]
    left_padding = [dictionary['_']]*(maxL - kept)
    return left_padding + x + [dictionary['*']]

#build generator for model
def generator(trainX, trainY, batch_size = batch_size, 
              nb_batches = None, model = None, seed = seed):
    """Endlessly yield batches built by conv_seq_labels from randomly drawn samples.

    Parameters
    ----------
    trainX, trainY : parallel sequences of tokenized abstracts and titles
    batch_size : number of samples drawn (with replacement) per batch
    nb_batches : if given, the batch counter wraps around after this many batches,
                 so the same sequence of batches repeats
    model : unused
    seed : base value for the per-batch random seed

    Yields
    ------
    whatever conv_seq_labels(abstracts, titles) returns for the drawn batch
    """
    batch_counter = nb_batches if nb_batches else 0
    while True:
        if nb_batches and batch_counter >= nb_batches:
            batch_counter = 0

        #draw the seed used to restore randomness afterwards BEFORE fixing the batch seed
        restore_seed = random.randint(0, sys.maxsize)
        random.seed(batch_counter + 123456789 + seed)

        batch_abstracts, batch_titles = [], []
        for _ in range(batch_size):
            pick = random.randint(0, len(trainX) - 1)

            #cut-off is drawn between the maximum length and the actual length
            abstract = trainX[pick]
            low, high = sorted((aMaxLen, len(abstract)))
            cut = random.randint(low, high)
            batch_abstracts.append(abstract[:cut])

            title = trainY[pick]
            low, high = sorted((tMaxLen, len(title)))
            cut = random.randint(low, high)
            batch_titles.append(title[:cut])

        #re-seed before yielding so the caller's random state is not pinned to ours
        batch_counter += 1
        random.seed(restore_seed)

        yield conv_seq_labels(batch_abstracts, batch_titles)

#pad sequence and convert title to labels
def conv_seq_labels(abstracts, titles, nflips = None, model = None, dictionary = word2idx):
    """Convert a batch of tokenized abstracts/titles into model inputs and one-hot labels.

    Parameters
    ----------
    abstracts : list of token-index lists, one per sample
    titles : list of token-index lists, parallel to `abstracts`
    nflips : unused
    model : unused
    dictionary : mapping that provides the indices of '_' (padding) and '*' (end of sequence)

    Returns
    -------
    ([encoder_input, decoder_input], y)
        encoder_input : (batch, aMaxLen) left-padded abstract tokens
        decoder_input : (batch, maxlen - aMaxLen) the '*' token followed by the title
                        tokens, right-padded with '_'
        y             : (batch, tMaxLen, nUnique) one-hot title tokens followed by '*'
                        and padding, i.e. the decoder input shifted left by one step
    """
    batch_size = len(titles)
    pad_idx = dictionary['_']
    eos_idx = dictionary['*']

    #pass `dictionary` (and the abstract length) through explicitly so that a caller-supplied
    #dictionary is honoured for the abstract padding as well, not only for the titles
    x = [padAbstract(a, maxL = aMaxLen, dictionary = dictionary) + t
         for a, t in zip(abstracts, titles)]
    x = sequence.pad_sequences(x, maxlen = maxlen, value = pad_idx,
                               padding = 'post', truncating = 'post')

    y = np.zeros((batch_size, tMaxLen, nUnique))
    for i, it in enumerate(titles):
        #title, then eos, then enough padding to always reach tMaxLen before truncating
        it = (it + [eos_idx] + [pad_idx]*tMaxLen)[:tMaxLen]
        y[i,:,:] = np_utils.to_categorical(it, nUnique)

    #two inputs (abstract; eos + title) plus the one-hot encoded title labels
    return [x[:,:aMaxLen],x[:,aMaxLen:]], y

In [5]:
#draw one batch from a fresh generator and inspect it
check = next(generator(trainX, trainY, batch_size = batch_size))
(encoder_batch, decoder_batch), label_batch = check
print(encoder_batch.shape, decoder_batch.shape, label_batch.shape)
#decode the second sample of the batch (row 1) back into words
print("Abstract  : ", [idx2word[str(token)] for token in encoder_batch[1]])
print("Title  : ", [idx2word[str(token)] for token in decoder_batch[1]])

(32, 250) (32, 250) (32, 250, 32471)
Abstract  :  ['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'with', 'the', 'increase', 'in', 'available', 'data', 'parallel', 'machine', 'learning', 'has', '<ign>', '<ign>', 'become', 'an', 'increasingly', 'pressing', 'problem.', 'in', 'this', 'paper', 'we', 'present', '<ign>', '<ign>', 'the', 'first', 'parallel', 'stochastic', 'gradient', 'descent', 'algorithm', 'including', 'a', '<ign>', '<ign>', 'detailed', 'anal

In [6]:
#training generator: no batch limit, so its batch counter never wraps
genTrain = generator(trainX, trainY,
                     batch_size = batch_size)
#validation generator: wraps after one pass worth of full validation batches
genVal = generator(valX, valY,
                   batch_size = batch_size,
                   nb_batches = len(valX)// batch_size)

---

## II. Base Model

Encoder-decoder model with a single bidirectional LSTM layer in the encoder and a unidirectional LSTM layer in the decoder.

In [None]:
# K.clear_session()

# #base model
# rnn_base = getBaseModel(genTrain, genVal, embeddMatrix, 
#                         learning_rate, clip_norm, nUnique,
#                         embeddDim, hidden_units)
# #base model summary
# rnn_base.summary()

In [None]:
# #train base model
# filepath = 'rnn_training_history/'

# checkpoint = ModelCheckpoint(filepath + 'rnn_base.h5', monitor = 'val_loss', 
#                              verbose = 1, save_best_only = True, mode = 'min')
# callbacks_list = [checkpoint]

# #fit base model
# rnn_base.fit_generator(genTrain,
#                        steps_per_epoch = num_train_batches,
#                        epochs = 10, 
#                        validation_data = genVal,
#                        validation_steps = num_val_batches,
#                        callbacks = callbacks_list)

In [None]:
# #save model weights
# rnn_base.save_weights(filepath+'rnn_base_weights.h5')

In [7]:
#restore the saved base model from disk and print its architecture
base_model_path = 'rnn_training_history/rnn_base.h5'
rnn_base = load_model(base_model_path)
rnn_base.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
encoder_embedd (Embedding)      (None, 250, 100)     3247100     encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) [(None, 200), (None, 160800      encoder_embedd[0][0]             
__________________________________________________________________________________________________
decoder_em

---
## III. Bidirectional Model

Replaced the unidirectional LSTM layer in the decoder of the base model with a bidirectional one.

In [None]:
# #train bidirectional model
# rnn_bidirectional = getBidirectionalModel(genTrain, genVal, embeddMatrix, 
#                                           learning_rate, clip_norm, nUnique,
#                                           embeddDim, hidden_units)

In [None]:
# # define the checkpoint
# filepath = "rnn_training_history/rnn_bidirectional_checkpoint.h5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# callbacks_list = [checkpoint]

# #fit model
# rnn_bidirectional.fit_generator(genTrain,
#                                 steps_per_epoch = num_train_batches,
#                                 epochs = 50, 
#                                 validation_data = genVal,
#                                 validation_steps = num_val_batches,
#                                 callbacks = callbacks_list)

In [None]:
# #save model and model weights
# rnn_bidirectional.save('rnn_training_history/rnn_bidirectional.h5')
# rnn_bidirectional.save_weights('rnn_training_history/rnn_bidirectional_weights.h5')

In [8]:
#restore the bidirectional model from its checkpoint file and print its architecture
bidirectional_model_path = 'rnn_training_history/rnn_bidirectional_checkpoint.h5'
rnn_bidirectional = load_model(bidirectional_model_path)
rnn_bidirectional.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
encoder_embedd (Embedding)      (None, 250, 100)     3247100     encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_embedd (Embedding)      (None, 250, 100)     3247100     decoder_input[0][0]              
__________________________________________________________________________________________________
bidirectio

---
## IV. Attention Model

Incorporated an attention/context mechanism into the base model. The attention mechanism combines the forward-LSTM encoder outputs with the decoder outputs.

In [None]:
# #train attention model - LSTM
# rnn_attention_lstm = getAttentionLSTMModel(genTrain, genVal, embeddMatrix, 
#                                            learning_rate, clip_norm, nUnique,
#                                            embeddDim, hidden_units)

In [None]:
# # define the checkpoint
# filepath = "rnn_training_history/rnn_attention_lstm_checkpoint.h5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# callbacks_list = [checkpoint]

# #fit model
# rnn_attention_lstm.fit_generator(genTrain,
#                                  steps_per_epoch = num_train_batches,
#                                  epochs = 20, 
#                                  validation_data = genVal,
#                                  validation_steps = num_val_batches,
#                                  callbacks = callbacks_list)

In [None]:
# #save model and model weights
# rnn_attention_lstm.save('rnn_training_history/rnn_attention_lstm.h5')
# rnn_attention_lstm.save_weights('rnn_training_history/rnn_attention_lstm_weights.h5')

In [9]:
#restore the saved attention-LSTM model from disk and print its architecture
attention_lstm_model_path = 'rnn_training_history/rnn_attention_lstm.h5'
rnn_attention_lstm = load_model(attention_lstm_model_path)
rnn_attention_lstm.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
encoder_embedd (Embedding)      (None, 250, 100)     3247100     encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 250, 100), ( 80400       encoder_embedd[0][0]             
__________________________________________________________________________________________________
lstm_2 (LS

---
## V. Attention Model - Bi-LSTM

Incorporated an attention/context mechanism into the bidirectional model. The attention mechanism combines the bidirectional-LSTM encoder outputs with the decoder outputs.

In [None]:
# K.clear_session()

In [None]:
# #train model with attention
# rnn_attention = getAttentionModel(genTrain, genVal, embeddMatrix, 
#                                   learning_rate, clip_norm, nUnique,
#                                   embeddDim, hidden_units)

# rnn_attention.summary()

In [None]:
# # define the checkpoint
# filepath = "rnn_training_history/rnn_model_attention_2.h5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# callbacks_list = [checkpoint]

# #fit model
# rnn_attention.fit_generator(genTrain,
#                             steps_per_epoch = num_train_batches,
#                             epochs = 50, 
#                             validation_data = genVal,
#                             validation_steps = num_val_batches,
#                             callbacks = callbacks_list)

In [None]:
# #save model and model weights
# rnn_attention.save('rnn_training_history/rnn_attention_2.h5')
# rnn_attention.save_weights('rnn_training_history/rnn_attention_2_weights.h5')

In [10]:
#restore the saved attention Bi-LSTM model from disk and print its architecture
attention_bilstm_model_path = 'rnn_training_history/rnn_attention_2.h5'
rnn_attention_bilstm = load_model(attention_bilstm_model_path)
rnn_attention_bilstm.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
encoder_embedd (Embedding)      (None, 250, 100)     3247100     encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) [(None, 250, 200), ( 160800      encoder_embedd[0][0]             
__________________________________________________________________________________________________
decoder_em

---
## VI. Predictions

In [11]:
def getPrediction(model, seq, idx2word, tMaxLen, 
                  num_iteration, greedy = True, latitude = 5, eos_idx = 1):
    '''
    Predict a title for a single abstract, one token at a time.
    
    Parameters:
    ===========
    1)model: rnn model; model.predict([encoder_input, decoder_input]) must return
             an array indexable as [0, position] -> scores over the vocabulary
    2)seq: a single tokenized and padded abstract (1-D sequence of indices)
    3)idx2word: dictionary for index to word, keyed by the index as a string
    4)tMaxLen: maximum length of title, should match with training title input
    5)num_iteration: maximum length allowed for title prediction
                     (capped at tMaxLen, the number of decoder positions)
    6)greedy: if True take the most probable word at each step, otherwise
              sample uniformly among the `latitude` most probable words
    7)latitude: for non-greedy search, how many top words to consider for random choice
    8)eos_idx: index of the end-of-sequence token; defaults to 1, the value the
               stop condition has always used (assumed to equal word2idx['*'] -- confirm)
    
    Returns:
    ========
    list of predicted words, without the end-of-sequence token
    '''
    
    #cache list of prediction
    prediction = list()
    
    #decoder input: conv_seq_labels trains the decoder on [eos] + title, so the
    #sequence starts with eos and the word predicted for position i is fed back
    #at position i + 1; remaining positions stay 0
    init = np.zeros(tMaxLen)
    init[0] = eos_idx
    
    #the encoder input never changes, so shape it once
    encoder_input = np.reshape(seq, (1, -1))
    
    #index chosen at the previous step (used to forbid immediate repeats)
    prev_idx = None
             
    #for maximum prediction length (never beyond the decoder's positions)
    for i in range(min(num_iteration, tMaxLen)):
        #get prediction probabilities for all unique words
        predRNN = model.predict([encoder_input, init.reshape(1, tMaxLen)])
        
        if greedy:
            
            #most probable word at this position
            idx = np.argmax(predRNN[0, i])
            
        else:
            
            #get top number of words
            idxV = np.argsort(predRNN[0, i])[-latitude: ]
            #never start with eos, and never repeat the previous word
            banned = eos_idx if i == 0 else prev_idx
            allowed = idxV[idxV != banned]
            #if every candidate is banned (e.g. latitude = 1) fall back to the
            #unfiltered candidates instead of resampling forever
            idx = np.random.choice(allowed if len(allowed) else idxV)
        
        #if eos tag is predicted
        #break out of loop
        if idx == eos_idx:
            break
        
        #index to word
        prediction.append(idx2word[str(idx)])
        prev_idx = idx
        #feed the chosen word back to the decoder for the next step
        if i + 1 < tMaxLen:
            init[i + 1] = idx
            
    return prediction

In [None]:
#reload the saved base model from disk
rnn_base = load_model('rnn_training_history/rnn_base.h5')

#get predictions for base model: ground truth plus both decoding modes

basePred = defaultdict(list)

#left-pad every test abstract to 250 tokens
testX_padded = sequence.pad_sequences(testX, 250, value = word2idx['_'], padding = 'pre')

#only the first 21 test samples are decoded
for i in range(21):

    seq = testX_padded[i]
    truth = [idx2word[str(x)] for x in testY[i]]
    #greedy decoding first, then random choice among the top words
    greedy_pred = getPrediction(rnn_base, seq, idx2word, 250, 20, greedy = True)
    non_greedy_pred = getPrediction(rnn_base, seq, idx2word, 250, 20, greedy = False)

    #store each result as a space-separated string
    for key, words in (('Truth', truth),
                       ('Greedy', greedy_pred),
                       ('Non-Greedy', non_greedy_pred)):
        basePred[key].append(' '.join(words))

    #checkpoint
    print(i)
#     if i % 100 == 0:
#         print(i // 100)

In [None]:
#get predictions for bidirectional model: ground truth plus both decoding modes

biLSTMPred = defaultdict(list)

#left-pad every test abstract to 250 tokens
testX_padded = sequence.pad_sequences(testX, 250, value = word2idx['_'], padding = 'pre')

#only the first 21 test samples are decoded
for i in range(21):

    seq = testX_padded[i]
    truth = [idx2word[str(x)] for x in testY[i]]
    #greedy decoding first, then random choice among the top words
    greedy_pred = getPrediction(rnn_bidirectional, seq, idx2word, 250, 20, greedy = True)
    non_greedy_pred = getPrediction(rnn_bidirectional, seq, idx2word, 250, 20, greedy = False)

    #store each result as a space-separated string
    for key, words in (('Truth', truth),
                       ('Greedy', greedy_pred),
                       ('Non-Greedy', non_greedy_pred)):
        biLSTMPred[key].append(' '.join(words))

    #checkpoint
    print(i)

In [None]:
#show the reference title next to both decoded titles for every stored sample
for i, truth_title in enumerate(biLSTMPred['Truth']):
    print('Truth: ', truth_title)
    print('Greedy Search: ', biLSTMPred['Greedy'][i])
    print('Non-Greedy Search: ', biLSTMPred['Non-Greedy'][i])
    print('------------------------------------------------')

In [None]:
#get predictions for attention lstm model: ground truth plus both decoding modes

attLSTMPred = defaultdict(list)

#left-pad every test abstract to 250 tokens
testX_padded = sequence.pad_sequences(testX, 250, value = word2idx['_'], padding = 'pre')

#only the first 21 test samples are decoded
for i in range(21):

    seq = testX_padded[i]
    truth = [idx2word[str(x)] for x in testY[i]]
    #greedy decoding first, then random choice among the top words
    greedy_pred = getPrediction(rnn_attention_lstm, seq, idx2word, 250, 20, greedy = True)
    non_greedy_pred = getPrediction(rnn_attention_lstm, seq, idx2word, 250, 20, greedy = False)

    #store each result as a space-separated string
    for key, words in (('Truth', truth),
                       ('Greedy', greedy_pred),
                       ('Non-Greedy', non_greedy_pred)):
        attLSTMPred[key].append(' '.join(words))

    #checkpoint
    print(i)

In [None]:
#show the reference title next to both decoded titles for every stored sample
for i, truth_title in enumerate(attLSTMPred['Truth']):
    print('Truth: ', truth_title)
    print('Greedy Search: ', attLSTMPred['Greedy'][i])
    print('Non-Greedy Search: ', attLSTMPred['Non-Greedy'][i])
    print('------------------------------------------------')

---
## VII. Random Predictions

Predictions for out-of-dataset text files. 

In [12]:
def predictRandomText(txt, model, greedy = True):
    '''
    Predict a title for an arbitrary piece of text.
    
    Parameters:
    ===========
    1)txt: raw text string; it is lower-cased and split on single spaces
    2)model: rnn model passed on to getPrediction
    3)greedy: True for greedy search, False for random choice among the top words
    
    Returns:
    ========
    list of predicted words, as returned by getPrediction
    '''
    
    #words missing from the vocabulary are mapped to the '<ign>' token;
    #an explicit dictionary look-up replaces the former bare `except`, which
    #would also have hidden any unrelated error
    unknown = word2idx['<ign>']
    #format input text string and tokenize it
    tokenized = [word2idx.get(w, unknown) for w in txt.lower().split(' ')]
    #pad sequence for prediction (left-padded to 250 tokens)
    tokenized_padded = sequence.pad_sequences([tokenized], 250, value = word2idx['_'], padding = 'pre')
    #return prediction given mode
    return getPrediction(model, tokenized_padded[0], idx2word, 250, 20, greedy = bool(greedy))