# RNN Model 

In [1]:
import json
import pickle
import pandas as pd
import gensim as gs
import pandas as pd
import numpy as np
import scipy as sc
import nltk
import os
from nltk.tokenize import word_tokenize as wt
from nltk.tokenize import sent_tokenize as st
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.sequence import pad_sequences
import logging
import re
import sys
import random
from collections import Counter
from tensorflow.contrib import keras
from keras.preprocessing import sequence
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.layers import Dense,LSTM,Input,Activation,Add,TimeDistributed,\
Permute,Flatten,RepeatVector,merge,Lambda,Multiply,Reshape
from keras.layers.wrappers import TimeDistributed
from keras.layers.embeddings import Embedding
from keras.models import Sequential,Model
from keras.optimizers import RMSprop
from keras.regularizers import l2
from keras import backend as K
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

Using TensorFlow backend.


In [2]:
#load data
histPath = 'rnn_training_history/'

#load training data: every split file is unpickled into one (X, Y) pair
#NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself
with open(histPath+'train.txt', "rb") as f1, \
     open(histPath+'val.txt', "rb") as f2, \
     open(histPath+'test.txt', "rb") as f3:
    (trainX, trainY), (valX, valY), (testX, testY) = [pickle.load(f) for f in (f1, f2, f3)]

#load dictionaries (idx2word comes from JSON, so it is looked up with string keys later on)
with open(histPath+'word2idx_master.json', 'r') as f1, \
     open(histPath+'idx2word_master.json', 'r') as f2:
    word2idx, idx2word = json.load(f1), json.load(f2)

#load embedding matrix
embeddMatrix = np.load(histPath+'embeddMatrix.npy')

In [3]:
#params for model training
#base seed mixed into the per-batch seeding of the data generator
seed = 209
#p_emb is handed to the embedding layer as its dropout rate and weight_decay drives
#the L2 regularizer built at the bottom of this cell; p_W, p_U and p_dense are not
#referenced in the visible cells (presumably LSTM/dense dropout rates -- TODO confirm)
p_W, p_U, p_dense, p_emb, weight_decay = 0, 0, 0, 0, 0
#NOTE(review): LR is not referenced in the visible cells; the optimizer is built
#from learning_rate below -- confirm which value is intended
LR = 1e-4
batch_size = 64
#number of full batches per pass over the training / validation data
num_train_batches = len(trainX) // batch_size
#validation samples plus the training samples left over after the last full training batch
num_val_samples = len(valX) + len(trainX) - batch_size*num_train_batches
num_val_batches = len(valX) // batch_size
total_entries = (num_train_batches + num_val_batches)*batch_size

#maximum length for title 
# tMaxLen = 20
tMaxLen = 50
#maximum length for abstract
aMaxLen = 50
#total maximum length
maxlen = tMaxLen + aMaxLen

batch_norm=False

#embeddMatrix has one row per vocabulary entry and one column per embedding dimension
embeddDim = embeddMatrix.shape[1]
nUnique = embeddMatrix.shape[0]
#the LSTM state size is tied to the embedding dimension
hidden_units= embeddDim

#optimizer settings used when the model is compiled
learning_rate = 0.002
clip_norm = 1.0
#l2 must be imported from keras.regularizers: it is only evaluated when weight_decay
#is non-zero, so a missing import stays hidden until weight decay is switched on
regularizer = l2(weight_decay) if weight_decay else None

---

## I. Data Generator

In [8]:
#padding function for abstracts
def padAbstract(x, maxL = aMaxLen, dictionary = word2idx):
    """Left-pad (or left-truncate) an abstract and append the end-of-sequence id.

    Args:
        x: list of token ids for one abstract.
        maxL: number of abstract positions to keep; longer inputs keep their
            last `maxL` tokens.
        dictionary: mapping that provides the ids of the padding token '_'
            and the end-of-sequence token '*'.

    Returns:
        A list made of padding ids, the retained tokens and one trailing
        end-of-sequence id.
    """
    # keep only the tail of abstracts that are too long
    kept = x[-maxL:] if len(x) > maxL else x
    # a non-positive count simply yields an empty padding prefix
    left_pad = [dictionary['_']] * (maxL - len(kept))
    return left_pad + kept + [dictionary['*']]

#build generator for model
def generator(trainX, trainY, batch_size = batch_size, 
              nb_batches = None, model = None, seed = seed):
    """Endlessly yield randomly sampled batches converted by conv_seq_labels.

    Args:
        trainX: indexable collection of abstracts (token-id sequences).
        trainY: titles aligned index-by-index with `trainX`.
        batch_size: number of samples drawn (with replacement) per batch.
        nb_batches: when truthy, the batch counter wraps after this many
            batches, so the same sequence of batches is replayed; otherwise
            the counter keeps growing.
        model: unused.
        seed: base value mixed into the per-batch random seed.

    Yields:
        Whatever conv_seq_labels returns for the sampled abstracts and titles.
    """
    # starting at nb_batches makes the very first iteration wrap around to 0
    step = nb_batches if nb_batches else 0
    while True:
        if nb_batches and step >= nb_batches:
            step = 0

        # draw a value from the caller's random stream first, so that stream can
        # be re-seeded from it once this batch has been sampled
        restore_seed = random.randint(0, sys.maxsize)
        # the batch content is a deterministic function of (step, seed)
        random.seed(step + 123456789 + seed)

        abstracts, titles = [], []
        for _ in range(batch_size):
            idx = random.randint(0, len(trainX) - 1)
            abstract, title = trainX[idx], trainY[idx]

            # random cut-off between the configured maximum and the real length
            lo, hi = sorted((aMaxLen, len(abstract)))
            abstracts.append(abstract[:random.randint(lo, hi)])

            lo, hi = sorted((tMaxLen, len(title)))
            titles.append(title[:random.randint(lo, hi)])

        # undo the seeding before we yield in order not to affect the caller
        step += 1
        random.seed(restore_seed)

        yield conv_seq_labels(abstracts, titles)

#pad sequence and convert title to labels
def conv_seq_labels(abstracts, titles, nflips = None, model = None, dictionary = word2idx):
    """Turn a batch of abstracts and titles into model inputs and one-hot labels.

    Args:
        abstracts: list of abstracts, each a list of token ids.
        titles: list of titles (lists of token ids), aligned with `abstracts`.
        nflips: unused.
        model: unused.
        dictionary: mapping that provides the ids of the padding token '_'
            and the end-of-sequence token '*'. It is used for every padding
            step, including the abstract padding done by padAbstract.

    Returns:
        ([encoder_input, decoder_input], y) where
          * encoder_input is the first aMaxLen columns of the padded sequence
            (the left-padded abstract),
          * decoder_input is the remaining columns: the end-of-sequence id that
            padAbstract appends, followed by the title, right-padded/truncated,
          * y has shape (batch, tMaxLen, nUnique) and one-hot encodes the title
            followed by end-of-sequence and padding ids.
    """
    batch_size = len(titles)
    pad_id, eos_id = dictionary['_'], dictionary['*']

    # forward the dictionary so abstract padding and title padding use the same ids
    x = [padAbstract(a, dictionary = dictionary) + t for a, t in zip(abstracts, titles)]
    x = sequence.pad_sequences(x, maxlen = maxlen, value = pad_id,
                               padding = 'post', truncating = 'post')

    y = np.zeros((batch_size, tMaxLen, nUnique))
    for i, it in enumerate(titles):
        # the target ends with an eos; the extra padding guarantees at least tMaxLen ids
        it = it + [eos_id] + [pad_id]*tMaxLen
        it = it[:tMaxLen]
        y[i,:,:] = np_utils.to_categorical(it, nUnique)

    # two inputs (abstract, eos-prefixed title) plus the one-hot encoded title labels
    return [x[:,:aMaxLen],x[:,aMaxLen:]], y

In [9]:
#check generator: pull one batch, show the array shapes and decode sample row 5
check = next(generator(trainX, trainY, batch_size = batch_size))
print(*(arr.shape for arr in (*check[0], check[1])))
#idx2word is keyed by strings, hence the str() around every id
for label, batch in (("Abstract  : ", check[0][0]), ("Title  : ", check[0][1])):
    print(label, [idx2word[str(i)] for i in batch[5]])

(64, 50) (64, 50) (64, 50, 32471)
Abstract  :  ['and', 'surface', 'initialization', 'at', 'test', 'time,', 'without', 'manual', 'effort.', 'self-supervision', 'by', 'back-propagating', 'through', 'differentiable', 'rendering', 'allows', '<ign>', 'adaptation', 'of', 'the', 'model', 'to', 'the', 'test', 'data,', 'and', 'offers', 'much', 'tighter', 'fit', 'than', 'a', 'pretrained', 'fixed', 'model.', 'we', 'show', 'that', 'the', 'proposed', 'model', 'improves', 'with', 'experience', 'and', 'converges', 'to', 'low-error', 'solutions', 'where']
Title  :  ['*', 'self-supervised', 'learning', 'of', 'motion', 'capture', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_']


In [10]:
#generator for training and validation
#training: the batch counter never wraps, so new batches keep being sampled
genTrain = generator(trainX, trainY,
                     batch_size = batch_size)
#validation: the counter wraps after one pass worth of batches, replaying the same batches
genVal = generator(valX, valY,
                   batch_size = batch_size,
                   nb_batches = len(valX)// batch_size)

---

## II. Train Model

In [11]:
#single layer LSTM 
def encoder_decoder(genTrain, genVal, mode = 'fit', num_epochs = 1, 
                    en_shape = aMaxLen, de_shape = tMaxLen):
    """Build and train an LSTM encoder-decoder, then derive inference models.

    The encoder runs one forward and one backward LSTM over the embedded
    abstract and sums their final states; those states initialise the decoder
    LSTM, whose per-step outputs are projected onto the vocabulary and passed
    through a softmax.

    Args:
        genTrain: generator yielding ([encoder_ids, decoder_ids], one_hot_labels)
            batches for training.
        genVal: generator with the same output format, used for validation.
        mode: unused.
        num_epochs: number of times fit_generator is called (see the note at
            the training loop about the epochs each call runs).
        en_shape: length of the encoder input sequence.
        de_shape: length of the decoder input sequence.

    Returns:
        (model, encoder_model_inf, decoder_model_inf):
          * model: the trained training-time model,
          * encoder_model_inf: maps encoder input to [state_h, state_c],
          * decoder_model_inf: maps [decoder input, state_h, state_c] to
            [softmax over the vocabulary per step, state_h, state_c].
    """
    print('Encoder_Decoder LSTM...')

    # ----- encoder -----
    encoder_inputs = Input(shape=(en_shape,))
    print(encoder_inputs)

    #APPLY EMBEDDING LAYER. https://keras.io/layers/embeddings/
    # The layer is shared by encoder and decoder, so no fixed input_length is set:
    # a fixed length would reject a decoder sequence whose length differs from
    # the encoder's.
    input_emb = Embedding(nUnique, embeddDim,
                          W_regularizer = regularizer, dropout = p_emb, 
                          weights=[embeddMatrix], mask_zero=True,
                          name='embedding_1')

    #ENCODER LSTMs   https://keras.io/layers/recurrent/
    encoder_LSTM = LSTM(hidden_units, dropout_U = 0.2, dropout_W = 0.2 ,return_state=True)
    encoder_LSTM_rev = LSTM(hidden_units,return_state=True,go_backwards=True)

    # reverse pass first, then forward pass, over the same embedded abstract
    encoder_emb = input_emb(encoder_inputs)
    encoder_outputsR, state_hR, state_cR = encoder_LSTM_rev(encoder_emb)
    encoder_outputs, state_h, state_c = encoder_LSTM(encoder_emb)

    # combine both directions by summing their final hidden and cell states
    state_hfinal=Add()([state_h,state_hR])
    state_cfinal=Add()([state_c,state_cR])

    encoder_states = [state_hfinal,state_cfinal]

    # ----- decoder -----
    # Input to the decoder is the title sequence starting with the end-of-sequence id.
    decoder_inputs = Input(shape=(de_shape,))
    print(decoder_inputs)

    decoder_LSTM = LSTM(hidden_units,return_sequences=True,return_state=True)
    decoder_outputs, _, _ = decoder_LSTM(input_emb(decoder_inputs),initial_state=encoder_states) 

    # Dense layer with nUnique outputs applied to every time step; the softmax
    # turns each step's scores into a distribution over the vocabulary.
    # Refer https://keras.io/layers/wrappers/
    time_distributed = TimeDistributed(Dense(nUnique,
                                             W_regularizer=regularizer, b_regularizer=regularizer,
                                             name = 'timedistributed_1'))
    activation = Activation('softmax', name='activation_1')
    decoder_outputs = activation(time_distributed(decoder_outputs))

    #Model groups layers into an object with training and inference features.
    #https://www.tensorflow.org/api_docs/python/tf/keras/models/Model        
    model= Model(inputs=[encoder_inputs,decoder_inputs], outputs=decoder_outputs)

    rmsprop = RMSprop(lr = learning_rate,clipnorm = clip_norm)

    model.compile(loss='categorical_crossentropy',optimizer=rmsprop)

    # NOTE(review): every pass of this loop trains for 5 epochs, so the total is
    # 5 * num_epochs -- confirm whether epochs=num_epochs in a single call was intended.
    for epoch in range(num_epochs):
        model.fit_generator(genTrain,
                            steps_per_epoch = num_train_batches,
                            epochs=5,  #Try different epochs as hyperparameter 
                            validation_data = genVal,
                            validation_steps = num_val_batches)

    #_________________________INFERENCE MODE______________________________#  

    encoder_model_inf = Model(encoder_inputs,encoder_states)

    decoder_state_input_H = Input(shape=(hidden_units,))
    decoder_state_input_C = Input(shape=(hidden_units,)) 
    decoder_state_inputs = [decoder_state_input_H, decoder_state_input_C]
    decoder_outputs, decoder_state_h, decoder_state_c = decoder_LSTM(input_emb(decoder_inputs),
                                                                     initial_state=decoder_state_inputs)
    decoder_states = [decoder_state_h, decoder_state_c]
    # reuse the trained projection + softmax; a freshly created Dense layer here
    # would have random weights and would not output vocabulary probabilities
    decoder_outputs = activation(time_distributed(decoder_outputs))

    decoder_model_inf= Model([decoder_inputs]+decoder_state_inputs,
                             [decoder_outputs]+decoder_states)

    return model,encoder_model_inf,decoder_model_inf

In [13]:
#let's try this
# model, encoder, decoder = encoder_decoder(genTrain, genVal)