# RNN Model 

In [1]:
import json
import pickle
import gensim as gs
import pandas as pd
import numpy as np
import scipy as sc
import nltk
import os
from nltk.tokenize import word_tokenize as wt
from nltk.tokenize import sent_tokenize as st
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.sequence import pad_sequences
import logging
import re
import sys
import random
from collections import Counter
from tensorflow.contrib import keras
from keras.preprocessing import sequence
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.layers import Bidirectional, Dropout, Dense,LSTM,Input,Activation,Add,TimeDistributed,\
Permute,Flatten,RepeatVector,merge,Lambda,Multiply,Reshape, Concatenate
from keras.layers.wrappers import TimeDistributed
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential,Model
from keras.optimizers import RMSprop, Adam
from keras import backend as K
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

#models 
from rnn_model import getBaseModel

Using TensorFlow backend.


In [2]:
#load data
histPath = ''
#load training data
with open(histPath+'train.txt', "rb") as f1, open(histPath+'val.txt', "rb") as f2, open(histPath+'test.txt', "rb") as f3: 
    trainX, trainY = pickle.load(f1)
    valX, valY = pickle.load(f2)
    testX, testY = pickle.load(f3)
#load dictionaries
with open(histPath+'word2idx_master.json', 'r') as f1, open(histPath+'idx2word_master.json', 'r') as f2:
    word2idx = json.load(f1)
    idx2word = json.load(f2)

#load embedding matrix
embeddMatrix = np.load(histPath+'embeddMatrix.npy')

In [3]:
#params for model training
seed = 209
p_W, p_U, p_dense, p_emb, weight_decay = 0, 0, 0, 0, 0
LR = 1e-4
batch_size = 32

num_train_batches = len(trainX) // batch_size
num_val_samples = len(valX) + len(trainX) - batch_size*num_train_batches
num_val_batches = len(valX) // batch_size
total_entries = (num_train_batches + num_val_batches)*batch_size

#maximum length for title 
tMaxLen = 250
#maximum length for abstract
aMaxLen = 250
#total maximum length
maxlen = tMaxLen + aMaxLen

batch_norm=False

embeddDim = embeddMatrix.shape[1]
nUnique = embeddMatrix.shape[0]
hidden_units= embeddDim

learning_rate = 0.002
clip_norm = 1.0
# regularizer = l2(weight_decay) if weight_decay else None

---

## I. Data Generator

In [4]:
#padding function for abstracts
def padAbstract(x, maxL = aMaxLen, dictionary = word2idx):
    n = len(x)
    if n > maxL:
        x = x[-maxL:]
        n = maxL
    return [dictionary['_']]*(maxL - n) + x + [dictionary['*']]

#build generator for model
def generator(trainX, trainY, batch_size = batch_size, 
              nb_batches = None, model = None, seed = seed):
    
    #UNDERSTAND THIS
    c = nb_batches if nb_batches else 0
    while True:
        titles = list()
        abstracts = list()
        if nb_batches and c >= nb_batches:
            c = 0
        new_seed = random.randint(0, sys.maxsize)
        random.seed(c+123456789+seed)
        
        for b in range(batch_size):
            a = random.randint(0,len(trainX)-1)
            
            #random shuffling of data
            abstract = trainX[a]
            s = random.randint(min(aMaxLen,len(abstract)), max(aMaxLen,len(abstract)))
            abstracts.append(abstract[:s])
            
            title = trainY[a]
            s = random.randint(min(tMaxLen,len(title)), max(tMaxLen,len(title)))
            titles.append(title[:s])

        # undo the seeding before we yield in order not to affect the caller
        c+= 1
        random.seed(new_seed)

        yield conv_seq_labels(abstracts, titles)

#pad sequence and convert title to labels
def conv_seq_labels(abstracts, titles, nflips = None, model = None, dictionary = word2idx):
    """abstract and titles are converted to padded input vectors. Titles are one-hot encoded to labels."""
    batch_size = len(titles)
    
    
    x = [padAbstract(a)+t for a,t in zip(abstracts, titles)] 
    x = sequence.pad_sequences(x, maxlen = maxlen, value = dictionary['_'], 
                               padding = 'post', truncating = 'post')
        
    y = np.zeros((batch_size, tMaxLen, nUnique))
    for i, it in enumerate(titles):
        it = it + [dictionary['*']] + [dictionary['_']]*tMaxLen  # output does have a eos at end
        it = it[:tMaxLen]
        y[i,:,:] = np_utils.to_categorical(it, nUnique)
        
    #The 3 inputs are abstract, title starting with eos and a one-hot encoding of the title categorical variables.
    return [x[:,:aMaxLen],x[:,aMaxLen:]], y

In [5]:
#check generator
check = next(generator(trainX, trainY, batch_size = batch_size))
print(check[0][0].shape,check[0][1].shape,check[1].shape)
print("Abstract  : ", [idx2word[str(i)] for i in check[0][0][1]])
print("Title  : ", [idx2word[str(i)] for i in check[0][1][1]])

(32, 250) (32, 250) (32, 250, 32471)
Abstract  :  ['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'with', 'the', 'increase', 'in', 'available', 'data', 'parallel', 'machine', 'learning', 'has', '<ign>', '<ign>', 'become', 'an', 'increasingly', 'pressing', 'problem.', 'in', 'this', 'paper', 'we', 'present', '<ign>', '<ign>', 'the', 'first', 'parallel', 'stochastic', 'gradient', 'descent', 'algorithm', 'including', 'a', '<ign>', '<ign>', 'detailed', 'anal

In [6]:
#generator for training and validation
genTrain = generator(trainX, trainY, batch_size = batch_size)
genVal =  generator(valX, valY, nb_batches = len(valX)// batch_size, batch_size = batch_size)

---

## II. Base Model

In [7]:
K.clear_session()

In [10]:
#base model
rnn_base = getBaseModel(genTrain, genVal, embeddMatrix, 
                        learning_rate, clip_norm, nUnique,
                        embeddDim, hidden_units)
#base model summary
rnn_base.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
encoder_embedd (Embedding)      (None, 250, 100)     3247100     encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) [(None, 200), (None, 160800      encoder_embedd[0][0]             
__________________________________________________________________________________________________
decoder_em

In [12]:
#train base model
filepath = 'rnn_training_history/'

checkpoint = ModelCheckpoint(filepath + 'rnn_base.h5', monitor = 'val_loss', 
                             verbose = 1, save_best_only = True, mode = 'min')
callbacks_list = [checkpoint]

#fit base model
rnn_base.fit_generator(genTrain,
                       steps_per_epoch = num_train_batches,
                       epochs = 10, 
                       validation_data = genVal,
                       validation_steps = num_val_batches,
                       callbacks = callbacks_list)

Epoch 1/10

Epoch 00001: val_loss improved from inf to 6.29829, saving model to rnn_training_history/rnn_base.h5
Epoch 2/10

Epoch 00002: val_loss improved from 6.29829 to 6.19251, saving model to rnn_training_history/rnn_base.h5
Epoch 3/10

Epoch 00003: val_loss improved from 6.19251 to 6.08518, saving model to rnn_training_history/rnn_base.h5
Epoch 4/10

Epoch 00004: val_loss improved from 6.08518 to 5.97910, saving model to rnn_training_history/rnn_base.h5
Epoch 5/10

Epoch 00005: val_loss improved from 5.97910 to 5.92332, saving model to rnn_training_history/rnn_base.h5
Epoch 6/10

Epoch 00006: val_loss improved from 5.92332 to 5.90641, saving model to rnn_training_history/rnn_base.h5
Epoch 7/10

Epoch 00007: val_loss improved from 5.90641 to 5.87158, saving model to rnn_training_history/rnn_base.h5
Epoch 8/10

Epoch 00008: val_loss improved from 5.87158 to 5.86071, saving model to rnn_training_history/rnn_base.h5
Epoch 9/10

Epoch 00009: val_loss improved from 5.86071 to 5.85241, 

<keras.callbacks.History at 0x7fa3f1158438>

In [13]:
#save model weights
rnn_base.save_weights(filepath+'rnn_base_weights.h5')

In [None]:
def getPredictions(model, seq, maxLen, num_iteration, idx2word):
    '''
    Prediction for a given sequence. 
    
    Parameters:
    ===========
    1)model: rnn model
    2)seq: a single abstract, should be a vector of length 250
    3)maxLen: maximum length of predicted title
    4)idx2word: dictionary for index to word
    '''
    
    #cache list of prediction
    prediction = list()
    #initiate title to be a vector of zeros
    init = np.zeros(maxLen)
    
    #for maximum prediction length
    for i in range(num_iteration):
        #get prediction probabilities for all unique words
        predRNN = model.predict([np.reshape(seq, (1, 250)), init.reshape(1, 250)])
        #greedy mode prediction
        #update next title vector to be the predicted vector
        init = np.argmax(predRNN, axis = 2)
        #get probabilities of all unique words
        pVec = predRNN[0, 0, :]
        #get the word with maximum predicted probability as the predicted words
        idx = np.argmax(pVec)
        #index to word
        word = idx2word[str(idx)]
        #if eos tag is predicted
        #break out of loop
        if idx == 1:
            break
        prediction.append(word)
    return prediction

In [38]:
def getPrediction(model, seq, idx2word, maxLen, 
                  num_iteration, greedy = True, latitude = 5):
    '''
    Prediction for a given sequence. 
    
    Parameters:
    ===========
    1)model: rnn model
    2)seq: a single abstract, should be a vector of length 250
    3)maxLen: maximum length of predicted title
    4)idx2word: dictionary for index to word
    5)greedy: default to greedy search predictions, otherwise beam search
    6)latitude: for greedy search, how many top words to consider for random choice
    '''
    
    #cache list of prediction
    prediction = list()
    #initiate title to be a vector of zeros
    init = np.zeros(maxLen)
             
    #for maximum prediction length
    for i in range(num_iteration):
        #get prediction probabilities for all unique words
        predRNN = model.predict([np.reshape(seq, (1, 250)), init.reshape(1, 250)])
        
        if greedy:

            #update next title vector to be the predicted vector
            idx = np.argmax(predRNN[0, i])
            
        else:
            
            #get top number of words
            idxV = np.argsort(predRNN[0, i])[-latitude: ]
            #randomly choose from the top words
            idx = np.random.choice(idxV)
            if i == 0:
                while idx == 1:
                    idx = np.random.choice(idxV)
        
        #index to word
        word = idx2word[str(idx)]
        init[i] = idx
        #if eos tag is predicted
        #break out of loop
        if idx == 1:
            break
        prediction.append(word)
            
    return prediction
        

In [63]:
#check prediction 
check = testX.copy()
check = sequence.pad_sequences(check, 250, value = word2idx['_'], 
                               padding = 'pre')

#example of seq2seq prediction
#true title
print('True Title: ', [idx2word[str(m)] for m in testY[0]])

#true abstract
print()
print('True Abstract: ')
print([idx2word[str(m)] for m in check[0]])

True Title:  ['a', 'concave', 'regularization', 'technique', 'for', 'sparse', 'mixture', 'models']

True Abstract: 
['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'latent', 'variable', 'mixture', 'models', 'are', 'a', 'powerful', 'tool', 'for', 'exploring', 'the', 'structure', 'in', 'large', 'datasets.', 'a', 'common', 'challenge', 'for', 'interpreting', 'such', 'models', 'is', 'a', 'desire', 'to', 'impose', 'sparsity,', 'the', 'natural', 'assumption', 'that', 'each', 'data', 'point', 'only', 'contains', 'few', 'latent', 'features.', 'since', 'mixture', '

In [60]:
#greedy prediction
check_pred = getPrediction(rnn_base, check[0], idx2word, 250, 20, greedy = True)
check_pred

['learning', 'with', 'gaussian', 'process', 'regression']

In [61]:
#beam search prediction
check_pred = getPrediction(rnn_base, check[0], idx2word, 250, 20, greedy = False)
check_pred

['a',
 'bayesian',
 'model',
 'of',
 'latent',
 'markov',
 'models',
 'and',
 'latent',
 'dirichlet',
 'variable',
 'processes',
 'models',
 'and',
 'applications',
 'to',
 'applications',
 'on',
 'applications',
 'to']

---

In [None]:
def getAttentionModel(genTrain, genVal, embeddMatrix,
                      learning_rate, clip_norm, nUnique,
                      embeddDim, hidden_units, encoder_shape = aMaxLen,
                      decoder_shape = tMaxLen):

    '''Base Model - Code Adopted from Computefest'''

    #ENCODER
    #input shape as the vector of sequence, with length padded to 250
    encoder_inputs = Input(shape = (encoder_shape, ), name = 'encoder_input')

    #encode input with embedding layer
    #do not mask 0s because the attention layer does not allow this
    encoder_embedding = Embedding(nUnique, embeddDim,
                                  input_length = encoder_shape,
                                  weights = [embeddMatrix],
                                  mask_zero = False,
                                  name = 'encoder_embedd')(encoder_inputs)

    #1-layer bidirectional LSTM
    encoder_lstm = Bidirectional(LSTM(hidden_units, dropout_U = 0.2, dropout_W = 0.2, 
                                      return_sequences = True, return_state=True))

    #get states from Bi-LSTM
    encoder_outputs, f_h, f_c, b_h, b_c = encoder_lstm(encoder_embedding)

    #add final states together
    state_hfinal=Add()([f_h, b_h])
    state_cfinal=Add()([f_c, b_c])

    #save encoder states
    encoder_states = [state_hfinal,state_cfinal]

    #DECODER
    decoder_inputs = Input(shape = (decoder_shape, ), name = 'decoder_input')

    #encode decoder input with embedding matrix
    decoder_embedding = Embedding(nUnique, embeddDim,
                                  input_length = decoder_shape,
                                  weights = [embeddMatrix],
                                  mask_zero = False,
                                  name = 'decoder_embedd')

    #1-layer lstm
    decoder_lstm = LSTM(hidden_units,return_sequences = True, return_state=True)

    #save decoder outputs
    decoder_outputs, s_h, s_c = decoder_lstm(decoder_embedding(decoder_inputs), 
                                             initial_state = encoder_states)
  
    #ATTENTION
    attention = TimeDistributed(Dense(1, activation = 'tanh'))(encoder_outputs)
    attention = Multiply()([attention, decoder_outputs])
    attention = Activation('softmax')(attention)
    attention = Permute([2, 1])(attention)

    #time distributed layer, probability predictions for all unique words
    decoder_time_distributed = TimeDistributed(Dense(nUnique,
                                                     name = 'decoder_timedistributed'))
    decoder_activation = Activation('softmax', name = 'decoder_activation')
    decoder_outputs = decoder_activation(decoder_time_distributed(decoder_outputs))

    #MODEL
    model = Model(inputs = [encoder_inputs,decoder_inputs], outputs = decoder_outputs)
    rmsprop = RMSprop(lr = learning_rate, clipnorm = clip_norm)
    model.compile(loss = 'categorical_crossentropy',optimizer = rmsprop)
    return model

In [None]:
#encoder
def getModel(genTrain, genVal, embeddMatrix, learning_rate, clip_norm,
             encoder_shape = aMaxLen, decoder_shape = tMaxLen, 
             nUnique = nUnique, embeddDim = embeddDim, hidden_units = hidden_units):
    
    #ENCODER
    #input shape as the vector of sequence, with length padded to 250
    encoder_inputs = Input(shape = (encoder_shape, ), name = 'encoder_input')

    encoder_embedding = Embedding(nUnique, embeddDim, 
                                  input_length = encoder_shape, 
                                  weights = [embeddMatrix],
                                  mask_zero = False,
                                  name = 'encoder_embedd')(encoder_inputs)
    
    encoder_lstm = Bidirectional(LSTM(hidden_units, dropout_U = 0.20,
                                      dropout_W = 0.20, 
                                      return_sequences = True,
                                      return_state=True))
    
    encoder_outputs, f_h, f_c, b_h, b_c = encoder_lstm(encoder_embedding)
    
    state_hfinal=Add()([f_h, b_h])
    state_cfinal=Add()([f_c, b_c])
    
    encoder_states = [state_hfinal,state_cfinal]
        
    #DECODER
    decoder_inputs = Input(shape = (decoder_shape, ), name = 'decoder_input')
    decoder_embedding = Embedding(nUnique, embeddDim, 
                                  input_length = decoder_shape, 
                                  weights = [embeddMatrix],
                                  mask_zero = False,
                                  name = 'decoder_embedd')
    
    decoder_lstm = LSTM(hidden_units, dropout_U = 0.20,
                        dropout_W = 0.20,return_sequences = True, return_state=True)

  
    decoder_outputs, s_h, s_c = decoder_lstm(decoder_embedding(decoder_inputs), 
                                             initial_state = encoder_states)    
    
    #ATTENTION
    attention = TimeDistributed(Dense(1, activation = 'tanh'))(encoder_outputs)
    attention = Multiply()([attention,decoder_outputs])
    attention = Activation('softmax')(attention)
    attention = Permute([2, 1])(attention)
    
    decoder_time_distributed = TimeDistributed(Dense(nUnique,name = 'decoder_timedistributed'))
    decoder_activation = Activation('softmax', name = 'decoder_activation')
    decoder_outputs = decoder_activation(decoder_time_distributed(decoder_outputs))
    
    #MODEL
    model = Model(inputs = [encoder_inputs,decoder_inputs], outputs = decoder_outputs)
    rmsprop = RMSprop(lr = learning_rate, clipnorm = clip_norm)
    model.compile(loss = 'categorical_crossentropy',optimizer = rmsprop)
    return model, 0, 0

In [None]:
rnn, encoder, decoder = getModel(genTrain, genVal, 
                                 embeddMatrix, learning_rate, clip_norm)

In [None]:
rnn.summary()

In [None]:
# define the checkpoint
filepath = "rnn_model_0509_checkpoint.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

#fit model
rnn.fit_generator(genTrain,
                  steps_per_epoch = num_train_batches,
                  epochs=10, 
                  validation_data = genVal,
                  validation_steps = num_val_batches,
                  callbacks=callbacks_list)

In [None]:
#save models
# rnn.save_weights('rnn_weights_0509.h5')
# encoder.save('encoder.h5')
# decoder.save('decoder.h5')

In [None]:
# rnn.summary()

## III. Predictions

In [None]:
idx2word

In [None]:
def getPredictions(model, seq, maxLen, num_iteration, idx2word):
    '''
    Prediction for a given sequence. 
    
    Parameters:
    ===========
    1)model: rnn model
    2)seq: a single abstract, should be a vector of length 250
    3)maxLen: maximum length of predicted title
    4)idx2word: dictionary for index to word
    '''
    
    #cache list of prediction
    prediction = list()
    #initiate title to be a vector of zeros
    init = np.zeros(maxLen)
    
    #for maximum prediction length
    for i in range(num_iteration):
        #get prediction probabilities for all unique words
        predRNN = model.predict([np.reshape(seq, (1, 250)), init.reshape(1, 250)])
        #greedy mode prediction
        #update next title vector to be the predicted vector
        init = np.argmax(predRNN, axis = 2)
        #get probabilities of all unique words
        pVec = predRNN[0, 0, :]
        #get the word with maximum predicted probability as the predicted words
        idx = np.argmax(pVec)
        #index to word
        word = idx2word[str(idx)]
        #if eos tag is predicted
        #break out of loop
        if idx == 1:
            break
        prediction.append(word)
    return prediction

In [None]:
#check prediction 
check = testX.copy()
check = sequence.pad_sequences(check, 250, value = word2idx['_'], 
                               padding = 'pre')

#example of seq2seq prediction
#true title
[idx2word[str(m)] for m in testY[40]]

#true abstract
[idx2word[str(m)] for m in check[40]]

#prediction
check_pred = getPredictions(rnn, check[40], 250, 20, idx2word)
check_pred

---

In [None]:
# # generate target given source sequence
# def predict_sequence(infenc, infdec, source, n_steps, cardinality):
#     # encode
#     state = infenc.predict(np.reshape(source,(1,250)))
#     #start of sequence input
#     target_seq = np.array([0.0 for _ in range(cardinality)]).reshape(1, 1, cardinality)
# #     target_seq = np.zeros((source,1, 250))
#     # collect predictions
#     output = list()
#     for t in range(n_steps):
#         # predict next char
#         yhat, h, c = infdec.predict([target_seq] + state)
#         # store prediction
#         output.append(yhat[0,0,:])
#         # update state
#         state = [h, c]
#         # update target sequence
#         target_seq = yhat
#     return np.array(output)

In [None]:
# #single layger LSTM 
# def encoder_decoder(genTrain, genVal, mode = 'fit', num_epochs = 1, 
#                     en_shape = aMaxLen, de_shape = tMaxLen):
    
# #     print('Encoder_Decoder LSTM...')
   
# #     """__encoder___"""
# #     encoder_inputs = Input(shape=(en_shape,), name='inputE')
# #     print(encoder_inputs)
    
# #     #APPLY EMBEDDING LAYER. https://keras.io/layers/embeddings/       
# #     input_emb = Embedding(nUnique, embeddDim,
# #                           input_length = aMaxLen,
# #                           W_regularizer = regularizer, dropout = p_emb, 
# #                           weights=[embeddMatrix], mask_zero = True,
# #                           name='embedding_1')
    
# #     #ENCODER LSTM - FORWARD   https://keras.io/layers/recurrent/  
# #     encoder_LSTM = LSTM(hidden_units, dropout_U = 0.2, dropout_W = 0.2 ,return_state=True)
# #     encoder_LSTM_rev = LSTM(hidden_units,return_state=True,go_backwards=True)
    
# #     #ENCODER LSTM - REVERSE 
# #     encoder_outputsR, state_hR, state_cR = encoder_LSTM_rev(input_emb(encoder_inputs))
# #     encoder_outputs, state_h, state_c = encoder_LSTM(input_emb(encoder_inputs))
        
# #     state_hfinal=Add()([state_h,state_hR])
# #     state_cfinal=Add()([state_c,state_cR])
    
# #     encoder_states = [state_hfinal,state_cfinal]
    
#     """____decoder___"""
#     #Input to the decoder would be the summary(headline) sequence starting from ~ character.
#     decoder_inputs = Input(shape=(de_shape,), name = 'inputD')
# #     decoder_inputs = Input(shape=(en_shape,))
#     print(decoder_inputs)
      
#     decoder_LSTM = LSTM(hidden_units,return_sequences=True,return_state=True)
#     decoder_outputs, _, _ = decoder_LSTM(input_emb(decoder_inputs),initial_state=encoder_states) 
# #     decoder_dense = Dense(de_shape,activation='linear')
    
#     # Apply a dense layer that has vocab_size(40000) outputs which learns probability of each word when softmax is applied.
#     # TimeDistributed is a wrapper for applying the same function over all the time step outputs. 
#     # Refer https://keras.io/layers/wrappers/
#     decoder_time_distributed = TimeDistributed(Dense(nUnique,
#                                                      W_regularizer=regularizer, 
#                                                      b_regularizer=regularizer,
#                                                      name = 'decoder_timedistributed'))
#     activation = Activation('softmax', name = 'activation_1')
#     decoder_outputs = activation(time_distributed(decoder_outputs))
    
#     #Model groups layers into an object with training and inference features.
#     #https://www.tensorflow.org/api_docs/python/tf/keras/models/Model        
#     model= Model(inputs=[encoder_inputs,decoder_inputs], outputs=decoder_outputs)
    
#     rmsprop = RMSprop(lr = learning_rate,clipnorm = clip_norm)
    
#     model.compile(loss='categorical_crossentropy',optimizer=rmsprop)
    
    
#     for epoch in range(num_epochs):
#         model.fit_generator(genTrain,
#                             steps_per_epoch = num_train_batches,
#                             epochs=5,  #Try different epochs as hyperparameter 
#                             validation_data = genVal,
#                             validation_steps = num_val_batches)
    
#     #_________________________INFERENCE MODE______________________________#  
    
#     encoder_model_inf = Model(encoder_inputs,encoder_states)
    
#     decoder_state_input_H = Input(shape=(hidden_units,))
#     decoder_state_input_C = Input(shape=(hidden_units,)) 
#     decoder_state_inputs = [decoder_state_input_H, decoder_state_input_C]
#     decoder_outputs, decoder_state_h, decoder_state_c = decoder_LSTM(input_emb(decoder_inputs),
#                                                                      initial_state=decoder_state_inputs)
#     decoder_states = [decoder_state_h, decoder_state_c]
#     decoder_outputs = decoder_dense(decoder_outputs)
    
#     decoder_model_inf= Model([decoder_inputs]+decoder_state_inputs,
#                              [decoder_outputs]+decoder_states)
    
#     return model,encoder_model_inf,decoder_model_inf

In [None]:
#let's try this
# model = encoder_decoder(genTrain, genVal)

In [None]:
# model.summary()