# RNN Model Training with Self-Trained GloVe Embedding

**Karina Huang**

**May 11, 2019**

This notebook records the training history of all models explored for the Title Generation project. 

---

In [1]:
#load packages
import json
import pickle
import string
import pandas as pd
import numpy as np
from numpy import argmax
from keras.preprocessing.sequence import pad_sequences
import re
import sys
import random
from collections import Counter, defaultdict
from tensorflow.contrib import keras
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import backend as K
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

#models 
from rnn_model import getBaseModel, getBidirectionalModel, getAttentionModel, getAttentionLSTMModel

Using TensorFlow backend.


In [2]:
#load data
histPath = '../rnn_preprocessing_history/'
#load training data
with open(histPath+'train.txt', "rb") as f1, open(histPath+'val.txt', "rb") as f2, open(histPath+'test.txt', "rb") as f3: 
    trainX, trainY = pickle.load(f1)
    valX, valY = pickle.load(f2)
    testX, testY = pickle.load(f3)
#load dictionaries
with open(histPath+'word2idx_master.json', 'r') as f1, open(histPath+'idx2word_master.json', 'r') as f2:
    word2idx = json.load(f1)
    idx2word = json.load(f2)

#load embedding matrix
embeddMatrix = np.load(histPath+'embeddMatrix.npy')

In [3]:
#params for model training
seed = 209
p_W, p_U, p_dense, p_emb, weight_decay = 0, 0, 0, 0, 0
LR = 1e-4
batch_size = 32

num_train_batches = len(trainX) // batch_size
num_val_samples = len(valX) + len(trainX) - batch_size*num_train_batches
num_val_batches = len(valX) // batch_size
total_entries = (num_train_batches + num_val_batches)*batch_size

#maximum length for title 
tMaxLen = 250
#maximum length for abstract
aMaxLen = 250
#total maximum length
maxlen = tMaxLen + aMaxLen

batch_norm=False

embeddDim = embeddMatrix.shape[1]
nUnique = embeddMatrix.shape[0]
hidden_units= embeddDim

learning_rate = 0.002
clip_norm = 1.0

---

## I. Data Generator

In [4]:
#padding function for abstracts
def padAbstract(x, maxL = aMaxLen, dictionary = word2idx):
    n = len(x)
    if n > maxL:
        x = x[-maxL:]
        n = maxL
    return [dictionary['_']]*(maxL - n) + x + [dictionary['*']]

#build generator for model
def generator(trainX, trainY, batch_size = batch_size, 
              nb_batches = None, model = None, seed = seed):
    
    c = nb_batches if nb_batches else 0
    while True:
        titles = list()
        abstracts = list()
        if nb_batches and c >= nb_batches:
            c = 0
        new_seed = random.randint(0, sys.maxsize)
        random.seed(c+123456789+seed)
        
        for b in range(batch_size):
            a = random.randint(0,len(trainX)-1)
            
            #random shuffling of data
            abstract = trainX[a]
            s = random.randint(min(aMaxLen,len(abstract)), max(aMaxLen,len(abstract)))
            abstracts.append(abstract[:s])
            
            title = trainY[a]
            s = random.randint(min(tMaxLen,len(title)), max(tMaxLen,len(title)))
            titles.append(title[:s])

        # undo the seeding before we yield in order not to affect the caller
        c+= 1
        random.seed(new_seed)

        yield conv_seq_labels(abstracts, titles)

#pad sequence and convert title to labels
def conv_seq_labels(abstracts, titles, nflips = None, model = None, dictionary = word2idx):
    """abstract and titles are converted to padded input vectors. Titles are one-hot encoded to labels."""
    batch_size = len(titles)
    
    
    x = [padAbstract(a)+t for a,t in zip(abstracts, titles)] 
    x = sequence.pad_sequences(x, maxlen = maxlen, value = dictionary['_'], 
                               padding = 'post', truncating = 'post')
        
    y = np.zeros((batch_size, tMaxLen, nUnique))
    for i, it in enumerate(titles):
        it = it + [dictionary['*']] + [dictionary['_']]*tMaxLen  # output does have a eos at end
        it = it[:tMaxLen]
        y[i,:,:] = np_utils.to_categorical(it, nUnique)
        
    #The 3 inputs are abstract, title starting with eos and a one-hot encoding of the title categorical variables.
    return [x[:,:aMaxLen],x[:,aMaxLen:]], y

In [5]:
#check generator
check = next(generator(trainX, trainY, batch_size = batch_size))
print(check[0][0].shape,check[0][1].shape,check[1].shape)
print("Abstract  : ", [idx2word[str(i)] for i in check[0][0][1]])
print("Title  : ", [idx2word[str(i)] for i in check[0][1][1]])

(32, 250) (32, 250) (32, 250, 32471)
Abstract  :  ['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'with', 'the', 'increase', 'in', 'available', 'data', 'parallel', 'machine', 'learning', 'has', '<ign>', '<ign>', 'become', 'an', 'increasingly', 'pressing', 'problem.', 'in', 'this', 'paper', 'we', 'present', '<ign>', '<ign>', 'the', 'first', 'parallel', 'stochastic', 'gradient', 'descent', 'algorithm', 'including', 'a', '<ign>', '<ign>', 'detailed', 'anal

In [6]:
#generator for training and validation
genTrain = generator(trainX, trainY, batch_size = batch_size)
genVal =  generator(valX, valY, nb_batches = len(valX)// batch_size, batch_size = batch_size)

---

## II. Base Model

Encoder-Decoder model, with a single bidirectional LSTM layer in encoder and a unidirectional LSTM layer in decoder.

In [None]:
# K.clear_session()

# #base model
# rnn_base = getBaseModel(genTrain, genVal, embeddMatrix, 
#                         learning_rate, clip_norm, nUnique,
#                         embeddDim, hidden_units)
# #base model summary
# rnn_base.summary()

In [None]:
# #train base model
# filepath = 'rnn_training_history/'

# checkpoint = ModelCheckpoint(filepath + 'rnn_base.h5', monitor = 'val_loss', 
#                              verbose = 1, save_best_only = True, mode = 'min')
# callbacks_list = [checkpoint]

# #fit base model
# rnn_base.fit_generator(genTrain,
#                        steps_per_epoch = num_train_batches,
#                        epochs = 10, 
#                        validation_data = genVal,
#                        validation_steps = num_val_batches,
#                        callbacks = callbacks_list)

In [None]:
# #save model weights
# rnn_base.save_weights(filepath+'rnn_base_weights.h5')

In [7]:
#load base model
rnn_base = load_model('../rnn_training_history/rnn_base.h5')
rnn_base.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
encoder_embedd (Embedding)      (None, 250, 100)     3247100     encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, 250)          0                                       

---
## III. Bidirectional Model

Adjusted unidirectional layer in decoder to bidirectional.

In [None]:
# #train bidirectional model
# rnn_bidirectional = getBidirectionalModel(genTrain, genVal, embeddMatrix, 
#                                           learning_rate, clip_norm, nUnique,
#                                           embeddDim, hidden_units)

In [None]:
# # define the checkpoint
# filepath = "rnn_training_history/rnn_bidirectional_checkpoint.h5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# callbacks_list = [checkpoint]

# #fit model
# rnn_bidirectional.fit_generator(genTrain,
#                                 steps_per_epoch = num_train_batches,
#                                 epochs = 50, 
#                                 validation_data = genVal,
#                                 validation_steps = num_val_batches,
#                                 callbacks = callbacks_list)

In [None]:
# #save model and model weights
# rnn_bidirectional.save('rnn_training_history/rnn_bidirectional.h5')
# rnn_bidirectional.save_weights('rnn_training_history/rnn_bidirectional_weights.h5')

In [8]:
#load bidirectional model
rnn_bidirectional = load_model('../rnn_training_history/rnn_bidirectional.h5')
rnn_bidirectional.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
encoder_embedd (Embedding)      (None, 250, 100)     3247100     encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_embedd (Embedding)      (None, 250, 100)     3247100     decoder_input[0][0]              
__________________________________________________________________________________________________
bidirectio

---
## IV. Attention Model

Incorporated an attention/context mechanism in the base model. Attention mechanism combines forward-LSTM encoder outputs with the decoder outputs.

In [None]:
# #train attention model - LSTM
# rnn_attention_lstm = getAttentionLSTMModel(genTrain, genVal, embeddMatrix, 
#                                            learning_rate, clip_norm, nUnique,
#                                            embeddDim, hidden_units)

In [None]:
# # define the checkpoint
# filepath = "rnn_training_history/rnn_attention_lstm_checkpoint.h5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# callbacks_list = [checkpoint]

# #fit model
# rnn_attention_lstm.fit_generator(genTrain,
#                                  steps_per_epoch = num_train_batches,
#                                  epochs = 20, 
#                                  validation_data = genVal,
#                                  validation_steps = num_val_batches,
#                                  callbacks = callbacks_list)

In [None]:
# #save model and model weights
# rnn_attention_lstm.save('rnn_training_history/rnn_attention_lstm.h5')
# rnn_attention_lstm.save_weights('rnn_training_history/rnn_attention_lstm_weights.h5')

In [9]:
#load attention-LSTM model
rnn_attention_lstm = load_model('../rnn_training_history/rnn_attention_lstm.h5')
rnn_attention_lstm.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
encoder_embedd (Embedding)      (None, 250, 100)     3247100     encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 250, 100), ( 80400       encoder_embedd[0][0]             
__________________________________________________________________________________________________
lstm_2 (LS

---
## V. Attention Model - Bi-LSTM

Incorporated an attention/context mechanism in the bidirectional model. Attention mechanism combines Bidirectional-LSTM encoder outputs with the decoder outputs.

In [None]:
# K.clear_session()

In [None]:
# #train model with attention
# rnn_attention = getAttentionModel(genTrain, genVal, embeddMatrix, 
#                                   learning_rate, clip_norm, nUnique,
#                                   embeddDim, hidden_units)

# rnn_attention.summary()

In [None]:
# # define the checkpoint
# filepath = "rnn_training_history/rnn_model_attention_2.h5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# callbacks_list = [checkpoint]

# #fit model
# rnn_attention.fit_generator(genTrain,
#                             steps_per_epoch = num_train_batches,
#                             epochs = 50, 
#                             validation_data = genVal,
#                             validation_steps = num_val_batches,
#                             callbacks = callbacks_list)

In [None]:
# #save model and model weights
# rnn_attention.save('rnn_training_history/rnn_attention_2.h5')
# rnn_attention.save_weights('rnn_training_history/rnn_attention_2_weights.h5')

In [10]:
#load model
rnn_attention_bilstm = load_model('../rnn_training_history/rnn_attention_2.h5')
rnn_attention_bilstm.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
encoder_embedd (Embedding)      (None, 250, 100)     3247100     encoder_input[0][0]              
__________________________________________________________________________________________________
decoder_input (InputLayer)      (None, 250)          0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) [(None, 250, 200), ( 160800      encoder_embedd[0][0]             
__________________________________________________________________________________________________
decoder_em

---
## VI. Predictions

In [14]:
def getPrediction(model, seq, idx2word, tMaxLen, 
                  num_iteration, greedy = True, latitude = 5):
    '''
    Prediction for a given sequence. 
    
    Parameters:
    ===========
    1)model: rnn model
    2)seq: a single abstract, should be a vector of length 250
    3)tMaxLen: maximum length of title, should match with training title input
    4)num_iteration: maximum length allowed for title prediction
    5)idx2word: dictionary for index to word
    6)greedy: default to greedy search predictions, otherwise beam search
    7)latitude: for greedy search, how many top words to consider for random choice
    '''
    
    #cache list of prediction
    prediction = list()
    #initiate title to be a vector of zeros
    init = np.zeros(tMaxLen)
             
    #for maximum prediction length
    for i in range(num_iteration):
        #get prediction probabilities for all unique words
        predRNN = model.predict([np.reshape(seq, (1, 250)), init.reshape(1, 250)])
        
        if greedy:

            #update next title vector to be the predicted vector
            idx = np.argmax(predRNN[0, i])
            
        else:
            
            #get top number of words
            idxV = np.argsort(predRNN[0, i])[-latitude: ]
            #randomly choose from the top words
            idx = np.random.choice(idxV)
            if i == 0:
                while idx == 1:
                    idx = np.random.choice(idxV)
            else:
                while idx == word2idx[prediction[i-1]]:
                    idx = np.random.choice(idxV)
        
        #index to word
        word = idx2word[str(idx)]
        init[i] = idx
        #if eos tag is predicted
        #break out of loop
        if idx == 1:
            break
        prediction.append(word)
            
    return prediction

In [41]:
#pad test sequence
testX_padded = sequence.pad_sequences(testX, 250, value = word2idx['_'], padding = 'pre')

def getPredictionHistory(model, maxLen, num_iter, Y, 
                         X = testX_padded, idx2word = idx2word):
    
    hist = defaultdict(list)
    
    for i in range(len(X)):
        seq = X[i]
        truth = [idx2word[str(w)] for w in Y[i]]
        greedy_pred = getPrediction(model, seq, idx2word, maxLen, num_iter, greedy = True)
        non_greedy_pred = getPrediction(model, seq, idx2word, maxLen, num_iter, greedy = False)
        space = ' '
        hist['Truth'].append(space.join(truth))
        hist['Greedy'].append(space.join(greedy_pred))
        hist['Non-Greedy'].append(space.join(non_greedy_pred))
        
        #checkpoint 
        if i % 100 == 0:
            print(i//100)
    return hist

In [42]:
#get predictions for base model
basePred = getPredictionHistory(rnn_base, 250, 20, testY)

0
1
2
3
4
5
6
7
8
9


In [44]:
with open("rnn_predictions/predictions_base",'wb') as f:
    pickle.dump(basePred, f)

In [45]:
# with open("rnn_predictions/predictions_base",'rb') as f:
#     basePred = pickle.load(f)

In [47]:
#get predictions for attention LSTM model
attentionPred = getPredictionHistory(rnn_attention_lstm, 250, 20, testY)

0
1
2
3
4
5
6
7
8
9


In [48]:
# with open("rnn_predictions/predictions_attention",'wb') as f:
#     pickle.dump(attentionPred, f)

---

In [56]:
for i in range(len(basePred['Truth'])):
    print('Truth: ', basePred['Truth'][i])
    print('Base - Greedy: ', basePred['Greedy'][i])
    print('Base - Non-Greedy: ', basePred['Non-Greedy'][i])
    print('Attention - Greedy: ', attentionPred['Greedy'][i])
    print('Attention - Non-Greedy: ', attentionPred['Non-Greedy'][i])
    print('-'*110)
    print('-'*110)

Truth:  a concave regularization technique for sparse mixture models
Base - Greedy:  learning with the variable model of latent variable models
Base - Non-Greedy:  a bayesian inference and model in continuous dirichlet process models for classification with applications to data problems using
Attention - Greedy:  learning of latent dirichlet allocation
Attention - Non-Greedy:  gaussian latent models of gaussian models with partial kernels for graphical
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
Truth:  learning transferrable representations for unsupervised domain adaptation
Base - Greedy:  learning to agglomerate images
Base - Non-Greedy:  neural boltzmann learning via
Attention - Greedy:  a learning of generative models for image
Attention - Non-Greedy:  adversarial convolutional learning of representations 

Attention - Non-Greedy:  linear random models with random fields with a empirical risk minimization and its to
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
Truth:  latent attention for if-then program synthesis
Base - Greedy:  a neural network model for real-time object recognition
Base - Non-Greedy:  learning for image representations with images and its deep
Attention - Greedy:  learning the future of human representations of human data
Attention - Non-Greedy:  for image connectivity using video using a convolutional networks for visual recognition and their
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
Truth:  connecting to the past
Bas

Base - Non-Greedy:  a new model for high-dimensional scale image classification and applications from
Attention - Greedy:  learning from generative models
Attention - Non-Greedy:  learning of generative graphical gaussian graphical processes for
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
Truth:  learning bayesian networks with thousands of variables
Base - Greedy:  learning on the complexity of the complexity of the complexity of the complexity of the complexity of barn
Base - Non-Greedy:  learning with multiple the power and its the empirical risk
Attention - Greedy:  a learning for learning to predict tasks
Attention - Non-Greedy:  to reinforcement bayesian networks for sparse inverse regression
--------------------------------------------------------------------------------------------------------------
---

Attention - Greedy:  on top-k in the presence of convergence of convergence of the presence of convergence
Attention - Non-Greedy:  on robustness and estimation from optimal data
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
Truth:  a comprehensive linear speedup analysis for asynchronous stochastic parallel optimization from zeroth-order to first-order
Base - Greedy:  learning on the stochastic gradient descent
Base - Non-Greedy:  a unified algorithm for the convergence bounds
Attention - Greedy:  on the consistency of convergence of the convergence of convergence of convergence of convergence of convergence of convergence of convergence
Attention - Non-Greedy:  on a new method of convergence in stochastic gradient method to gradient method with applications and its a sparse inference
---------------------------

Base - Greedy:  learning to route a unified approach to a neural network for training to pattern recognition
Base - Non-Greedy:  to a connectionist model of the brain-like training to its learning
Attention - Greedy:  the analysis of the power of the presence of the presence of the presence of
Attention - Non-Greedy:  the power in learning of continuous similarity functions and their applications of the barn prior and other a novel algorithm
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
Truth:  scalable kernels for graphs with continuous attributes
Base - Greedy:  learning with multiple tasks
Base - Non-Greedy:  a new algorithm with continuous random process processes via
Attention - Greedy:  learning the consistency of the fourier norm
Attention - Non-Greedy:  sparse learning via linear programming
-------------

--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
Truth:  towards a learning-theoretic analysis of spike-timing dependent plasticity
Base - Greedy:  learning to 3d object recognition with a novel model
Base - Non-Greedy:  a model of a novel network approach to pattern and recognition of neurons with a single model and its the
Attention - Greedy:  learning learning with the connectivity of the brain of the brain of the brain
Attention - Non-Greedy:  the of information in neural population memory from data with partial observations and
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
Truth:  learning to rank by optimizing ndcg measure
Base - Greedy:

Base - Greedy:  learning a neural network for neural network for visual recognition
Base - Non-Greedy:  to the brain and its the training of a computational neural net
Attention - Greedy:  a neural network for visual recognition recognition
Attention - Non-Greedy:  deep learning for visual connectivity in spiking visual data in the brain
--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
Truth:  using options and covariance testing for long horizon off-policy policy evaluation
Base - Greedy:  learning with the general unified of laplacians
Base - Non-Greedy:  on a general efficient algorithm method
Attention - Greedy:  on the complexity of the robustness of the complexity of the complexity of the complexity of the presence of the
Attention - Non-Greedy:  the consistency analysis in a model to kernel matrix completion 

In [83]:
basePred['Truth'].index('toward robustness against label noise in training deep discriminative neural networks')

846

In [84]:
space.join([idx2word[str(w)] for w in testX[846]])

'collecting large training datasets, annotated with high-quality labels, is costly and time-consuming. this paper proposes a novel framework for training deep convolutional neural networks from noisy labeled datasets that can be obtained cheaply. the problem is formulated using an undirected graphical model that represents the relationship between noisy and clean labels, trained in a semi-supervised setting. in our formulation, the inference over latent clean labels is tractable and is regularized during training using auxiliary sources of information. the proposed model is applied to the image labeling problem and is shown to be effective in labeling unseen images as well as reducing label noise in training on cifar-10 and ms coco datasets.'

---
## VII. Random Predictions

Predictions for out-of-dataset text files. 

In [85]:
def predictRandomText(txt, model, greedy = True):
    
    #format input text string 
    txtV = txt.lower().split(' ')
    #cache list for tokenization
    tokenized = list()
    #tokenize text
    for w in txtV:
        try:
            tokenized.append(word2idx[w])
        except:
            tokenized.append(word2idx['<ign>'])
    #pad sequence for prediction
    tokenized_padded = sequence.pad_sequences([tokenized], 250, value = word2idx['_'], padding = 'pre')
    #return prediction given mode
    if greedy:
        pred = getPrediction(model, tokenized_padded[0], idx2word, 250, 20, greedy = True)
    else:
        pred = getPrediction(model, tokenized_padded[0], idx2word, 250, 20, greedy = False)
    
    return pred

In [93]:
predictRandomText('thank you', rnn_base, greedy = True)

['neural', 'networks']

In [107]:
predictRandomText('karina', rnn_base, greedy = True)

['learning', 'to', 'predict', 'chaos']

In [108]:
predictRandomText('phoebe', rnn_base, greedy = True)

['learning', 'to', 'predict', 'chaos']

In [109]:
predictRandomText('abhi', rnn_base, greedy = True)

['learning', 'to', 'predict', 'chaos']

In [116]:
ds = '''Data Science 2 is the second half of a one-year introduction to data science. 
        Building upon the material in Data Science 1, the course introduces advanced methods 
        for data wrangling, data visualization, and deep neural networks, statistical modeling, 
        and prediction. Topics include big data and database management,  multiple deep learning 
        subjects such as CNNs, RNNs, autoencoders, and generative models as well as basic Bayesian 
        methods, nonlinear statistical models and unsupervised learning.'''

In [127]:
predictRandomText(ds, rnn_attention_lstm, greedy = False)

['learning', 'with', 'recurrent', 'convolutional', 'network']