# Pre-requisites

In [0]:
# Connecting to Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# importing all libraries

import string

from keras.preprocessing.text import Tokenizer

from numpy import array
import numpy as np
from keras.utils import to_categorical

from pickle import dump

from keras.preprocessing.sequence import pad_sequences

from pickle import load
from keras.models import load_model
from random import randint

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

from keras.utils import plot_model

from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation

import matplotlib.pyplot as plt

from math import floor

from keras.callbacks import ModelCheckpoint

import keras



Using TensorFlow backend.


# Functions

In [0]:
# Purpose: Load doc into memory
# Input: file name
# Output: text

def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text


In [0]:
# Purpose: turn a doc into clean tokens
# Input: document
# Output: a list of tokens that are all in lowercase, have no special characters, split at white spaces

def clean_doc(doc):
	# replace '--' with a space ' '
	doc = doc.replace('--', ' ')
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token ; string-to-replace, string-to-be-replaced-with, string-to-delete
	table = str.maketrans('', '', string.punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# make lower case
	tokens = [word.lower() for word in tokens]
	return tokens



In [0]:
# Purpose: organize into sequences of tokens
# Input: document tokens, input sequence length
# Output: document broken down into sequences seperated by WS

def create_sequences(tokens, seq_length):
  print('Input sequence length is %d'% seq_length)
  # length = in_length + out_length
  total_length = seq_length+1
  #declare a variable to hold the sequences
  doc_sequences = list()
  for i in range(total_length, len(tokens)):
    # list of tokens; list size = total sequence length
    curr_line_seq = tokens[i-total_length:i]
    # create one string sequence seperated by WS
    curr_line = ' '.join(curr_line_seq)
    # append it to list of such sequences
    doc_sequences.append(curr_line)
  return doc_sequences



In [0]:
# Purpose: save a document
# Input: document , filename
# Output: none

def save_doc(doc, doc_filename):
	data = '\n'.join(doc)
	file = open(doc_filename, 'w')
	file.write(data)
	file.close()
 


In [0]:
# Purpose: load a file containing document as sequence
# Input: filename
# Output: document sequences

def load_sequenced_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	saved_doc_sequences = file.read()
	# close the file
	file.close()
	return saved_doc_sequences



In [0]:
# Purpose: convert sequences to integer vectors
# Input: list of sequences (text)
# Output: list of sequences (int) (list of lists) , vocabulary size
# Uses: save_tokenizer_artifacts

def text_to_int_tokenize(text_seq, tokenizer_name):
  # tokenize
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(text_seq)
  int_seq = tokenizer.texts_to_sequences(text_seq)

  # find the vocab size
  vocab_size = len(tokenizer.word_index) + 1
  save_tokenizer_artifacts(tokenizer, tokenizer_name)
  return int_seq, vocab_size



In [0]:
# Purpose: Create input and labels
# Input: tokenized document sequences (), size of vocabulary
# Output: inputs (), Labels ()

def create_inputs_and_labels(tokenized_doc_seq, vocab_size):
  # convert list to array
  array_tokenized_doc_seq = array(tokenized_doc_seq)
  inputs = array_tokenized_doc_seq[:,:-1]
  labels = array_tokenized_doc_seq[:,-1]
  # one hot encode labels; # columns = vocab size
  labels = to_categorical(labels, num_classes=vocab_size)
  return inputs,labels



In [0]:
# Purpose: Save model and tokenizer to file
# Input: 
# Output: 

def save_tokenizer_artifacts(tokenizer, tokenizer_name):
  
  # save the tokenizer
  dump(tokenizer, open(tokenizer_name, 'wb'))
  


In [0]:
# Purpose: Save model and tokenizer to file
# Input: 
# Output: 

def save_model_artifacts(model_name, model):
  # save the model to file
  model.save(model_name)
  
  

In [0]:
# Purpose: takes in the model, tokenizer, seed text, length of sequence and # words to be predicted
# Input: 
# Output: concatenated predicted words

def generate_seq(model, tokenizer, input_seq_length, in_text, n_words):
	result = list()
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=input_seq_length, truncating='pre')
		# predict probabilities for each word
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
		result.append(out_word)
	return ' '.join(result)



In [0]:
# Purpose: 
# Input: name of the file containing the test sequences, name of the model and the tokenizer
# Output: seed text and generated text
# Uses: generate_seq()

def predict_from_seed_data(in_filename, model_name, tokenizer_name):
  
  # load cleaned text sequences: list oflists
  text_sequence_doc = load_doc(in_filename)
  text_sequence_lines = text_sequence_doc.split('\n')
  # text_sequence_lines = ['it was my fault to think that the little boy was poor 
  #                       and unhappy I could not see the sad look the wicked woman 
  #                       used to give me as she handed me my bread unfortunately I was 
  #                       always hungry he emptied his glass while reading the newspaper 
  #                       he laid his napkin'] used this for testing
  
  # load the model
  my_model = load_model(model_name)
  
  # load the tokenizer
  my_tokenizer = load(open(tokenizer_name, 'rb'))
  
  seed_text = text_sequence_lines[randint(0,len(text_sequence_lines))]
  print(seed_text + '\n')
  
  input_seq_length = len(text_sequence_lines[0].split()) - 1
  generated_text = generate_seq(my_model, my_tokenizer, input_seq_length, seed_text, 50)
  
  return seed_text, generated_text



In [0]:
# utility function


def print_model(model, model_name):

  plot_model(model, 
             show_shapes=True, 
             show_layer_names=True, 
             to_file = model_name)
  
  from google.colab import files
  files.download(model_name)
  
  

  
  

In [0]:
def plot_acc_loss(model):
  plt.plot(model.history['acc'])
  plt.plot(model.history['val_acc'])
  plt.title('model accuracy')
  plt.ylabel('accuracy')
  plt.xlabel('epoch')
  plt.legend(['train', 'test'], loc='upper left')
  plt.show()

  plt.plot(model.history['loss'])
  plt.plot(model.history['val_loss'])
  plt.title('model loss')
  plt.ylabel('loss')
  plt.xlabel('epoch')
  plt.legend(['train', 'test'], loc='upper left')
  plt.show()



# Models

**Model 1: Embedded - LSTM - Dense**

Lets implement early stopping here since this model is quite slow even with smaller texts to begin with.

Note: Might want to restore the defaults so that it can be used for bench marking

In [0]:
# Dependencies on inputs: Vocabulary size(from tokenizer), input sequence length
# tweakable factors: Output dim of embedding layer that determines the compactness of the embedding vector

def Embed_LSTM_Dense(vocab_size, input_seq_length):
  model = Sequential()
  model.add(Embedding(input_dim = vocab_size, 
                      output_dim = 50, 
                      input_length=input_seq_length))
  model.add(LSTM(20, return_sequences=True))
  model.add(LSTM(10))
  model.add(Dense(100, activation='relu'))
  model.add(Dense(vocab_size, activation='softmax'))
  return model



In [0]:
# run this after all the data is prepped

# Create
Embed_LSTM_Dense_model = Embed_LSTM_Dense(sizeof_vocab, X_data.shape[1])

# print summary
print(Embed_LSTM_Dense_model.summary())

# compile model
Embed_LSTM_Dense_model.compile(loss='categorical_crossentropy', 
                               optimizer='adam', 
                               metrics=['accuracy'])

# fit model
Embed_LSTM_Dense_model_history = Embed_LSTM_Dense_model.fit(X_data, 
                                                            Y_data, 
                                                            batch_size=128, 
                                                            epochs=1) 

# print model
print_model(Embed_LSTM_Dense_model, 'Embed_LSTM_Dense_model.png')

# save model
model_name = 'model_1.h5'

save_model_artifacts(model_name, Embed_LSTM_Dense_model)



**Model 2: Embedded - Conv - LSTM - Dense**

In [0]:
# Dependencies on inputs: Vocabulary size(from tokenizer), input sequence length
# tweakable factors: Output dim of embedding layer that determines the compactness of the embedding vector, Drop out rate

def Embed_Conv_LSTM_Dense(vocab_size, input_seq_length):
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, 
                             output_dim = 100, 
                             input_length=input_seq_length))
    model.add(Dropout(0.2))
    model.add(Conv1D(64, 5, activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    model.add(LSTM(100))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))
    return model
  
  

In [0]:
# run this after all the data is prepped

# Create
Embed_Conv_LSTM_Dense_model = Embed_Conv_LSTM_Dense(sizeof_vocab, X_data.shape[1])

# print summary
print(Embed_Conv_LSTM_Dense_model.summary())

# compile model
Embed_Conv_LSTM_Dense_model.compile(loss='categorical_crossentropy', 
                                    optimizer='adam', 
                                    metrics=['accuracy'])

# fit model
Embed_Conv_LSTM_Dense_model.fit(X_data, 
                                Y_data, 
                                batch_size=128, 
                                epochs=1) 

# print model
print_model(Embed_Conv_LSTM_Dense_model, 'Embed_Conv_LSTM_Dense_model.png')

# save model
model_name = 'model_2.h5'

save_model_artifacts(model_name, Embed_Conv_LSTM_Dense_model)



**Model 3: Embed - LSTM - Dense with only 4 layers**

Simple 4 layer model with input sequence = 0.5 of output dimentionality of embedding layer and output dimentionality of embedding layer = # units of LSTM layer that follows.

Lets implement callbacks here since we know this model works well with the previous data set. So with a bigger text, we would like to save our model along the way so that we have something to work with of the session crashes.

In [0]:
# Dependencies on inputs: Vocabulary size(from tokenizer), input sequence length
# tweakable factors: None as of now since this model converges well and gives impressive results
# number at the end indicates number of layers
# Observations: # dims for embed layer = # LSTM units = # Dense units - kept on purpose
# might consider experimenting later to see if performance is affected

def Embed_LSTM_Dense_4(vocab_size, input_seq_length):
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, 
                    output_dim = 100, 
                    input_length = input_seq_length))
    # input: (samples = 2338, features = 50)
    # output: (timesteps = 2338, samples = 50, features = 100)
    # added dropout when it performed bad with validation data
    model.add(LSTM(units = 100, 
                   dropout = 0.5,
                   recurrent_dropout = 0.2,
                   activation = 'tanh', 
                   use_bias = 'False'))
    # output: (None, 100) 
    model.add(Dense(100, activation='relu'))
    # output: (None, 100) 
    # following layer should always have # units = dict size so that it matches the output
    model.add(Dense(vocab_size, activation='softmax'))
    # output: (None, 856) 
    # Shape of labels is: (2338, 856)
    return model
  
 

In [0]:
# run this after all the data is prepped

# Create
Embed_LSTM_Dense_4_model = Embed_LSTM_Dense_4(sizeof_vocab, X_data.shape[1])

# print summary
print(Embed_LSTM_Dense_4_model.summary())

# compile model
Embed_LSTM_Dense_4_model.compile(loss='categorical_crossentropy', 
                                 optimizer='adam', 
                                 metrics=['accuracy'])

# adding a checkpoint| we monitor accuracy so the mode will be maximixed; for loss, its minimized
# we can leave it as auto but I'm explicitely mentioning as max
from keras.callbacks import ModelCheckpoint
model_savepoint = keras.callbacks.ModelCheckpoint(filepath='model_3_best_bread.h5', 
                                                  monitor='val_acc', 
                                                  verbose = 0 ,
                                                  save_best_only=True ,
                                                  save_weights_only=False, 
                                                  mode='max', 
                                                  period=1)


# fit model
Embed_LSTM_Dense_4_model_history = Embed_LSTM_Dense_4_model.fit(x = X_data, 
                                                                y = Y_data, 
                                                                callbacks = [model_savepoint],
                                                                batch_size=2, 
                                                                epochs=40) 

# plot
plot_acc_loss(Embed_LSTM_Dense_4_model_history)

# print model
#print_model(Embed_LSTM_Dense_4_model, 'Embed_LSTM_Dense_4_model.png')

# save model
# model name: model_3 is trainedon A piece of Bread
# commenting out below as implementing callbacks instead
# model_name = 'model_3_republic.h5'

# save_model_artifacts(model_name, Embed_LSTM_Dense_4_model)


# Data Pre-processing

In [0]:
# load document
# Variable doc is just a string: <class 'str'>

in_filename = "/content/gdrive/My Drive/DL/APieceOfBread.txt"
doc = load_doc(in_filename)
print(doc[:200])



A PIECE OF BREAD

BY FRANCOIS COPPEE


The young Duc de Hardimont happened to be at Aix in Savoy, whose waters he
hoped would benefit his famous mare, Perichole, who had become wind-broken
since the c


In [0]:
# clean document
# Variable token is a list of strings(words): <class 'list'>

tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))



['a', 'piece', 'of', 'bread', 'by', 'francois', 'coppee', 'the', 'young', 'duc', 'de', 'hardimont', 'happened', 'to', 'be', 'at', 'aix', 'in', 'savoy', 'whose', 'waters', 'he', 'hoped', 'would', 'benefit', 'his', 'famous', 'mare', 'perichole', 'who', 'had', 'become', 'windbroken', 'since', 'the', 'cold', 'she', 'had', 'caught', 'at', 'the', 'last', 'derby', 'and', 'was', 'finishing', 'his', 'breakfast', 'while', 'glancing', 'over', 'the', 'morning', 'paper', 'when', 'he', 'read', 'the', 'news', 'of', 'the', 'disastrous', 'engagement', 'at', 'reichshoffen', 'he', 'emptied', 'his', 'glass', 'of', 'chartreuse', 'laid', 'his', 'napkin', 'upon', 'the', 'restaurant', 'table', 'ordered', 'his', 'valet', 'to', 'pack', 'his', 'trunks', 'and', 'two', 'hours', 'later', 'took', 'the', 'express', 'to', 'paris', 'arriving', 'there', 'he', 'hastened', 'to', 'the', 'recruiting', 'office', 'and', 'enlisted', 'in', 'a', 'regiment', 'of', 'the', 'line', 'in', 'vain', 'had', 'he', 'led', 'the', 'enervatin

In [0]:
# create sequences from tokens
# doc_to_sequences is a list of sequences of 50 words: <class 'list'>

doc_to_sequences = list()
sequence_length = 50

doc_to_sequences = create_sequences(tokens,sequence_length)
print('Total Sequences: %d' % len(doc_to_sequences))
print(doc_to_sequences[:5])
print('\n'.join(doc_to_sequences[:5]))



Input sequence length is 50
Total Sequences: 2338
['a piece of bread by francois coppee the young duc de hardimont happened to be at aix in savoy whose waters he hoped would benefit his famous mare perichole who had become windbroken since the cold she had caught at the last derby and was finishing his breakfast while glancing over', 'piece of bread by francois coppee the young duc de hardimont happened to be at aix in savoy whose waters he hoped would benefit his famous mare perichole who had become windbroken since the cold she had caught at the last derby and was finishing his breakfast while glancing over the', 'of bread by francois coppee the young duc de hardimont happened to be at aix in savoy whose waters he hoped would benefit his famous mare perichole who had become windbroken since the cold she had caught at the last derby and was finishing his breakfast while glancing over the morning', 'bread by francois coppee the young duc de hardimont happened to be at aix in savoy whos

In [0]:
# save sequences to file to be used later for prediction

out_filename = 'APieceOfBread_sequences.txt'
save_doc(doc_to_sequences, out_filename)



In [0]:
# load saved text sequences. do this step if you already have the sequences created earlier
# doc_lines_seq is a list of sequences of 50 words: <class 'list'>, each elemnt within that list is a string
# print(type(doc_lines_seq)) : <class 'list'>
# print(type(doc_lines_seq[0])) : <class 'str'>

in_filename = 'APieceOfBread_sequences.txt'
doc_sequences = load_sequenced_doc(in_filename)
# load it into a list for processing. Splitting into elemennts by newline
doc_lines_seq = doc_sequences.split('\n')
print('Before tokenizing, sample length of input string sequence %d' % len(doc_lines_seq[0]))
print(doc_lines_seq[:5])



Before tokenizing, sample length of input string sequence 281
['a piece of bread by francois coppee the young duc de hardimont happened to be at aix in savoy whose waters he hoped would benefit his famous mare perichole who had become windbroken since the cold she had caught at the last derby and was finishing his breakfast while glancing over', 'piece of bread by francois coppee the young duc de hardimont happened to be at aix in savoy whose waters he hoped would benefit his famous mare perichole who had become windbroken since the cold she had caught at the last derby and was finishing his breakfast while glancing over the', 'of bread by francois coppee the young duc de hardimont happened to be at aix in savoy whose waters he hoped would benefit his famous mare perichole who had become windbroken since the cold she had caught at the last derby and was finishing his breakfast while glancing over the morning', 'bread by francois coppee the young duc de hardimont happened to be at aix i

In [0]:
# Vectorize the sequences using tokenizer. A word to int mapping dictionary will be created
# doc_lines_int_seq is a list of lists. Each sequence is a list
# print(type(doc_lines_int_seq)) : <class 'list'>
# print(type(doc_lines_int_seq[0])) : <class 'list'>

tokenizer_name = 'tokenizer.pkl'
doc_lines_int_seq, sizeof_vocab = text_to_int_tokenize(doc_lines_seq, tokenizer_name)

print('After tokenizing/vectorizing, sample length of input integers sequence %d' % len(doc_lines_int_seq[0]))
print('Size of the vocabulary build by the tokenizer %d' % sizeof_vocab)

print(doc_lines_int_seq[:5])



After tokenizing/vectorizing, sample length of input integers sequence 51
Size of the vocabulary build by the tokenizer 856
[[4, 115, 3, 29, 21, 853, 852, 1, 38, 114, 28, 33, 848, 7, 51, 17, 847, 9, 846, 155, 845, 11, 844, 32, 842, 6, 841, 840, 839, 58, 14, 266, 836, 835, 1, 834, 79, 14, 833, 17, 1, 109, 831, 2, 8, 829, 6, 828, 826, 825, 59], [115, 3, 29, 21, 853, 852, 1, 38, 114, 28, 33, 848, 7, 51, 17, 847, 9, 846, 155, 845, 11, 844, 32, 842, 6, 841, 840, 839, 58, 14, 266, 836, 835, 1, 834, 79, 14, 833, 17, 1, 109, 831, 2, 8, 829, 6, 828, 826, 825, 59, 1], [3, 29, 21, 853, 852, 1, 38, 114, 28, 33, 848, 7, 51, 17, 847, 9, 846, 155, 845, 11, 844, 32, 842, 6, 841, 840, 839, 58, 14, 266, 836, 835, 1, 834, 79, 14, 833, 17, 1, 109, 831, 2, 8, 829, 6, 828, 826, 825, 59, 1, 156], [29, 21, 853, 852, 1, 38, 114, 28, 33, 848, 7, 51, 17, 847, 9, 846, 155, 845, 11, 844, 32, 842, 6, 841, 840, 839, 58, 14, 266, 836, 835, 1, 834, 79, 14, 833, 17, 1, 109, 831, 2, 8, 829, 6, 828, 826, 825, 59, 1, 156,

In [0]:
# create data
# X_data is 2D tensor (numpy array since data type is same) ; Y_data is 2D tensor
# X_data is <class 'numpy.ndarray'> with dimensions # samples, features = words in sequence
# Y_data is <class 'numpy.ndarray'> with dimension # samples, features = vocab size - sparse array

X_data, Y_data = create_inputs_and_labels(doc_lines_int_seq, sizeof_vocab)

print("Shape of inputs is: {0}".format(X_data.shape))
print("Shape of labels is: {0}".format(Y_data.shape))



Shape of inputs is: (2338, 50)
Shape of labels is: (2338, 856)


In [0]:
print(type(Y_data))
print(type(Y_data[0]))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [0]:
# Better to also have test, validation, and training sets for rigorous testing
from math import floor

ratio_train = 0.65
ratio_valid = 0.35

num_observations = len(X_data)

upper_idx_test = floor(num_observations*ratio_train)
upper_idx_valid = upper_idx_test + floor(num_observations*ratio_valid)

# seperate out the test

X_train = X_data[:upper_idx_test]
Y_train = Y_data[:upper_idx_test]

print("Training Input shape {0} ; Output shape {1}".format(X_train.shape, Y_train.shape))

X_validation = X_data[upper_idx_test:upper_idx_valid]
Y_validation = Y_data[upper_idx_test:upper_idx_valid]

print("Validation Input shape {0} ; Output shape {1}".format(X_validation.shape, Y_validation.shape))

X_test = X_data[upper_idx_valid:]

# since we need actual tokenized ints and not categorical ints
array_doc_lines_int_seq = array(doc_lines_int_seq)
next_in_sequence_actual_int = array_doc_lines_int_seq[:,-1]

Y_test = next_in_sequence_actual_int[upper_idx_valid:]

print("Test Input shape {0} ; Output shape {1}".format(X_test.shape, Y_test.shape))


Training Input shape (1519, 50) ; Output shape (1519, 856)
Validation Input shape (818, 50) ; Output shape (818, 856)
Test Input shape (1, 50) ; Output shape (1,)


In [0]:
# compact version for experimenting with text: republic.txt

in_filename = "/content/gdrive/My Drive/republic.txt"    # change this
doc = load_doc(in_filename)

tokens = clean_doc(doc)
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

doc_to_sequences = list()
sequence_length = 50

doc_to_sequences = create_sequences(tokens,sequence_length)
print('Total Sequences: %d' % len(doc_to_sequences))

out_filename = 'republic_sequences.txt'      # change this
save_doc(doc_to_sequences, out_filename)

in_filename = 'republic_sequences.txt'      # change this
doc_sequences = load_sequenced_doc(in_filename)
# load it into a list for processing. Splitting into elemennts by newline
doc_lines_seq = doc_sequences.split('\n')
print('Before tokenizing, sample length of input string sequence %d' % len(doc_lines_seq[0]))

tokenizer_name = 'tokenizer_republic.pkl'      # change this
doc_lines_int_seq, sizeof_vocab = text_to_int_tokenize(doc_lines_seq, tokenizer_name)

print('After tokenizing/vectorizing, sample length of input integers sequence %d' % len(doc_lines_int_seq[0]))
print('Size of the vocabulary build by the tokenizer %d' % sizeof_vocab)

X_data, Y_data = create_inputs_and_labels(doc_lines_int_seq, sizeof_vocab)

print("Shape of inputs is: {0}".format(X_data.shape))
print("Shape of labels is: {0}".format(Y_data.shape))

"""
# use this for dividing the datset into test, train, validation dataset
ratio_train = 0.65
ratio_valid = 0.35

num_observations = len(X_data)

upper_idx_test = floor(num_observations*ratio_train)
upper_idx_valid = upper_idx_test + floor(num_observations*ratio_valid)

# seperate out the test

X_train = X_data[:upper_idx_test]
Y_train = Y_data[:upper_idx_test]

print("Training Input shape {0} ; Output shape {1}".format(X_train.shape, Y_train.shape))

X_validation = X_data[upper_idx_test:upper_idx_valid]
Y_validation = Y_data[upper_idx_test:upper_idx_valid]

print("Validation Input shape {0} ; Output shape {1}".format(X_validation.shape, Y_validation.shape))

X_test = X_data[upper_idx_valid:]

# since we need actual tokenized ints and not categorical ints
array_doc_lines_int_seq = array(doc_lines_int_seq)
next_in_sequence_actual_int = array_doc_lines_int_seq[:,-1]

Y_test = next_in_sequence_actual_int[upper_idx_valid:]

print("Test Input shape {0} ; Output shape {1}".format(X_test.shape, Y_test.shape))

"""



# Predicting the next word - Text Generation

In [0]:
# Model: Model 1

in_filename = 'APieceOfBread_sequences.txt'     # we use the same file
model_name = 'model_1.h5'
tokenizer_name = 'tokenizer.pkl'

seed_text, generated_text = predict_from_seed_data(in_filename, model_name, tokenizer_name)

print(seed_text)
print(generated_text)



the morning paper when he read the news of the disastrous engagement at reichshoffen he emptied his glass of chartreuse laid his napkin upon the restaurant table ordered his valet to pack his trunks and two hours later took the express to paris arriving there he hastened to the recruiting office

the morning paper when he read the news of the disastrous engagement at reichshoffen he emptied his glass of chartreuse laid his napkin upon the restaurant table ordered his valet to pack his trunks and two hours later took the express to paris arriving there he hastened to the recruiting office
the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the


In [0]:
# Model: Model 2

in_filename = 'APieceOfBread_sequences.txt'     # we use the same file
model_name = 'model_2.h5'
tokenizer_name = 'tokenizer.pkl'
seed_text, generated_text = predict_from_seed_data(in_filename, model_name, tokenizer_name)

print(seed_text)
print(generated_text)



the bread was hard and had a bitter taste no fresh would be given until the next mornings distribution so the commissary officer had willed it this was certainly a very hard life sometimes the remembrance of former breakfasts came to him such as he had called hygienic when the day

the bread was hard and had a bitter taste no fresh would be given until the next mornings distribution so the commissary officer had willed it this was certainly a very hard life sometimes the remembrance of former breakfasts came to him such as he had called hygienic when the day
it it it it it it it it it it it it it it it it it it it the the the the the the the the the the the the the the the the the the the it it it the the it it it it it it it


In [0]:
# Model: Model 3

in_filename = 'APieceOfBread_sequences.txt'     # we use the same file
model_name = 'model_3_best_bread.h5'
tokenizer_name = 'tokenizer.pkl'
seed_text, generated_text = predict_from_seed_data(in_filename, model_name, tokenizer_name)

print("With random seed data....")
print(seed_text)
print(generated_text)


# with test data
print("With test data....")

next_in_sequence_int = Embed_LSTM_Dense_4_model.predict_classes(X_test)

# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

next_in_sequence_predicted = list()

for word_int in next_in_sequence_int:
  for word, index in tokenizer.word_index.items():
    if index == word_int:
      out_word_1 = word
      break
  next_in_sequence_predicted.append(out_word_1)

next_in_sequence_actual = list()

for word_int in Y_test: 
  for word, index in tokenizer.word_index.items():
    if index == word_int:
      out_word_2 = word
      break
  next_in_sequence_actual.append(out_word_2)
  
print(next_in_sequence_predicted)
print(next_in_sequence_actual)

# with new sentence to check if it has learnt semantics of vocabulaty



it was my fault to think that the little boy was poor and unhappy I could not see the sad look the wicked woman used to give me as she handed me my bread unfortunately I was always hungry he emptied his glass while reading the newspaper he laid his napkin

With random seed data....
it was my fault to think that the little boy was poor and unhappy I could not see the sad look the wicked woman used to give me as she handed me my bread unfortunately I was always hungry he emptied his glass while reading the newspaper he laid his napkin
on the pockets of his red trousers and shivering in his sheepskin coat he gave himself up to a sombre thoughts this defeated soldier and looked with a wretched winter sky across in one shudder the roof lighting in the ground floor of the cafeanglais and was a large young
With test data....
['offends']
['offends']


In [0]:
# Model: Model 3
# Text: republic.txt

in_filename = 'republic_sequences.txt'     # we use the same file
model_name = 'model_3_best_republic.h5'
tokenizer_name = 'tokenizer_republic.pkl'
seed_text, generated_text = predict_from_seed_data(in_filename, model_name, tokenizer_name)

print(seed_text)
print(generated_text)



# Understanding Individual Layers

In [0]:
# Experiment 1: working of enbedding layer

model = Sequential()
model.add(Embedding(input_dim = 856, 
                    output_dim = 100, 
                    input_length=50))

model.summary()

model.compile(optimizer = 'rmsprop',
              loss = 'mse')

# no fit since embedding only maps words to reduce dimensionality fo the vocabulary
# creates a dense vector from a sparse vocabulary

embedding_layer_output = model.predict(X_data)

print("Input shape: {0} Input type {1} transformed to ".format(X_data.shape, type(X_data)))
print("Output shape {0}  Output type {1}".format(embedding_layer_output.shape, type(embedding_layer_output)))



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 50, 100)           85600     
Total params: 85,600
Trainable params: 85,600
Non-trainable params: 0
_________________________________________________________________


In [0]:
glove_doc = load_doc('glove.6B.100d.txt')

In [0]:
type(lines)
lines[:400]

In [0]:
for line in glove_doc:
  lines.append(line)

In [0]:
glove_dir = '/Users/mrinalrawool/Downloads/glove.6B'
import os
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))

FileNotFoundError: ignored

In [0]:
# using Glove 6B


# glove_dir = '/Users/mrinalrawool/Downloads/glove.6B'
embeddings_index = {}
# f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
glove_doc = load_doc('glove.6B.100d.txt')
print(f.shape)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
# f.close()
print('Found %s word vectors.' % len(embeddings_index))

embedding_dim = 100
max_words = vocab_size # (tokenizer.word_index + 1)

# for every word in the tokenizer word index, if the word exists in embedding_index; 
# load the co-effs in embedding_dim
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < max_words:
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

In [0]:
# Experiment 2: Undertanding LSTM and using it in a small architecture.

# =========================================   Preparing test sets   =========================================

X_train = X_data[:1554]
Y_train = Y_data[:1554]

print("Training Input shape {0} ; Output shape {1}".format(X_train.shape, Y_train.shape))

X_validation = X_data[1554:2330]
Y_validation = Y_data[1554:2330]

print("Validation Input shape {0} ; Output shape {1}".format(X_validation.shape, Y_validation.shape))

X_test = X_data[2330:]

# since we need actual tokenized ints and not categorical ints
array_doc_lines_int_seq = array(doc_lines_int_seq)
next_in_sequence_actual_int = array_doc_lines_int_seq[:,-1]
Y_test = next_in_sequence_actual_int[2330:]

print("Test Input shape {0} ; Output shape {1}".format(X_test.shape, Y_test.shape))


# =========================================  The model   =========================================
# Working of LSTM
# while calculating the transformations, using None instead of #samples works too

# vocab size = 856

model = Sequential()
model.add(Embedding(input_dim = 856, 
                    output_dim = 100, 
                    input_length = 50))
# input: (samples = 2338, features = 50)
# output: (timesteps = 2338, samples = 50, features = 100)
model.add(LSTM(units = 100, 
               activation = 'tanh', 
               use_bias = 'False'))
# output: (None, 100) 
model.add(Dense(100, activation='relu'))
# output: (None, 100) 
# following layer should always have # units = dict size so that it matches the output
model.add(Dense(856, activation='softmax'))
# output: (None, 856) 
# Shape of labels is: (2338, 856)
model.summary()


model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

#x = X_data, y = Y_data, 
training_history = model.fit(x = X_data, 
                             y = Y_data, 
                             batch_size = 20, 
                             epochs = 34,
                             validation_data=(X_validation, Y_validation))

# =========================================  Testing and Evaluation  =========================================

plot_acc_loss(training_history)

next_in_sequence_int = model.predict_classes(X_test) # note we use predict classes

# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

next_in_sequence_predicted = list()
next_in_sequence_actual = list()

for word_int in next_in_sequence_int:
  for word, index in tokenizer.word_index.items():
    if index == word_int:
      out_word_1 = word
      break
  next_in_sequence_predicted.append(out_word_1)

for word_int in Y_test: 
  for word, index in tokenizer.word_index.items():
    if index == word_int:
      out_word_2 = word
      break
  next_in_sequence_actual.append(out_word_2)
  
print(next_in_sequence_predicted)
print(next_in_sequence_actual)


# Observations

"""

This was a network with one LSTM layers with # units = output dimensionality of the embedding layer. 
The model converges fairly quickly (14 sec to 10 sec).
Furthermore, the model quickly learnt sequences with validation loss at 90%.
The model predicted 7/8 words correctly.

Possible experiments include
1. Hiding a complete sentence from the model before it is tokenized and feeding it as test data.
2. Reducing output dimensionality of embedding layer while keeping # LSTM units constant.
3. Using Glove embeddings.
4. Repeating the experiments with different larger texts.

Epoch 34/34
2338/2338 [==============================] - 10s 4ms/step - loss: 0.6610 - acc: 0.8618 - val_loss: 0.5331 - val_acc: 0.9008


"""

In [0]:
# Experiment 2: Undertanding LSTM and using it in a small architecture.

# =========================================   Preparing test sets   =========================================

X_train = X_data[:1554]
Y_train = Y_data[:1554]

print("Training Input shape {0} ; Output shape {1}".format(X_train.shape, Y_train.shape))

X_validation = X_data[1554:2330]
Y_validation = Y_data[1554:2330]

print("Validation Input shape {0} ; Output shape {1}".format(X_validation.shape, Y_validation.shape))

X_test = X_data[2330:]

# since we need actual tokenized ints and not categorical ints
array_doc_lines_int_seq = array(doc_lines_int_seq)
next_in_sequence_actual_int = array_doc_lines_int_seq[:,-1]
Y_test = next_in_sequence_actual_int[2330:]

print("Test Input shape {0} ; Output shape {1}".format(X_test.shape, Y_test.shape))


# =========================================  The model   =========================================
# Working of LSTM
# while calculating the transformations, using None instead of #samples works too

# vocab size = 856

model = Sequential()
model.add(Embedding(input_dim = 856, 
                    output_dim = 100, 
                    input_length = 50))
# input: (samples = 2338, features = 50)
# output: (timesteps = 2338, samples = 50, features = 100)
model.add(LSTM(20, return_sequences=True))
model.add(LSTM(10))
# output: (None, 100) 
model.add(Dense(100, activation='relu'))
# output: (None, 100) 
# following layer should always have # units = dict size so that it matches the output
model.add(Dense(856, activation='softmax'))
# output: (None, 856) 
# Shape of labels is: (2338, 856)
model.summary()

model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

training_history = model.fit(x = X_data, 
                             y = Y_data, 
                             batch_size = 20, 
                             epochs = 34,
                             validation_data=(X_validation, Y_validation))

# =========================================  Testing and Evaluation  =========================================

plot_acc_loss(training_history)

next_in_sequence_int = model.predict_classes(X_test) # note we use predict classes

# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

next_in_sequence_predicted = list()
next_in_sequence_actual = list()

for word_int in next_in_sequence_int:
  for word, index in tokenizer.word_index.items():
    if index == word_int:
      out_word_1 = word
      break
  next_in_sequence_predicted.append(out_word_1)

for word_int in Y_test: 
  for word, index in tokenizer.word_index.items():
    if index == word_int:
      out_word_2 = word
      break
  next_in_sequence_actual.append(out_word_2)
  
print(next_in_sequence_predicted)
print(next_in_sequence_actual)

# Observations

"""

This was a deeper network with two LSTM layers with fewer units. 
The model took a long time per epoch (23 sec to 19 sec).
Furthermore, the model did not learn anything valuable as the training accuracy hoveered around 21%.
This led to the predictions being completely incorrect

Epoch 34/34
2338/2338 [==============================] - 19s 8ms/step - loss: 3.2064 - acc: 0.2126 - val_loss: 3.0521 - val_acc: 0.2487


"""

In [0]:
# Experiment 3: Undertanding LSTM and using it in a small architecture.

# =========================================   Preparing test sets   =========================================

X_train = X_data[:1554]
Y_train = Y_data[:1554]

print("Training Input shape {0} ; Output shape {1}".format(X_train.shape, Y_train.shape))

X_validation = X_data[1554:2330]
Y_validation = Y_data[1554:2330]

print("Validation Input shape {0} ; Output shape {1}".format(X_validation.shape, Y_validation.shape))

X_test = X_data[2330:]

# since we need actual tokenized ints and not categorical ints
array_doc_lines_int_seq = array(doc_lines_int_seq)
next_in_sequence_actual_int = array_doc_lines_int_seq[:,-1]
Y_test = next_in_sequence_actual_int[2330:]

print("Test Input shape {0} ; Output shape {1}".format(X_test.shape, Y_test.shape))


# =========================================  The model   =========================================
# Working of LSTM
# while calculating the transformations, using None instead of #samples works too

# vocab size = 856

model = Sequential()
model.add(Embedding(input_dim = 856, 
                    output_dim = 50, 
                    input_length = 50))
# input: (samples = 2338, features = 50)
# output: (timesteps = 2338, samples = 50, features = 100)
model.add(LSTM(units = 100, 
               activation = 'tanh', 
               use_bias = 'False'))
# output: (None, 100) 
model.add(Dense(100, activation='relu'))
# output: (None, 100) 
# following layer should always have # units = dict size so that it matches the output
model.add(Dense(856, activation='softmax'))
# output: (None, 856) 
# Shape of labels is: (2338, 856)
model.summary()


model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

training_history = model.fit(x = X_data, 
                             y = Y_data, 
                             batch_size = 20, 
                             epochs = 34,
                             validation_data=(X_validation, Y_validation))

# =========================================  Testing and Evaluation  =========================================

plot_acc_loss(training_history)

next_in_sequence_int = model.predict_classes(X_test) # note we use predict classes

# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

next_in_sequence_predicted = list()
next_in_sequence_actual = list()

for word_int in next_in_sequence_int:
  for word, index in tokenizer.word_index.items():
    if index == word_int:
      out_word_1 = word
      break
  next_in_sequence_predicted.append(out_word_1)

for word_int in Y_test: 
  for word, index in tokenizer.word_index.items():
    if index == word_int:
      out_word_2 = word
      break
  next_in_sequence_actual.append(out_word_2)
  
print(next_in_sequence_predicted)
print(next_in_sequence_actual)


# Observations

"""

This was a network with one LSTM layers with # units > output dimensionality of the embedding layer. 
The model converges fairly quickly (15 sec to 10 sec).
Furthermore, the model quickly learnt sequences with validation loss at 66%.
The model predicted all words correctly bu that might just be co-incidence.

Epoch 34/34
2338/2338 [==============================] - 11s 5ms/step - loss: 1.6142 - acc: 0.5774 - val_loss: 1.4094 - val_acc: 0.6649


"""

In [0]:
# Experiment 4: Undertanding LSTM and using it in a small architecture.

# =========================================   Preparing test sets   =========================================

X_train = X_data[:1554]
Y_train = Y_data[:1554]

print("Training Input shape {0} ; Output shape {1}".format(X_train.shape, Y_train.shape))

X_validation = X_data[1554:2330]
Y_validation = Y_data[1554:2330]

print("Validation Input shape {0} ; Output shape {1}".format(X_validation.shape, Y_validation.shape))

X_test = X_data[2330:]

# since we need actual tokenized ints and not categorical ints
array_doc_lines_int_seq = array(doc_lines_int_seq)
next_in_sequence_actual_int = array_doc_lines_int_seq[:,-1]
Y_test = next_in_sequence_actual_int[2330:]

print("Test Input shape {0} ; Output shape {1}".format(X_test.shape, Y_test.shape))


# =========================================  The model   =========================================
# Working of LSTM
# while calculating the transformations, using None instead of #samples works too

# vocab size = 856

model = Sequential()
model.add(Embedding(input_dim = 856, 
                    output_dim = 50, 
                    input_length = 50))
# input: (samples = 2338, features = 50)
# output: (timesteps = 2338, samples = 50, features = 100)
model.add(LSTM(units = 50, 
               activation = 'tanh', 
               use_bias = 'False'))
# output: (None, 100) 
model.add(Dense(100, activation='relu'))
# output: (None, 100) 
# following layer should always have # units = dict size so that it matches the output
model.add(Dense(856, activation='softmax'))
# output: (None, 856) 
# Shape of labels is: (2338, 856)
model.summary()


model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

training_history = model.fit(x = X_data, 
                             y = Y_data, 
                             batch_size = 20, 
                             epochs = 34,
                             validation_data=(X_validation, Y_validation))

# =========================================  Testing and Evaluation  =========================================

plot_acc_loss(training_history)

next_in_sequence_int = model.predict_classes(X_test) # note we use predict classes

# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

next_in_sequence_predicted = list()
next_in_sequence_actual = list()

for word_int in next_in_sequence_int:
  for word, index in tokenizer.word_index.items():
    if index == word_int:
      out_word_1 = word
      break
  next_in_sequence_predicted.append(out_word_1)

for word_int in Y_test: 
  for word, index in tokenizer.word_index.items():
    if index == word_int:
      out_word_2 = word
      break
  next_in_sequence_actual.append(out_word_2)
  
print(next_in_sequence_predicted)
print(next_in_sequence_actual)


# Observations

"""

This was a network with one LSTM layers with # units = output dimensionality of the embedding layer. Both reduced to 50
The model converges fairly quickly (15 sec to 10 sec).
Furthermore, the model quickly learnt sequences with validation loss at 66%.
The model predicted 6/8 words correctly.

Epoch 34/34
2338/2338 [==============================] - 10s 4ms/step - loss: 1.5056 - acc: 0.6322 - val_loss: 1.3629 - val_acc: 0.6765


"""