In [90]:
import warnings
warnings.filterwarnings('ignore')

In [91]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
 

## Load Text

In [61]:
#Load doc into memory
def load_doc(filename):
    with open(filename,'r') as f:
        text = f.read()
    return text
        

In [62]:
# load document
in_filename = '../dataset/republic_clean.txt'
doc = load_doc(in_filename)
print(doc[:200])

BOOK I.

I went down yesterday to the Piraeus with Glaucon the son of Ariston,
that I might offer up my prayers to the goddess (Bendis, the Thracian
Artemis.); and also because I wanted to see in what


In [63]:
import string
#turn a doc into clean tokens
def clean_doc(doc):
    #Replace '--' with ' '
    doc = doc.replace('--',' ')
    #split into tokens by space
    tokens = doc.split()
    #remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # make lower case
    tokens = [word.lower() for word in tokens]
    return tokens

In [64]:
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['book', 'i', 'i', 'went', 'down', 'yesterday', 'to', 'the', 'piraeus', 'with', 'glaucon', 'the', 'son', 'of', 'ariston', 'that', 'i', 'might', 'offer', 'up', 'my', 'prayers', 'to', 'the', 'goddess', 'bendis', 'the', 'thracian', 'artemis', 'and', 'also', 'because', 'i', 'wanted', 'to', 'see', 'in', 'what', 'manner', 'they', 'would', 'celebrate', 'the', 'festival', 'which', 'was', 'a', 'new', 'thing', 'i', 'was', 'delighted', 'with', 'the', 'procession', 'of', 'the', 'inhabitants', 'but', 'that', 'of', 'the', 'thracians', 'was', 'equally', 'if', 'not', 'more', 'beautiful', 'when', 'we', 'had', 'finished', 'our', 'prayers', 'and', 'viewed', 'the', 'spectacle', 'we', 'turned', 'in', 'the', 'direction', 'of', 'the', 'city', 'and', 'at', 'that', 'instant', 'polemarchus', 'the', 'son', 'of', 'cephalus', 'chanced', 'to', 'catch', 'sight', 'of', 'us', 'from', 'a', 'distance', 'as', 'we', 'were', 'starting', 'on', 'our', 'way', 'home', 'and', 'told', 'his', 'servant', 'to', 'run', 'and', 'bid',

In [65]:
# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
# select sequence of tokens
    seq = tokens[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 118633


In [66]:
sequences[0]

'book i i went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was'

In [67]:
sequences[1]

'i i went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was delighted'

In [68]:
#save tokens to file, one dialague per line
def save_doc(lines,filename):
    with open(filename,'w') as f:
        f.write("\n".join(lines))
    

In [69]:
# save sequences to file
out_filename = '../dataset/republic_sequences.txt'
save_doc(sequences, out_filename)

In [70]:
#Load sequence

in_filename = '../dataset/republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [71]:
lines[0]

'book i i went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was'

In [72]:
#Encode sequence
tokenizer = Tokenizer()
#First, the Tokenizer must be trained on the entire training dataset, which means it 
#finds all of the unique words in the data and assigns each a unique integer.
tokenizer.fit_on_texts(lines)

sequences = tokenizer.texts_to_sequences(lines)

In [73]:
#We can access the mapping of words to integers as a dictionary attribute called word_index on the Tokenizer object.
#We need to know the size of the vocabulary for defining the embedding layer later. 
#We can determine the vocabulary by calculating the size of the mapping dictionary.

# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

In [74]:
vocab_size

7410

### Sequence Inputs and Output

In [75]:
#Now that we have encoded the input sequences, we need to separate them into input (X) and output (y) elements.
sequences = array(sequences)
X,y = sequences[:,:-1],sequences[:,-1]

In [76]:
#need to create one hot representation as we need to train.
#y is having 1 for corresponding word & 0 for the remaining entries.
#it's of vocabulary size.
y = to_categorical(y, num_classes=vocab_size)

In [77]:
y.shape

(118633, 7410)

### Fit model

In [78]:
seq_length = X.shape[1]
seq_length

50

In [79]:
#define model
model = Sequential()
model.add(Embedding(vocab_size,50,input_length=seq_length)) #we are learning 50 dimension embedding vector.
model.add(LSTM(100,return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100,activation='relu'))
model.add(Dense(vocab_size,activation='softmax'))
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 50)            370500    
_________________________________________________________________
lstm_3 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 7410)              748410    
Total params: 1,269,810
Trainable params: 1,269,810
Non-trainable params: 0
_________________________________________________________________
None


In [81]:
#compile model
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

#fit model
#model.fit(X,y,batch_size=128,epochs=100)


In [92]:
#Model has been trained on colab 

from random import randint
from pickle import load
#from keras.models import load_model
import tensorflow.keras as keras
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences


# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
	result = list()
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# predict probabilities for each word
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
		result.append(out_word)
	return ' '.join(result)

# load cleaned text sequences
in_filename = '../deep learning/republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1



# load the tokenizer
tokenizer = load(open('../deep learning/tokenizer.pkl', 'rb'))

# load the model
model = load_model('../deep learning/word_prediction_RNN.h5')

# select a seed text
#seed_text = lines[randint(0,len(lines))]
#print(seed_text + '\n')

# generate new text
#generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
#print(generated)


TypeError: __init__() got an unexpected keyword argument 'ragged'

In [93]:
# select a seed text
seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')

a step backward for we have gone wrong in the order of the sciences what was the mistake he said after plane geometry i said we proceeded at once to solids in revolution instead of taking solids in themselves whereas after the second dimension the third which is concerned with cubes



In [94]:
result = list()
in_text = seed_text