In [24]:
import string
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

Using TensorFlow backend.


In [2]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text
 
# turn a doc into clean tokens
def clean_doc(doc):
    # replace '--' with a space ' '
    doc = doc.replace('--', ' ')
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # make lower case
    tokens = [word.lower() for word in tokens]
    return tokens
 
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [3]:
# load document
in_filename = 'republic_clean.txt'
doc = load_doc(in_filename)
print(doc[:200])

I went down yesterday to the Piraeus with Glaucon the son of Ariston,
that I might offer up my prayers to the goddess (Bendis, the Thracian
Artemis.); and also because I wanted to see in what manner t


In [10]:
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('\n Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['i', 'went', 'down', 'yesterday', 'to', 'the', 'piraeus', 'with', 'glaucon', 'the', 'son', 'of', 'ariston', 'that', 'i', 'might', 'offer', 'up', 'my', 'prayers', 'to', 'the', 'goddess', 'bendis', 'the', 'thracian', 'artemis', 'and', 'also', 'because', 'i', 'wanted', 'to', 'see', 'in', 'what', 'manner', 'they', 'would', 'celebrate', 'the', 'festival', 'which', 'was', 'a', 'new', 'thing', 'i', 'was', 'delighted', 'with', 'the', 'procession', 'of', 'the', 'inhabitants', 'but', 'that', 'of', 'the', 'thracians', 'was', 'equally', 'if', 'not', 'more', 'beautiful', 'when', 'we', 'had', 'finished', 'our', 'prayers', 'and', 'viewed', 'the', 'spectacle', 'we', 'turned', 'in', 'the', 'direction', 'of', 'the', 'city', 'and', 'at', 'that', 'instant', 'polemarchus', 'the', 'son', 'of', 'cephalus', 'chanced', 'to', 'catch', 'sight', 'of', 'us', 'from', 'a', 'distance', 'as', 'we', 'were', 'starting', 'on', 'our', 'way', 'home', 'and', 'told', 'his', 'servant', 'to', 'run', 'and', 'bid', 'us', 'wait'

In [14]:
# organize into sequences of tokens
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 118631


In [17]:
sequences[:5]

['i went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was delighted with',
 'went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was delighted with the',
 'down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was delighted with the procession',
 'yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see 

In [18]:
# save sequences to file
out_filename = 'republic_sequences.txt'
save_doc(sequences, out_filename)

In [19]:
# load
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [22]:
lines[:5]

['i went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was delighted with',
 'went down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was delighted with the',
 'down yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see in what manner they would celebrate the festival which was a new thing i was delighted with the procession',
 'yesterday to the piraeus with glaucon the son of ariston that i might offer up my prayers to the goddess bendis the thracian artemis and also because i wanted to see 

In [25]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [28]:
sequences[:2]

[[11,
  1045,
  329,
  7409,
  4,
  1,
  2873,
  35,
  213,
  1,
  261,
  3,
  2251,
  9,
  11,
  179,
  817,
  123,
  92,
  2872,
  4,
  1,
  2250,
  7408,
  1,
  7407,
  7406,
  2,
  75,
  120,
  11,
  1266,
  4,
  110,
  6,
  30,
  168,
  16,
  49,
  7405,
  1,
  1609,
  13,
  57,
  8,
  549,
  151,
  11,
  57,
  1265,
  35],
 [1045,
  329,
  7409,
  4,
  1,
  2873,
  35,
  213,
  1,
  261,
  3,
  2251,
  9,
  11,
  179,
  817,
  123,
  92,
  2872,
  4,
  1,
  2250,
  7408,
  1,
  7407,
  7406,
  2,
  75,
  120,
  11,
  1266,
  4,
  110,
  6,
  30,
  168,
  16,
  49,
  7405,
  1,
  1609,
  13,
  57,
  8,
  549,
  151,
  11,
  57,
  1265,
  35,
  1]]

In [29]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'of': 3,
 'to': 4,
 'is': 5,
 'in': 6,
 'he': 7,
 'a': 8,
 'that': 9,
 'be': 10,
 'i': 11,
 'not': 12,
 'which': 13,
 'are': 14,
 'you': 15,
 'they': 16,
 'or': 17,
 'will': 18,
 'said': 19,
 'as': 20,
 'we': 21,
 'but': 22,
 'have': 23,
 'them': 24,
 'his': 25,
 'for': 26,
 'by': 27,
 'who': 28,
 'their': 29,
 'what': 30,
 'then': 31,
 'this': 32,
 'one': 33,
 'if': 34,
 'with': 35,
 'there': 36,
 'all': 37,
 'true': 38,
 'at': 39,
 'when': 40,
 'do': 41,
 'other': 42,
 'has': 43,
 'yes': 44,
 'any': 45,
 'him': 46,
 'no': 47,
 'good': 48,
 'would': 49,
 'may': 50,
 'state': 51,
 'from': 52,
 'man': 53,
 'say': 54,
 'our': 55,
 'only': 56,
 'was': 57,
 'an': 58,
 'must': 59,
 'should': 60,
 'so': 61,
 'more': 62,
 'us': 63,
 'can': 64,
 'on': 65,
 'were': 66,
 'very': 67,
 'now': 68,
 'like': 69,
 'such': 70,
 'replied': 71,
 'just': 72,
 'certainly': 73,
 'than': 74,
 'also': 75,
 'these': 76,
 'men': 77,
 'same': 78,
 'another': 79,
 'about': 80,
 'justice': 8

In [30]:
len(tokenizer.word_index)

7409

In [31]:
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

In [33]:
# separate into input and output
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]

In [35]:
X

array([[  11, 1045,  329, ...,   11,   57, 1265],
       [1045,  329, 7409, ...,   57, 1265,   35],
       [ 329, 7409,    4, ..., 1265,   35,    1],
       ...,
       [ 382,  467,    4, ..., 1044,  414,   13],
       [ 467,    4,   33, ...,  414,   13,   21],
       [   4,   33,   79, ...,   13,   21,   23]])

In [37]:
X.shape

(118631, 50)

In [36]:
y

array([  35,    1, 2874, ...,   21,   23,   85])

In [38]:
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [39]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [40]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            370500    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 7410)              748410    
Total params: 1,269,810
Trainable params: 1,269,810
Non-trainable params: 0
_________________________________________________________________
None


In [41]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1079e5438>

In [43]:
# save the model to file
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

# Use Language Model

In [49]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [42]:
# load cleaned text sequences
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [None]:
# We need the text so that we can choose a source sequence as input to the model for generating a new sequence 
# of text.

In [44]:
seq_length = len(lines[0].split()) - 1

In [45]:
seq_length

50

In [50]:
# load the model
model = load_model('model.h5')

# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

In [51]:
# select a seed text
seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')

very true he replied yet having begun we must go forward to the rough places of the law at the same time begging of these gentlemen for once in their life to be serious not long ago as we shall remind them the hellenes were of the opinion which is still



In [52]:
# encoded = tokenizer.texts_to_sequences([seed_text])[0]

In [67]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)
 
generated = generate_seq(model, tokenizer, seq_length, seed_text,40)
print(generated)

and the state of the state of the state of the state of the state of the state of the state of the state of the state of the state of the state of the state of the state of
