In [1]:
import nltk
import numpy as np
import os
import random
import sys

from keras.callbacks import LambdaCallback
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop

In [14]:
nltk.download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\piyus\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\piyus\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     C:\Users\piyus\AppData\Roaming\nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\piyus\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     C:\Users\piyus\AppData\Roaming\nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     C:\Users\piyus\AppData\R

True

In [2]:
database_dir = "C:/Users/piyus/AppData/Roaming/nltk_data/corpora/state_union"

file_list = []
for root, _ , files in os.walk(database_dir):  
    for filename in files:
        file_list.append(os.path.join(root, filename))
        
print("Read ", len(file_list), " files..." )

docs = []

for files in file_list:
    with open(files, 'r') as fin:
        try:
            str_form = fin.read().lower().replace('\n', '')
            docs.append(str_form)
        except UnicodeDecodeError: 
            pass
text = ' '.join(docs)

print('corpus length:', len(text))

Read  66  files...
corpus length: 2066704


In [3]:
text[:1000]

"president harry s. truman's address before a joint session of the congress april 16, 1945mr. speaker, mr. president, members of the congress:it is with a heavy heart that i stand before you, my friends and colleagues, in the congress of the united states.only yesterday, we laid to rest the mortal remains of our beloved president, franklin delano roosevelt. at a time like this, words are inadequate. the most eloquent tribute would be a reverent silence.yet, in this decisive hour, when world events are moving so rapidly, our silence might be misunderstood and might give comfort to our enemies.in his infinite wisdom, almighty god has seen fit to take from us a great man who loved, and was beloved by, all humanity.no man could possibly fill the tremendous void left by the passing of that noble soul. no words can ease the aching hearts of untold millions of every race, creed and color. the world knows it has lost a heroic champion of justice and freedom.tragic fate has thrust upon us grave

In [4]:
chars = sorted(list(set(text)))
print('Total Number of Unique Characters:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars)) # Character to index
indices_char = dict((i, c) for i, c in enumerate(chars)) # Index to Character

Total Number of Unique Characters: 65


In [5]:
maxlen = 40 # Number of characters considered
step = 3 # Stide of our window
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    # The character just after the sequence is the label
    next_chars.append(text[i + maxlen]) 
print('nb sequences:', len(sentences))


nb sequences: 688888


In [6]:
print('Vectorization...')
# Initializing Tensor (training data)
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool) 
# Initializing Output that holds next character (label)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool) 
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1 
    y[i, char_indices[next_chars[i]]] = 1


Vectorization...


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y = np.zeros((len(sentences), len(chars)), dtype=np.bool)


In [7]:
def sample(preds, temperature=1.0):
    """Perform Temperature Sampling"""
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature 
    exp_preds = np.exp(preds)
    # Softmax of predictions
    preds = exp_preds / np.sum(exp_preds) 
    probas = np.random.multinomial(1, preds, 1) 
    return np.argmax(probas)
def on_epoch_end(epoch, _):    
    # Save model weights into file
    model.save_weights('saved_weights.hdf5', overwrite=True)
    
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
checkpointer = ModelCheckpoint(filepath='/tmp/weights.hdf5', verbose=1, save_best_only=True)

In [8]:
print('Building model...')
# Size of vector in the hidden layer.
hidden_size = 128 
# Initialize Sequential Model
model = Sequential()
model.add(LSTM(hidden_size, input_shape=(maxlen, len(chars))))
# Add the output layer that is a softmax of the number of characters
model.add(Dense(len(chars), activation='softmax')) 
# Optimization through RMSprop
optimizer_new = RMSprop() 
# Consider cross Entropy loss. Why? MLE of P(D | theta)
model.compile(loss='categorical_crossentropy', optimizer=optimizer_new) 

# Train this for 30 epochs. Size of output from LSTM i.e. hidden layer vector shape=128
model.fit(x, y,
          batch_size=128,
          epochs=30,
          callbacks=[print_callback, checkpointer])

Building model...
Epoch 1/30
Epoch 2/30

KeyboardInterrupt: 