In [1]:
import numpy as np

# creating set of unique characters and vocabulary with indexes
chars = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '.', ',', '!', '?', ':', ';', "\'", '(', ')', '-', ' ',  '\n']
vocabulary = {}
indexes = {}
one_hot_vectors = np.zeros((len(chars), len(chars)), dtype=float)
block_size = 40
index = 0
# creating indexed vocabulary
for char in chars:
    vocabulary[char] = index
    one_hot_vectors[index, index] = 1
    index += 1

# importing corpus and case folding vocabulary
def get_corpus(filename):
    file = open(filename)
    corpus = file.read().lower()
    return corpus

In [2]:
# creating samples 
def get_training_data(corpus):
    # slicing the corpus into inputs and labels
    inputs = [corpus[x:x+block_size] for x in range(0,len(corpus)-41,3)]
    outputs = [corpus[x:x+block_size] for x in range(1,len(corpus)-40,3)]
    return(inputs, outputs)

In [3]:
# creating one hot vectors from input and labels 
def get_samples(inputs, outputs):
    np.random.seed(0)
    X = np.zeros((len(inputs), block_size, len(vocabulary)), dtype=float)
    y = np.zeros((len(outputs), block_size, len(vocabulary)), dtype=float)
    for sample in range(len(inputs)):
        for char in range(40):
            X[sample, char, vocabulary[inputs[sample][char]]] = 1
    for sample in range(len(outputs)):
        for char in range(40):
            y[sample, char, vocabulary[inputs[sample][char]]] = 1
    return(X,y)

In [11]:
# converting one hot vectors into their character representation
def vec_to_char(vec):
    char = ''
    for i in range(38):
        if vec[i] == 1:
            char = chars[i]
    return char

# converting characters into their one hot vector representation
def char_to_vec():
    return one_hot_vectors[vocabulary[char]]
        

In [65]:
# getting corpus, inputs, lables, and training sets
corpus = get_corpus('shakespeare.txt')
inputs, outputs = get_training_data(corpus)
X, y = get_samples(inputs, outputs)
print('Vocabulary size:', len(vocabulary))
print('Number of training samples:', len(X))

Vocabulary size: 38
Number of training samples: 31212


In [34]:
import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras import models, Input
from tensorflow.keras.layers import LSTM, TimeDistributed, Dense
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
import os
# creating language model with recurrent neural network
model = Sequential()
model.add(Input(shape=(block_size, len(vocabulary))))
model.add(LSTM(256, return_sequences=True, dropout=0.2))
model.add(TimeDistributed(Dense(len(vocabulary), activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], run_eagerly=True)
print(model.summary())

# training model 
model.fit(X, y, epochs=100, verbose=2, batch_size=256)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 40, 256)           302080    
                                                                 
 time_distributed_3 (TimeDis  (None, 40, 38)           9766      
 tributed)                                                       
                                                                 
Total params: 311,846
Trainable params: 311,846
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
122/122 - 25s - loss: 2.3264 - accuracy: 0.4473 - 25s/epoch - 203ms/step
Epoch 2/100
122/122 - 27s - loss: 0.6741 - accuracy: 0.8521 - 27s/epoch - 217ms/step
Epoch 3/100
122/122 - 27s - loss: 0.5196 - accuracy: 0.8615 - 27s/epoch - 218ms/step
Epoch 4/100
122/122 - 26s - loss: 0.4775 - accuracy: 0.8668 - 26s/epoch - 214ms/step
Epoch 5/100
122/122 - 26s - loss:

Epoch 88/100
122/122 - 25s - loss: 0.2848 - accuracy: 0.9110 - 25s/epoch - 207ms/step
Epoch 89/100
122/122 - 25s - loss: 0.2834 - accuracy: 0.9115 - 25s/epoch - 208ms/step
Epoch 90/100
122/122 - 26s - loss: 0.2819 - accuracy: 0.9117 - 26s/epoch - 211ms/step
Epoch 91/100
122/122 - 25s - loss: 0.2817 - accuracy: 0.9120 - 25s/epoch - 209ms/step
Epoch 92/100
122/122 - 25s - loss: 0.2816 - accuracy: 0.9118 - 25s/epoch - 208ms/step
Epoch 93/100
122/122 - 25s - loss: 0.2811 - accuracy: 0.9120 - 25s/epoch - 205ms/step
Epoch 94/100
122/122 - 25s - loss: 0.2796 - accuracy: 0.9124 - 25s/epoch - 208ms/step
Epoch 95/100
122/122 - 25s - loss: 0.2784 - accuracy: 0.9130 - 25s/epoch - 207ms/step
Epoch 96/100
122/122 - 25s - loss: 0.2762 - accuracy: 0.9136 - 25s/epoch - 206ms/step
Epoch 97/100
122/122 - 25s - loss: 0.2774 - accuracy: 0.9130 - 25s/epoch - 207ms/step
Epoch 98/100
122/122 - 25s - loss: 0.2762 - accuracy: 0.9135 - 25s/epoch - 205ms/step
Epoch 99/100
122/122 - 25s - loss: 0.2767 - accuracy: 

<keras.callbacks.History at 0x1f80137b1f0>

In [62]:
# sampling from predictions
def get_sample(predictions, temperature):
    predictions = np.asarray(predictions).astype('float64')
    predictions = np.log(predictions) / temperature
    expected = np.exp(predictions)
    predictions = expected /np.sum(expected)
    probabilities = np.random.multinomial(1, predictions, size=1)
    return np.argmax(probabilites)

In [63]:
# generating text
def generate(model, seed):
    num_chars = 400
    text = seed
    seed = [seed]
    inputs, outputs = get_training_data(seed)
    X, y = get_samples(inputs, outputs)
    text = []
    temperature = .75
    
    #model.reset_states()
    for i in range(num_chars):
        print(seed)
        predictions = model.predict(X)
        yhat_index = sample(predictions, temperature)
        text.append(yhat)
        seed = [text[1:]]
        inputs, ouputs = get_training_data(seed)
        X, y = get_samples(inputs, outputs)
    return text
    

In [64]:
seed = 'hello good sir im glad to see you todays'
char2index = None
idx2char = np.array(chars)
text = generate(model, seed)
print(text)

['hello good sir im glad to see you todays']


ValueError: Unexpected result of `predict_function` (Empty batch_outputs). Please use `Model.compile(..., run_eagerly=True)`, or `tf.config.run_functions_eagerly(True)` for more information of where went wrong, or file a issue/bug to `tf.keras`.

In [50]:
# generating text using the model 
corpus_prediction = get_corpus('hold-out.txt')
seed = np.random.randint(0, len(corpus_prediction) - 40)
seed = ['hello good sir im glad to see you todays']
words = seed[0]

for i in range(20):
    inputs, outputs = get_training_data(seed)
    x_predict, y_predict = get_samples(inputs, outputs)
    predictions = model.predict(x_predict)
    prediction = np.argmax(predictions)
    #y_predict = [lambda x : vec_to_char(x) for x in prediction]
    for yhat in y_predict:
        for letter in yhat:
            words += vec_to_char(letter)
            print(words)
    seed = [words[1:]]
    print(seed)
print(words)

40
['hello good sir im glad to see you todays']


ValueError: Unexpected result of `predict_function` (Empty batch_outputs). Please use `Model.compile(..., run_eagerly=True)`, or `tf.config.run_functions_eagerly(True)` for more information of where went wrong, or file a issue/bug to `tf.keras`.