In [1]:
# Imports

from keras import Sequential
from keras.layers import LSTM, Dense, Dropout, Dropout, Activation, Reshape
from keras.losses import categorical_crossentropy
from sklearn import preprocessing
import numpy as np

In [2]:
# Load & sort

# Read the corpus inside a context manager so the file handle is closed
# as soon as the text has been read, rather than whenever the garbage
# collector happens to reclaim it.
with open("data/romeoandjuliet.txt", "r") as corpus:
    text = corpus.read()

# Only the first half of the corpus is kept.
text = text[:round(len(text)/2)]

# Vocabulary: every distinct character, in sorted order so that a
# character's index is stable between runs.
chardict = sorted(set(text))

total = len(text)
chars = len(chardict)

print("Total Charaters        :", total)
print("Total Unique Charaters :", chars)

Total Charaters        : 26288
Total Unique Charaters : 64


In [3]:
# Format

chunklength = 25
step = 1

# Start offsets of every window that still has one character after it;
# that trailing character is the label for the window.
starts = range(0, len(text) - chunklength, step)

# sentences[k] is a chunklength-character window, characters[k] is the
# single character that immediately follows it in the text.
sentences = [text[start : start + chunklength] for start in starts]
characters = [text[start + chunklength] for start in starts]

chunks = len(sentences)
print("Total Chunks :", chunks)

Total Chunks : 26263


In [4]:
# Sample

# Show the first window and the character the model should predict for it.
samples = (
    ("Sample Chunk     : ", sentences[0]),
    ("Sample Character : ", characters[0]),
)
for label, value in samples:
    print(label, value)

Sample Chunk     :  ROMEO AND JULIET

by Will
Sample Character :  i


In [5]:
# Format

# Character -> column index, built once so each lookup is O(1) instead of
# a linear chardict.index() scan per character.
charindex = {ch: col for col, ch in enumerate(chardict)}

# One-hot tensors. The builtin `bool` is used as the dtype because the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
#   x: (chunks, chunklength, chars) - one row per character of each window
#   y: (chunks, 1, chars)           - the character following each window
x = np.zeros((chunks, chunklength, chars), dtype=bool)
y = np.zeros((chunks, 1, chars), dtype=bool)

for row, sentence in enumerate(sentences):
    for pos, ch in enumerate(sentence):
        x[row, pos, charindex[ch]] = True

for row, ch in enumerate(characters):
    y[row, 0, charindex[ch]] = True

print("Total Data Values  : ", chunks * chunklength * chars)
print("Total Label Values : ", chunks * 1 * chars)
print("X Shape :", x.shape)
print("Y Shape :", y.shape)

Total Data Values  :  42020800
Total Label Values :  1680832
X Shape : (26263, 25, 64)
Y Shape : (26263, 1, 64)


In [6]:
# Model

# Layer stack, in order:
#   LSTM       - emits an output for every timestep (return_sequences=True)
#   Dense      - projects each timestep to `chars` units
#   Dropout    - 10% dropout
#   Reshape    - flattens all timesteps into one vector of chunklength * chars
#   Dense      - maps that vector to one score per character
#   Activation - softmax over the character scores
model = Sequential([
    LSTM(2 * chars, return_sequences=True, input_shape=(chunklength, chars)),
    Dense(chars),
    Dropout(0.1),
    Reshape((1, chunklength * chars)),
    Dense(chars),
    Activation("softmax"),
])

model.summary()

model.compile(optimizer="rmsprop", loss=categorical_crossentropy)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 25, 128)           98816     
_________________________________________________________________
dense (Dense)                (None, 25, 64)            8256      
_________________________________________________________________
dropout (Dropout)            (None, 25, 64)            0         
_________________________________________________________________
reshape (Reshape)            (None, 1, 1600)           0         
_________________________________________________________________
dense_1 (Dense)              (None, 1, 64)             102464    
_________________________________________________________________
activation (Activation)      (None, 1, 64)             0         
Total params: 209,536
Trainable params: 209,536
Non-trainable params: 0
__________________________________________________

In [19]:
# Train

# Fit on the one-hot windows (x) against the next-character targets (y).
# Note that the mini-batch size simply reuses chunklength (25); the two
# values are otherwise unrelated.
model.fit(x, y, epochs=4, batch_size=chunklength)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x17d5059df70>

In [20]:
# User Input

# Seed text: the first vocabulary character repeated five times. An
# `or sentences[0]` fallback after this expression would be dead code,
# because a non-empty string is always truthy; assign sentences[0]
# directly instead to seed with the first training window.
userinput = chardict[0] * 5

# One-hot encode the seed into a single-sample batch. Timesteps beyond
# len(userinput) stay all-False. The builtin `bool` replaces the `np.bool`
# alias, which was deprecated in NumPy 1.20 and removed in 1.24.
userdata = np.zeros((1, chunklength, chars), dtype=bool)

for pos, ch in enumerate(userinput):
    userdata[0, pos, chardict.index(ch)] = True

In [21]:
# Prediction

# Seed generation with training window 53, given a leading batch axis so
# its shape is (1, chunklength, chars).
# NOTE(review): `userdata` from the "User Input" cell is not used here.
inputdata = x[53][np.newaxis, ...]

# predict() returns one result per sample; keep the only sample's output.
prediction = model.predict(inputdata)[0]

print("Input shape : ", inputdata.shape)

Input shape :  (1, 25, 64)


In [24]:
# Clean

# Greedy text generation: repeatedly take the most probable next character,
# slide the input window forward by one character, and predict again.

totalprediction = ""
length = 250

# Character -> one-hot column, built once for O(1) lookups.
charindex = {ch: col for col, ch in enumerate(chardict)}

# Decode the current one-hot input window back to text a single time.
# Inside the loop the window is tracked as a string, so it never has to be
# decoded from the tensor again.
window = "".join(
    chardict[col]
    for row in inputdata[0]
    for col, hot in enumerate(row)
    if hot
)

for _ in range(length):

    # Most probable character according to the latest prediction.
    nextchar = chardict[int(np.argmax(prediction[0]))]

    # New Prediction

    # Drop the oldest character and append the predicted one.
    window = window[1:] + nextchar

    # Re-encode the window. The builtin `bool` replaces the `np.bool`
    # alias, which was deprecated in NumPy 1.20 and removed in 1.24.
    inputdata = np.zeros((1, chunklength, chars), dtype=bool)
    for pos, ch in enumerate(window):
        inputdata[0, pos, charindex[ch]] = True

    # verbose=0 keeps Keras from printing a progress bar on every call.
    prediction = model.predict(inputdata, verbose=0)[0]

    totalprediction += nextchar


In [25]:
# Show the generated text accumulated by the generation loop.
print(totalprediction)

he to leap and sto the with his the farle the with his the part as and and the will at me the with his the tarl as and sto the with his light fare the with his the wall at you the wall at the to hear wellow the to the tare the faither the the with th
