In [29]:
from __future__ import print_function
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import LambdaCallback, EarlyStopping, TensorBoard
import datetime
import random
import requests
import pandas as pd
import numpy as np
import sys
import os

In [7]:
url = "https://www.gutenberg.org/files/100/100-0.txt"

r = requests.get(url)
r.encoding = r.apparent_encoding
data = r.text
data = data.split('\r\n')
toc = [l.strip() for l in data[44:130:2]]
# Skip the Table of Contents
data = data[135:]

# Fixing Titles
toc[9] = 'THE LIFE OF KING HENRY V'
toc[18] = 'MACBETH'
toc[24] = 'OTHELLO, THE MOOR OF VENICE'
toc[34] = 'TWELFTH NIGHT: OR, WHAT YOU WILL'

locations = {id_:{'title':title, 'start':-99} for id_,title in enumerate(toc)}

# Start 
for e,i in enumerate(data):
    for t,title in enumerate(toc):
        if title in i:
            locations[t].update({'start':e})
            

df_toc = pd.DataFrame.from_dict(locations, orient='index')
# Convert to Dataframe
df_toc['end'] = df_toc['start'].shift(-1).apply(lambda x: x-1)
df_toc.loc[42, 'end'] = len(data)
df_toc['end'] = df_toc['end'].astype('int')

df_toc['text'] = df_toc.apply(lambda x: '\r\n'.join(data[ x['start'] : int(x['end']) ]), axis=1)

In [8]:
df_toc

Unnamed: 0,title,start,end,text
0,THE TRAGEDY OF ANTONY AND CLEOPATRA,-99,14379,
1,AS YOU LIKE IT,14380,17171,AS YOU LIKE IT\r\n\r\n\r\nDRAMATIS PERSONAE.\r...
2,THE COMEDY OF ERRORS,17172,20372,THE COMEDY OF ERRORS\r\n\r\n\r\n\r\nContents\r...
3,THE TRAGEDY OF CORIOLANUS,20373,30346,THE TRAGEDY OF CORIOLANUS\r\n\r\nDramatis Pers...
4,CYMBELINE,30347,30364,CYMBELINE.\r\nLaud we the gods;\r\nAnd let our...
5,"THE TRAGEDY OF HAMLET, PRINCE OF DENMARK",30365,37051,"THE TRAGEDY OF HAMLET, PRINCE OF DENMARK\r\n\r..."
6,THE FIRST PART OF KING HENRY THE FOURTH,37052,41767,THE FIRST PART OF KING HENRY THE FOURTH\r\n\r\...
7,THE SECOND PART OF KING HENRY THE FOURTH,41768,-100,THE SECOND PART OF KING HENRY THE FOURTH\r\n\r...
8,THE LIFE OF KING HENRY THE FIFTH,-99,45176,
9,THE LIFE OF KING HENRY V,45177,53383,THE LIFE OF KING HENRY V\r\n\r\n\r\n\r\nConten...


In [11]:
df_toc[df_toc['title'].str.match('OTHELLO, THE MOOR OF VENICE')]

Unnamed: 0,title,start,end,text
24,"OTHELLO, THE MOOR OF VENICE",103827,114239,"OTHELLO, THE MOOR OF VENICE\r\n\r\n\r\n\r\nCon..."


In [12]:
locations[24]

{'start': 103827, 'title': 'OTHELLO, THE MOOR OF VENICE'}

In [14]:
othello = df_toc['text'][24]
len(othello)

275333

In [15]:
chars = list(set(othello))

char_int = {c:i for i,c in enumerate(chars)}
int_char = {i:c for i,c in enumerate(chars)}

print(f"Othello contains {len(chars)} unique characters.")

Othello contains 74 unique characters.


In [17]:
maxlen = 150
step = 1

encoded = [char_int[c] for c in othello]

sequences = [] # Each element is 40 characters long
next_chars = [] # One element for each sequence

for i in range(0, len(encoded) - maxlen, step):
    sequences.append(encoded[i : i + maxlen])
    next_chars.append(encoded[i + maxlen])
    
print('sequences:', len(sequences))

sequences: 275183


In [18]:
x = np.zeros((len(sequences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sequences), len(chars)), dtype=np.bool)

for i, sequence in enumerate(sequences):
    for t, char in enumerate(sequence):
        x[i,t,char] = 1
        
    y[i, next_chars[i]] = 1

In [19]:
print(x.shape, y.shape)

(275183, 150, 74) (275183, 74)


In [20]:
model = Sequential()
model.add(LSTM(256, input_shape=(maxlen, len(chars)), dropout=0.2))
model.add(Dense(len(chars), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='nadam')

In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256)               338944    
_________________________________________________________________
dense (Dense)                (None, 74)                19018     
Total params: 357,962
Trainable params: 357,962
Non-trainable params: 0
_________________________________________________________________


In [22]:
def sample(preds):
  
    """helper function to sample an index from a probability array"""
    
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / 1
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [23]:
def on_epoch_end(epoch, _):
    """Function invoked at end of each epoch. Prints generated text"""
    
    print()
    print('----- Generating text after Epoch: %d' % epoch)
    
    start_index = random.randint(0, len(othello) - maxlen - 1)
    
    generated = ''
    
    sentence = othello[start_index: start_index + maxlen]
    generated += sentence
    
    print('----- Generating with seed: ----- \n')
    sys.stdout.write(generated)
    
    print('\n\n\n-----New text: -----')

    for i in range(400):
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_int[char]] = 1
            
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds)
        next_char = int_char[next_index]
        
        sentence = sentence[1:] + next_char
        
        sys.stdout.write(next_char)
    print()
    sys.stdout.flush()
    print('\n\n')
    
    
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [30]:
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = TensorBoard(logdir, histogram_freq=1)

model.fit(x, y,
          batch_size=128,
          validation_split=.2,
          epochs=10,
          callbacks=[print_callback, 
                     #EarlyStopping(min_delta=.02, monitor='val_loss', patience=10),
                     tensorboard_callback])

Epoch 1/10
----- Generating text after Epoch: 0
----- Generating with seed: ----- 

mit unto, — our sovereign.

ALL.
Live, noble Helicane!

HELICANUS.
For honour’s cause, forbear your suffrages:
If that you love Prince Pericles


-----New text: -----
 om hemats eees ’th thahbrens,
O dy for winen, mo to you by good Ware.
Thath she hay wial iel merin thinger’d Dead;
Ih ’tich: she lald lodent’d mpind,

And a aidghei. Dimbingbow ey,


DESDEMONA.
O, my heavy, and with hers tho, pin to morl at of ithand
buth mose, cather. Bie Clord,
Thad for shame theneet’h  stild. Low bothut cour;
We el



Epoch 2/10
----- Generating text after Epoch: 1
----- Generating with seed: ----- 

re
ruffians, you’ll have your daughter cover’d with a Barbary horse;
you’ll have your nephews neigh to you; you’ll have coursers for cousins
and ge


-----New text: -----



Thiseeg love of Eyain aloul:
And I’st of a shals, and amod crmed,
AFisen for goll, flr the glamat-somad ie.
Tbal ir fexheaven teet! rhabrasteslenenne


<tensorflow.python.keras.callbacks.History at 0x7f30b9407080>