In [0]:
import keras
import numpy as np

Using TensorFlow backend.


In [0]:
# from google.colab import files
# uploaded = files.upload()
path = keras.utils.get_file(
    'pg5200.txt',
    origin='http://www.gutenberg.org/cache/epub/5200/pg5200.txt')
text = open(path).read().lower()
print('Corpus length:', len(text))

Corpus length: 139056


# Char gen

In [0]:
import re
# text = open('eneida.txt').read().lower()
text = re.sub("[^\S\r\n]+", " ", text)
text = re.sub("–|;|\"", " ", text)
text = re.sub("[\n]+", "\n", text)
text = re.sub("\[\d+\]", "", text)

In [0]:
maxlen = 30
step = 3

sentences = []
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('Number of sequences:', len(sentences))
chars = sorted(list(set(text)))
print('Unique characters:', len(chars))
char_indices = dict((char, chars.index(char)) for char in chars)
print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Number of sequences: 46015
Unique characters: 56
Vectorization...


In [0]:
from keras import layers
model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation='softmax'))

In [0]:
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)


In [0]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [0]:
import numpy as np
def reweight_distribution(original_distribution, temperature=0.5):
    distribution = np.log(original_distribution) / temperature
    distribution = np.exp(distribution)
    return distribution / np.sum(distribution)

In [0]:
import random
import sys

for epoch in range(1, 60):
  print('\nepoch', epoch)
  model.fit(x, y, batch_size=128, epochs=1)
  if epoch < 5:
    continue
  start_index = random.randint(0, len(text) - maxlen - 1)
  generated_text = text[start_index: start_index + maxlen]
  print('--- Generating with seed: "' + generated_text + '"')
  for temperature in [0.2, 0.5, 1.0]:
    print('------ temperature:', temperature)
    sys.stdout.write(generated_text)
    for i in range(400):
      sampled = np.zeros((1, maxlen, len(chars)))
      for t, char in enumerate(generated_text):
        sampled[0, t, char_indices[char]] = 1.
      preds = model.predict(sampled, verbose=0)[0]
      next_index = sample(preds, temperature)
      next_char = chars[next_index]
      generated_text += next_char
      generated_text = generated_text[1:]
      sys.stdout.write(next_char)

In [0]:
model.save_weights('my_model_weights.h5')

In [0]:
"""46015/46015 [==============================] - 17s 376us/step - loss: 0.7537
--- Generating with seed: "s friendly.  he's enjoyed his "
------ temperature: 0.2
s friendly.  he's enjoyed his room, then doorward be never tooke the tad off at himself with them and the chief clerk. 
he was a little when he had done than there was not any was also that his mother to the project 
gutenberg-tm work was something from the couch. one did not the window and whine here for the chest of the window, 
making his father and that was something for him any parable of the family he would be able to grego
------ temperature: 0.5
mily he would be able to gregor the pain with the couch. obo
the couch.
un the furniture and sention
middle of the bed, was that without defin door in the chair and dirked and that was so much of any stammest
thing would grete have to pother and was something first have to th, project
gutenberg-tm electronic work and that was to make thing covered it any of the bedly was happen at the couch, 
mor.ity any plaming ofa all the tame
------ temperature: 1.0
ty any plaming of all the tame doors. he can betted, was that gregor coversly by
the project gutenberg-tm work without the couch bitter,
and was amsalfaw agake us all the effort to be parents yon
could him - they was  mambe intany to get be pait project gutenberg-tm work before. 
we haven to get rem in her room without precuairned.
he was that would go dooward ran apperend for at the care thand from but on the floor to the proj"""



# Word gen

In [0]:
import re
import string
# text = open('eneida.txt').read().lower()
text = re.sub("\d+", "", text)
text = text.translate(str.maketrans('', '', string.punctuation))
text = re.sub("[\n]+", "\n", text)
text = re.sub("\[*\]", "", text)
text = re.sub("[^\S\r\n]+", " ", text)
rgx = re.compile("([\w][\w']*\w)")
text = rgx.findall(text)
text

In [0]:
len(set(text))

3026