<a href="https://colab.research.google.com/github/mashruravi/eip-notes/blob/master/EIP_Phase_2_Assignment_6_Ravi_Suresh_Mashru.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generating Text from Sherlock Holmes

In [1]:
# Pin TensorFlow to 1.13.1: the model cell further down relies on the
# tensorflow.contrib.tpu API, so the version must stay fixed.
!pip install tensorflow-gpu==1.13.1



In [0]:
import numpy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout, InputLayer
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [0]:
# Read the file contents and convert them to lowercase.
# The context manager closes the file handle as soon as the text is read
# (a bare open(...).read() leaves that to the garbage collector).
filename = 'wonderland.txt'
with open(filename) as text_file:
  raw_text = text_file.read().lower()

# Collapse every run of whitespace into a single space. '\s' already matches
# '\n', so this one substitution also replaces the newlines; the raw string
# avoids the invalid-escape warning a plain '\s+' literal triggers.
raw_text = re.sub(r'\s+', ' ', raw_text)

In [0]:
# Vocabulary: the distinct characters of the corpus in sorted order, keeping
# only digits, lowercase letters, '.', ',' and the space character
chars = [c for c in sorted(set(raw_text)) if re.search(r'[0-9a-z\., ]', c)]

# Lookup tables in both directions: character -> index and index -> character
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = dict(enumerate(chars))

In [5]:
# Display the vocabulary to check which characters passed the filter
chars

[' ',
 ',',
 '.',
 '0',
 '3',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [6]:
# Corpus size (every character of the cleaned text) and vocabulary size
n_chars, n_vocab = len(raw_text), len(chars)

summary = (
  'Total characters in text: {}'.format(n_chars),
  'Total characters in vocabulary: {}'.format(n_vocab),
)
print('\n'.join(summary))

Total characters in text: 142492
Total characters in vocabulary: 31


In [7]:
# A line is used for training only if it has MORE than this many valid characters
THRESHOLD=20

# Treat every '.'-delimited sentence as one line. split() drops the '.'
# itself, so the full stop never appears in the training sequences.
story_lines = raw_text.split('.')

# The length of the padded sequences will be the maximum line length
# (measured on the raw lines, before out-of-vocabulary characters are removed)
max_line_len = max(map(len, story_lines))
print('Max line length: ', max_line_len)

sequencesX = []
sequencesY = []

for story_line in story_lines:

  # Encode the line, dropping characters outside the vocabulary. Membership
  # is tested against the char_to_int dict (constant time) instead of
  # scanning the chars list once per character.
  valid_chars_int = [char_to_int[c] for c in story_line if c in char_to_int]

  # Skip lines with THRESHOLD or fewer valid characters
  if len(valid_chars_int) <= THRESHOLD:
    continue

  # Every proper prefix of the line is an input; the character that follows
  # the prefix is its target
  for end in range(1, len(valid_chars_int)):
    sequencesX.append(valid_chars_int[:end])
    sequencesY.append(valid_chars_int[end])


# Pad sequences in sequencesX with maximum line length.
# NOTE(review): pad_sequences fills with 0 by default, which is also the code
# of the first vocabulary character (char_to_int starts at 0), so padding and
# that character look identical to the model -- confirm this is acceptable.
dataX = pad_sequences(sequencesX, maxlen=max_line_len)
dataY = [[x] for x in sequencesY]

Max line length:  1516


In [8]:
n_patterns = len(dataX)
print('Total patterns: {}'.format(n_patterns))

# Reshape X to be [batch size, time steps, features] <-- required by LSTM
X = numpy.reshape(dataX, (n_patterns, max_line_len, 1))

# Scale data to be between 0 and 1
X = X / float(n_vocab)

# One-hot encode the output. num_classes is pinned to the vocabulary size:
# left to its default, to_categorical infers the width from the largest label
# present, so y (and the softmax layer sized from y.shape[1]) would silently
# shrink if the last vocabulary character never occurred as a target.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

Total patterns: 135228


In [0]:
# Split the data into training and validation sets.
# The fixed random_state makes the split reproducible: the model is trained
# over several fit() calls, and re-running an unseeded split in between would
# move validation samples into the training set.
# NOTE(review): the patterns are prefixes of the same sentences, so a random
# split puts near-duplicate sequences in both sets and the validation loss is
# optimistic.
RANDOM_SEED = 42
X_train, X_val, y_train, y_val = train_test_split(
  X, y, test_size=0.15, random_state=RANDOM_SEED)

In [21]:
import os
import tensorflow

# Sequence model over one feature per time step: light dropout on the input,
# a single 512-unit LSTM, and a softmax with one output per column of y
model = Sequential([
  InputLayer(input_shape=(X.shape[1], X.shape[2])),
  Dropout(0.1),
  LSTM(512, input_shape=(X.shape[1], X.shape[2])),
  Dense(y.shape[1], activation='softmax'),
])

# Wrap the Keras model for the TPU whose address is read from the
# COLAB_TPU_ADDR environment variable
tpu_resolver = tensorflow.contrib.cluster_resolver.TPUClusterResolver(
  tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tpu_strategy = tensorflow.contrib.tpu.TPUDistributionStrategy(tpu_resolver)
tpu_model = tensorflow.contrib.tpu.keras_to_tpu_model(model, strategy=tpu_strategy)

tpu_model.compile(loss='categorical_crossentropy', optimizer='adam')

INFO:tensorflow:Querying Tensorflow master (grpc://10.6.233.18:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 3220007950122983503)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 6243412164784341985)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 11512458664191658974)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 5876125204348917811)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 16335538802777417544)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/

In [0]:
# Write a checkpoint each time the monitored training loss reaches a new
# minimum; the epoch number and loss value are embedded in the file name
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
  filepath,
  monitor='loss',
  mode='min',
  save_best_only=True,
  verbose=1,
)
callbacks_list = [checkpoint]

In [23]:
# Initial training run (20 epochs). validation_data is passed as the
# (x_val, y_val) tuple the Keras API documents; a list is not guaranteed to be
# unpacked the same way.
tpu_model.fit(
  X_train, y_train,
  epochs=20,
  batch_size=128,
  validation_data=(X_val, y_val),
  callbacks=callbacks_list,
)

Train on 114943 samples, validate on 20285 samples
Epoch 1/20
INFO:tensorflow:New input shapes; (re-)compiling: mode=train (# of cores 8), [TensorSpec(shape=(16,), dtype=tf.int32, name='core_id_20'), TensorSpec(shape=(16, 1516, 1), dtype=tf.float32, name='input_2_10'), TensorSpec(shape=(16, 31), dtype=tf.float32, name='dense_2_target_10')]
INFO:tensorflow:Overriding default placeholder.
INFO:tensorflow:Cloning Adam {'lr': 0.0010000000474974513, 'beta_1': 0.8999999761581421, 'beta_2': 0.9990000128746033, 'decay': 0.0, 'epsilon': 1e-07, 'amsgrad': False}
INFO:tensorflow:Remapping placeholder for input_2
INFO:tensorflow:KerasCrossShard: <tensorflow.python.keras.optimizers.Adam object at 0x7f16cbca6978> []
INFO:tensorflow:Started compiling
INFO:tensorflow:Finished compiling. Time elapsed: 4.3230369091033936 secs
INFO:tensorflow:Setting weights on TPU model.
INFO:tensorflow:CPU -> TPU lr: 0.0010000000474974513 {0.001}
INFO:tensorflow:CPU -> TPU beta_1: 0.8999999761581421 {0.9}
INFO:tensorfl

<tensorflow.python.keras.callbacks.History at 0x7f16cbeba668>

In [35]:
# Continue training the same tpu_model for 40 more epochs. The checkpoint
# callback is reused, so epoch numbers in the saved file names restart at 01.
# validation_data is passed as the (x_val, y_val) tuple the Keras API documents.
tpu_model.fit(
  X_train, y_train,
  epochs=40,
  batch_size=128,
  validation_data=(X_val, y_val),
  callbacks=callbacks_list,
)

Train on 114943 samples, validate on 20285 samples
Epoch 1/40
Epoch 00001: loss improved from 2.33246 to 2.29652, saving model to weights-improvement-01-2.2965.hdf5
INFO:tensorflow:Copying TPU weights to the CPU
INFO:tensorflow:TPU -> CPU lr: 0.0010000000474974513
INFO:tensorflow:TPU -> CPU beta_1: 0.8999999761581421
INFO:tensorflow:TPU -> CPU beta_2: 0.9990000128746033
INFO:tensorflow:TPU -> CPU decay: 0.0
INFO:tensorflow:TPU -> CPU epsilon: 1e-07
INFO:tensorflow:TPU -> CPU amsgrad: False
Epoch 2/40
Epoch 00002: loss did not improve from 2.29652
Epoch 3/40
Epoch 00003: loss did not improve from 2.29652
Epoch 4/40
Epoch 00004: loss did not improve from 2.29652
Epoch 5/40
Epoch 00005: loss did not improve from 2.29652
Epoch 6/40
Epoch 00006: loss did not improve from 2.29652
Epoch 7/40
Epoch 00007: loss did not improve from 2.29652
Epoch 8/40
Epoch 00008: loss did not improve from 2.29652
Epoch 9/40
Epoch 00009: loss did not improve from 2.29652
Epoch 10/40
Epoch 00010: loss did not imp

<tensorflow.python.keras.callbacks.History at 0x7f16c86c9da0>

In [40]:
# Another 40 epochs on the same tpu_model, again reusing the checkpoint
# callback. validation_data is passed as the (x_val, y_val) tuple the Keras
# API documents.
tpu_model.fit(
  X_train, y_train,
  epochs=40,
  batch_size=128,
  validation_data=(X_val, y_val),
  callbacks=callbacks_list,
)

Train on 114943 samples, validate on 20285 samples
Epoch 1/40
Epoch 00001: loss improved from 2.28421 to 2.28251, saving model to weights-improvement-01-2.2825.hdf5
INFO:tensorflow:Copying TPU weights to the CPU
INFO:tensorflow:TPU -> CPU lr: 0.0010000000474974513
INFO:tensorflow:TPU -> CPU beta_1: 0.8999999761581421
INFO:tensorflow:TPU -> CPU beta_2: 0.9990000128746033
INFO:tensorflow:TPU -> CPU decay: 0.0
INFO:tensorflow:TPU -> CPU epsilon: 1e-07
INFO:tensorflow:TPU -> CPU amsgrad: False
Epoch 2/40
Epoch 00002: loss improved from 2.28251 to 2.28156, saving model to weights-improvement-02-2.2816.hdf5
INFO:tensorflow:Copying TPU weights to the CPU
INFO:tensorflow:TPU -> CPU lr: 0.0010000000474974513
INFO:tensorflow:TPU -> CPU beta_1: 0.8999999761581421
INFO:tensorflow:TPU -> CPU beta_2: 0.9990000128746033
INFO:tensorflow:TPU -> CPU decay: 0.0
INFO:tensorflow:TPU -> CPU epsilon: 1e-07
INFO:tensorflow:TPU -> CPU amsgrad: False
Epoch 3/40
Epoch 00003: loss improved from 2.28156 to 2.27931

<tensorflow.python.keras.callbacks.History at 0x7f16c821a550>

In [0]:
# Load saved weights into the plain (non-TPU) Keras model used for prediction.
# The file name follows the ModelCheckpoint naming pattern (epoch, then loss).
WEIGHTS_FILE = 'weights-improvement-40-2.2336.hdf5'
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.load_weights(WEIGHTS_FILE)

In [46]:
# Number of characters to generate after the seed
N_GENERATED = 500

# Create seed text
seed_text='once upon a time'

# Convert seed text to integers. The seed is lowercased and characters outside
# the vocabulary are dropped (the same clean-up applied to the training text),
# so an arbitrary seed cannot raise a KeyError.
seed_text_int=[char_to_int[c] for c in seed_text.lower() if c in char_to_int]

# Pad the seed text to the sequence length the model was trained on
padded_seed=pad_sequences([seed_text_int], maxlen=max_line_len)

# Generate greedily: at every step take the most probable next character
pattern = padded_seed[0]
predicted_chars = []
for _ in range(N_GENERATED):
  # Same shaping and scaling as the training inputs
  x = numpy.reshape(pattern, (1, len(pattern), 1))
  x = x / float(n_vocab)
  prediction = model.predict(x, batch_size=8)
  index = numpy.argmax(prediction)
  predicted_chars.append(int_to_char[index])
  # Slide the window: drop the oldest time step, append the new character
  pattern = numpy.append(pattern[1:], [index])
predicted_text = ''.join(predicted_chars)

# Concatenate instead of print(seed_text, predicted_text): the two-argument
# form inserts a separator space that the model never generated.
print(seed_text + predicted_text)

once upon a time  to he the todee the woole the was  od the wooder whet iar he  oo  he  oe the aoa sand ehr the war a lott  he the woocer whet ian eere and the was a  ie the wooder taid the hotse  nu wo  ae io a lott  he a  ie to  e d toeale  oe the was a  a  ie a taid the was a  a  i  and the said the had and e  ane the tooe of the taid the ho   ne the aod e dou soeeze a  ie the wesy onteen a  ie the woode the wod th  noc to  ena the toog the thb ooce turel e  and the was a  a  i  and the wooder a              
