Importing required libraries

In [4]:
import numpy
import sys
import nltk
nltk.download('stopwords')
import tensorflow
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout, LSTM
from keras.utils import np_utils
from tensorflow.keras.layers import Input
from tensorflow.keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Using TensorFlow backend.


In [5]:
# loading data
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read instead of leaving it open for
# the lifetime of the kernel.
with open('frankenstein.txt') as corpus_file:
    file = corpus_file.read()

In [6]:
# tokenisation and standardisation
def tokenize_words(input):
    """Lower-case the text, drop English stop words and return the rest as one string.

    Args:
        input: Raw text to process (the name shadows the built-in ``input``;
            it is kept so existing callers are unaffected).

    Returns:
        A single string made of the surviving ``\\w+`` tokens concatenated
        in their original order.
    """
    text = input.lower()
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    # Build the stop-word set once; calling stopwords.words() inside the
    # filter re-evaluated the word list for every single token.
    stop_words = set(stopwords.words('english'))
    kept = (token for token in tokens if token not in stop_words)
    # NOTE(review): tokens are joined with an empty separator, so word
    # boundaries are not present in the returned string - confirm this is
    # intended before changing it, since the vocabulary depends on it.
    return "".join(kept)

processed_inputs= tokenize_words(file)

In [7]:
# characters to numbers
# `chars` is the sorted list of distinct characters in the processed text;
# `char_to_num` maps each character to its position in that list.
chars = sorted(set(processed_inputs))
char_to_num = {char: position for position, char in enumerate(chars)}

In [8]:
# sanity check: report the size of the processed text and of the vocabulary
vocab_len = len(chars)
input_len = len(processed_inputs)
print('Total number of characters:', input_len)
print('Total vocab:', vocab_len)

Total number of characters: 220857
Total vocab: 42


In [9]:
# sequence length and empty containers for the training patterns
seq_length = 100
x_data, y_data = [], []

In [10]:
# building the (input window, next character) training pairs
# Encode the text once, then slide a window of `seq_length` characters over
# it: each window is an input pattern and the character right after it is
# the target.
# Both lists are rebuilt from scratch here, so re-running this cell does not
# append a second copy of every pattern.
encoded = [char_to_num[char] for char in processed_inputs]
window_starts = range(0, input_len - seq_length)
x_data = [encoded[i:i + seq_length] for i in window_starts]
y_data = [encoded[i + seq_length] for i in window_starts]

n_patterns = len(x_data)
print('Total patterns:', n_patterns)

Total patterns: 220757


In [11]:
# converting input seq into numpy array
# Reshape the integer windows to (n_patterns, seq_length, 1) and scale them
# by the vocabulary size in a single expression.
x = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [12]:
# one-hot encoding
# Pin the number of classes to the vocabulary size. Left to infer it,
# to_categorical uses max(y_data) + 1, which would yield fewer columns than
# there are characters if the highest-numbered character never occurs as a
# target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [13]:
# creating the model
# Three stacked LSTM layers (256, 256, 128 units), each followed by a
# Dropout(0.2) layer, ending in a softmax Dense layer with one unit per
# column of `y`.
time_steps, n_features = x.shape[1], x.shape[2]
model = Sequential([
    LSTM(256, input_shape=(time_steps, n_features), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [14]:
# compiling the model: categorical cross-entropy loss, Adam optimizer
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [15]:
# checkpointing
# Save to `filepath` whenever the monitored training 'loss' reaches a new
# minimum (save_best_only with mode='min').
filepath = 'model_weights_saved.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [16]:
# training: 4 epochs in batches of 256, checkpointing through the callbacks
model.fit(
    x,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)

Epoch 1/4
Epoch 00001: loss improved from inf to 2.93011, saving model to model_weights_saved.hdf5
Epoch 2/4
Epoch 00002: loss improved from 2.93011 to 2.90790, saving model to model_weights_saved.hdf5
Epoch 3/4
Epoch 00003: loss improved from 2.90790 to 2.89644, saving model to model_weights_saved.hdf5
Epoch 4/4
Epoch 00004: loss improved from 2.89644 to 2.86463, saving model to model_weights_saved.hdf5


<tensorflow.python.keras.callbacks.History at 0x7f201119fe10>

In [17]:
# restore the checkpointed weights, then compile the model again with the
# same loss and optimizer as before
filename = 'model_weights_saved.hdf5'
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [19]:
# inverse lookup: position in `chars` -> character, used to decode model output
num_to_char = dict(enumerate(chars))

In [21]:
# random seed to help in generation
# randint's upper bound is exclusive, so passing len(x_data) makes every
# pattern (including the last one) eligible as a seed.
start = numpy.random.randint(0, len(x_data))
# Take a copy so that later changes to `pattern` cannot alter the training
# window stored in x_data.
pattern = list(x_data[start])
print('Random Seed:')
print("\"", ''.join([num_to_char[value] for value in pattern]),"\"")

Random Seed:
" ndencephilosopherscountrywhoseknowledgediscoveriesindispensableusepresentundertakinglattermethodobta "


In [22]:
# generating text
# Greedy decoding: predict the most likely next character from the current
# window, emit it, then slide the window forward by one character.
for i in range(1000):
    # Use a dedicated name for the single-window input so the training
    # tensor `x` is not overwritten (re-running the fit cell afterwards
    # would otherwise train on this one sample).
    model_input = numpy.reshape(pattern, (1, len(pattern), 1)) / float(vocab_len)
    prediction = model.predict(model_input, verbose=0)
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build the next window as a new list instead of appending in place, so
    # the list that `pattern` was originally taken from is never mutated.
    pattern = list(pattern[1:]) + [index]

reeneeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee