In [1]:
!pip install tensorflow



In [2]:
!pip install nltk



In [3]:
!pip install keras



In [4]:
#import dependencies
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense,Dropout,LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Koushik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.


In [5]:
#Load data
file = open("frankenstein-2.txt").read()

In [6]:
file

'London was our present point of rest; we determined to remain several months in this wonderful and celebrated city. Clerval desired the intercourse of the men of genius and talent who flourished at this time, but this was with me a secondary object; I was principally occupied with the means of obtaining the information necessary for the completion of my promise and quickly availed myself of the letters of introduction that I had brought with me, addressed to the most distinguished natural philosophers.\n\nIf this journey had taken place during my days of study and happiness, it would have afforded me inexpressible pleasure. But a blight had come over my existence, and I only visited these people for the sake of the information they might give me on the subject in which my interest was so terribly profound. Company was irksome to me; when alone, I could fill my mind with the sights of heaven and earth; the voice of Henry soothed me, and I could thus cheat myself into a transitory peace

In [7]:
#tokenization
#standardization
def tokenize_words(input):
    input=input.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    filtered = filter(lambda token:token not in stopwords.words('english'),tokens)
    return "".join(filtered)

processed_inputs = tokenize_words(file)

In [8]:
#chars to numbers
chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c,i) for i,c in enumerate(chars))

In [9]:
#check if words to chars or chars to num 
input_len=len(processed_inputs)
vocab_len=len(chars)
print("Total number of characters:",input_len)
print("Total vocab",vocab_len)

Total number of characters: 2030
Total vocab 26


In [10]:
#seq length
seq_length=100
x_data=[]
y_data=[]


In [11]:
#loop through the sequence
for i in range(0,input_len - seq_length,1):
    in_seq = processed_inputs[i:i + seq_length]
    out_seq = processed_inputs[i+seq_length]
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])
    
n_patterns = len(x_data)
print("Total Patterns:",n_patterns)

Total Patterns: 1930


In [12]:
#convert input sequence to np array and so on
X =numpy.reshape(x_data,(n_patterns,seq_length,1))
X = X/float(vocab_len)

In [13]:
#one hot encoding
y = np_utils.to_categorical(y_data)

In [14]:
#creating the model
model = Sequential()
model.add(LSTM(256,input_shape=(X.shape[1],X.shape[2]),return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1],activation='softmax'))

In [15]:
#compile the model
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [16]:
#saving the weights
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath,monitor='loss',verbose=1,save_best_only=True,mode='min')
desired_callbacks = [checkpoint]

In [17]:
#fit the model and train the data
model.fit(X,y,epochs=4,batch_size=256,callbacks=desired_callbacks)

Epoch 1/4

Epoch 00001: loss improved from inf to 3.10530, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 3.10530 to 2.98297, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.98297 to 2.97194, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.97194 to 2.96493, saving model to model_weights_saved.hdf5


<keras.callbacks.callbacks.History at 0x1b6b498b448>

In [18]:
#recompile the model with same weights
filename="model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [19]:
#output of the model back to characters
num_to_char = dict((i,c) for i,c in enumerate(chars))

In [20]:
#random seed to help generate
start = numpy.random.randint(0,len(x_data)-1)
pattern = x_data[start]
print("Random Seed: ")
print("\"",''.join([num_to_char[value] for value in pattern]),"\"")

Random Seed: 
" peditionintendfollowgreatroadedinburghvisitwindsoroxfordmatlockcumberlandlakesresolvingarrivecomplet "


In [23]:
#generate the text
for i in range(1000):
    x = numpy.reshape(pattern,(1,len(pattern),1))
    x = x/float(vocab_len)
    prediction =  model.predict(x,verbose=0)
    index = numpy.argmax(prediction)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]

eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee