In [24]:
# importing packages

import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout, GRU
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import string
from pickle import dump
from pickle import load
from keras.models import load_model
from random import randint

# function for loading .txt file from the Kaggle input directory
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

# function for loading .txt file from the Kaggle output directory
def load_dockaggle(filename):
    filepath = '/kaggle/working/' + filename
    with open(filepath, 'r') as file:
        text = file.read()
    return text

def clean_doc(doc):
    doc = doc.replace('--', ' ')
    tokens = doc.split()
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word.lower() for word in tokens]
    return tokens

def save_doc(lines, filename):
    out_filename = os.path.splitext(os.path.basename(filename))[0] + '_sequences.txt'
    out_filepath = os.path.join('/kaggle/working', out_filename)
    with open(out_filepath, 'w') as file:
        file.write('\n'.join(lines))
    
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        yhat = np.argmax(model.predict(encoded, verbose = 0), axis = 1)
        # https://stackoverflow.com/questions/68836551/keras-attributeerror-sequential-object-has-no-attribute-predict-classes
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)
            

In [3]:
# initializing the initial .txt file
my_file = '/kaggle/input/philosopher-stone/philosopherstone.txt'
doc = load_doc(my_file)
print(doc[:200])

THE BOY WHO LIVED 

Mr. and Mrs. Dursley, of number four, Privet Drive, 
were proud to say that they were perfectly normal, 
thank you very much. They were the last people you’d 
expect to be involved


In [4]:
# cleaning the document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['the', 'boy', 'who', 'lived', 'mr', 'and', 'mrs', 'dursley', 'of', 'number', 'four', 'privet', 'drive', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', 'thank', 'you', 'very', 'much', 'they', 'were', 'the', 'last', 'people', 'expect', 'to', 'be', 'involved', 'in', 'anything', 'strange', 'or', 'mysterious', 'because', 'they', 'just', 'hold', 'with', 'such', 'nonsense', 'mr', 'dursley', 'was', 'the', 'director', 'of', 'a', 'firm', 'called', 'grunnings', 'which', 'made', 'drills', 'he', 'was', 'a', 'big', 'beefy', 'man', 'with', 'hardly', 'any', 'neck', 'although', 'he', 'did', 'have', 'a', 'very', 'large', 'mustache', 'mrs', 'dursley', 'was', 'thin', 'and', 'blonde', 'and', 'had', 'nearly', 'twice', 'the', 'usual', 'amount', 'of', 'neck', 'which', 'came', 'in', 'very', 'useful', 'as', 'she', 'spent', 'so', 'much', 'of', 'her', 'time', 'craning', 'over', 'garden', 'fences', 'spying', 'on', 'the', 'neighbors', 'the', 'dursley', 's', 'had', 'a', 'small', 'son',

In [5]:
seq_len = 50
# organize into sequences of tokens
length = seq_len + 1
sequences = list()
for i in range(length, len(tokens)):
 # select sequence of tokens
 seq = tokens[i-length:i]
 # convert into a line
 line = ' '.join(seq)
 # store
 sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 73592


In [6]:
# save sequences to file
out_filename = my_file[:-4] + '_sequences.txt'
save_doc(sequences, out_filename)

in_filename = 'philosopherstone_sequences_sequences.txt'
doc = load_dockaggle(in_filename)
lines = doc.split('\n')

tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

vocab_size = len(tokenizer.word_index) + 1

sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes = vocab_size)
seq_length = X.shape[1]

In [10]:
# LSTM Model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length = seq_length))
model.add(LSTM(100, return_sequences = True))
model.add(LSTM(100, return_sequences = True))
model.add(LSTM(100))
model.add(Dense(100, activation = 'relu'))
model.add(Dense(vocab_size, activation = 'softmax'))
print(model.summary())

model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

history = model.fit(X, y, batch_size = 32, epochs = 10)

model.save('model.h5')

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 50)            278100    
                                                                 
 lstm_3 (LSTM)               (None, 50, 100)           60400     
                                                                 
 lstm_4 (LSTM)               (None, 50, 100)           80400     
                                                                 
 lstm_5 (LSTM)               (None, 100)               80400     
                                                                 
 dense_2 (Dense)             (None, 100)               10100     
                                                                 
 dense_3 (Dense)             (None, 5562)              561762    
                                                                 
Total params: 1,071,162
Trainable params: 1,071,162
No

In [17]:
# generating text through the LSTM model
dump(tokenizer, open('tokenizer.pkl', 'wb'))

seq_length = len(lines[0].split()) - 1

model = load_model('model.h5')
tokenizer = load(open('tokenizer.pkl', 'rb'))

for i in range(10):
    seed_text = lines[randint(0, len(lines))]

    print("Seed", i + 1, ": ")
    print(seed_text + '\n')
    print("------------------------------------------------------------")

    generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
    seed_text = generated
    print(generated)
    print("\n------------------------------------------------------------")
    print("\nDone.")
    print("\n")

Seed 1 : 
around alone but different i look older and head page harry potter and the philosophers stone jk rowling am wearing the badge like bill used to and holding the house cup and the quidditch cup quidditch captain ron tore his eyes away from this splendid sight to look excitedly at harry

------------------------------------------------------------
and a few hall and a bit of the first thing he was a bit of the first thing he was a bit of the first thing he was a bit of the first thing he was a bit of the first thing he was a bit of the first

------------------------------------------------------------

Done.


Seed 2 : 
b book on vampires he looked terrified at the very thought but the others let professor quirrell keep harry to himself it took almost ten minutes to get away from them all at last hagrid managed to make himself heard over the babble page harry potter and the philosophers stone jk

------------------------------------------------------------
rowling the boy was a

In [21]:
# GRU model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(GRU(100, return_sequences=True))
model.add(GRU(100, return_sequences=True))
model.add(GRU(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

print(model.summary())

model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

history = model.fit(X, y, batch_size = 32, epochs = 10)

model.save('model_gru.h5')

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 50, 50)            278100    
                                                                 
 gru_6 (GRU)                 (None, 50, 100)           45600     
                                                                 
 gru_7 (GRU)                 (None, 50, 100)           60600     
                                                                 
 gru_8 (GRU)                 (None, 100)               60600     
                                                                 
 dense_8 (Dense)             (None, 100)               10100     
                                                                 
 dense_9 (Dense)             (None, 5562)              561762    
                                                                 
Total params: 1,016,762
Trainable params: 1,016,762
No

In [23]:
# generating text through the GRU model
dump(tokenizer, open('tokenizer.pkl', 'wb'))

seq_length = len(lines[0].split()) - 1

model = load_model('model_gru.h5')
tokenizer = load(open('tokenizer.pkl', 'rb'))

for i in range(10):
    seed_text = lines[randint(0, len(lines))]

    print("Seed", i + 1, ": ")
    print(seed_text + '\n')
    print("------------------------------------------------------------")

    generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
    seed_text = generated
    print(generated)
    print("\n------------------------------------------------------------")
    print("\nDone.")
    print("\n")

Seed 1 : 
still muttering curious said harry mr ollivander fixed harry with his pale stare remember every wand ever sold mr potter every single wand it so happens that the phoenix whose tail feather is in your wand gave another feather just one other it is very curious indeed that you should be

------------------------------------------------------------
the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the

------------------------------------------------------------

Done.


Seed 2 : 
from wizarding families talked about quidditch constantly ron had already had a big argument with dean thomas who shared their dormitory about soccer ron see what was exciting about a game with only one ball where no one was allowed to fly harry had caught ron prodding poster of west ham

------------------------------------------------------------
the the th