In [7]:
import spacy
import urllib

nlp = spacy.load('en',disable=['parser', 'tagger','ner'])

# Load and tokenize

In [45]:
harry_potter_text = urllib. request. urlopen(r"https://raw.githubusercontent.com/formcept/whiteboard/master/nbviewer/notebooks/data/harrypotter/Book%201%20-%20The%20Philosopher's%20Stone.txt")
harry_potter_text = harry_potter_text.read().decode("utf-8") 
harry_potter_text[:100]

'/ \n\n\n\n\nTHE BOY WHO LIVED \n\nMr. and Mrs. Dursley, of number four, Privet Drive, \nwere proud to say th'

In [46]:
# removing unwanted characters

def remove_punc(text):
    return [token.text.lower() for token in nlp(text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n \\xe2  ']

harry_potter_tokens = remove_punc(harry_potter_text)
harry_potter_tokens[:10], len(harry_potter_tokens)


(['\n\n\n\n\n',
  'the',
  'boy',
  'who',
  'lived',
  'mr.',
  'and',
  'mrs.',
  'dursley',
  'of'],
 90981)

# Create sequence of tokens

In [48]:
def create_sequence_of_tokens(tokens, training_length):
  text_sequences = []

  for i in range(training_length, len(tokens)):
      sequence = tokens[i-training_length:i]
      text_sequences.append(sequence)

  return text_sequences

harry_potter_sequences = create_sequence_of_tokens(harry_potter_tokens, 26)
harry_potter_sequences_in_words = harry_potter_sequences
' '.join(harry_potter_sequences[99]), len(harry_potter_sequences)

('came in very useful as she spent so much of her time craning over garden fences spying on the neighbors the dursley s had a small',
 90955)

# Prepare data

In [19]:
from keras.preprocessing.text import Tokenizer
import numpy as np

In [20]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(harry_potter_sequences)
harry_potter_sequences = tokenizer.texts_to_sequences(harry_potter_sequences)

In [21]:
for i in harry_potter_sequences[99]:
    print(f'{i} : {tokenizer.index_word[i]}')

168 : came
14 : in
85 : very
1337 : useful
25 : as
58 : she
852 : spent
63 : so
167 : much
9 : of
81 : her
113 : time
2509 : craning
87 : over
1338 : garden
3380 : fences
2010 : spying
23 : on
1 : the
2510 : neighbors
1 : the
224 : dursley
2011 : s
20 : had
7 : a
357 : small


In [22]:
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

6171

In [23]:
# Create Numpy array

harry_potter_sequences = np.array(harry_potter_sequences)
harry_potter_sequences

array([[  46,    1,  154, ..., 1099,   12,   85],
       [   1,  154,   74, ...,   12,   85,  167],
       [ 154,   74, 1100, ...,   85,  167,   21],
       ...,
       [  24,  564,   16, ...,    1,   40,   30],
       [ 564,   16,  112, ...,   40,   30,   43],
       [  16,  112,  102, ...,   30,   43,   41]])

## Split into train/test 

In [24]:
from tensorflow.keras.utils import to_categorical

X = harry_potter_sequences[:,:-1]
y = harry_potter_sequences[:, -1]

y = to_categorical(y, num_classes=vocabulary_size+1)
seq_len = X.shape[1]


# Create model

In [25]:
import tensorflow as tf

In [33]:
# Set seed
tf.random.set_seed(42)

# Create model
model = tf.keras.Sequential([
                             tf.keras.layers.Embedding(vocabulary_size+1, 25,
                                                       input_length =seq_len),
                             tf.keras.layers.LSTM(units=150, 
                                                  return_sequences=True),
                             tf.keras.layers.LSTM(units=150),
                             tf.keras.layers.Dense(150, activation='relu'),
                             tf.keras.layers.Dense(vocabulary_size+1, activation='softmax')
])

# Compile model
model.compile(optimizer=tf.optimizers.Adam(),
              loss=tf.losses.CategoricalCrossentropy(),
              metrics=['accuracy'])

# Fit model
history = model.fit(X, y, batch_size=128, epochs=25, verbose=1)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [34]:
from pickle import dump, load

# save model
model.save('harry_potter_text_generator.h5')

# save tokenizer
dump(tokenizer, open('tokenizer', 'wb'))

# Generate text

In [58]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [139]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
   
    
    # Created
    created_text = []
    
    input_text = seed_text
    
    # Generate words
    for i in range(num_gen_words):
        
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad sequences to our trained rate
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict Class Probabilities for each word
        pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        
        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind] 
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        
        created_text.append(pred_word)
        
   
    return ' '.join(created_text)

In [140]:
import random
random.seed(1)
random_pick = random.randint(0,len(harry_potter_sequences))

In [141]:
random_seed_text = harry_potter_sequences_in_words[random_pick]

In [142]:
seed_text = ' '.join(random_seed_text)
seed_text

'the next second dudley was dancing on the spot with his hands clasped over his fat bottom howling in pain when he turned his back on'

In [143]:
generate_text = generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=20)




In [144]:
# load colorama for better visualization
try:
  from colorama import Fore, Back, Style
  print('colorama loaded')
except:
  !pip install colorama
  from colorama import Fore, Back, Style
  print('colorama has been installed and loaded')

colorama loaded


In [145]:
print( Fore.RED + Back.CYAN +'SEED TEXT --->' + Fore.WHITE, Back.BLACK + seed_text)
print(Fore.YELLOW, Back.GREEN + generate_text + Fore.RED + Back.CYAN +'  <--- GENERATED TEXT')
print(Style.RESET_ALL)

[31m[46mSEED TEXT --->[37m [40mthe next second dudley was dancing on the spot with his hands clasped over his fat bottom howling in pain when he turned his back on
[33m [42mthe air and then he was n’t going to be able to find the way of complicated as he was[31m[46m  <--- GENERATED TEXT
[0m
