In [123]:
# From https://stackabuse.com/text-generation-with-python-and-tensorflow-keras/

import re
import pandas as pd
import pickle
import numpy
import sys
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

# nltk.download('stopwords')

# Load titles
titles = pd.DataFrame(pickle.load(open("pickle/complete_features.p", "rb" )))['title'].tolist()
# and exclude any that don't start with a word character
regex = re.compile(r'^\w')
titles = list(filter(regex.search, titles))
# and convert to lowercase
titles = [t.lower() for t in titles]
# and deduplicate
titles = list(set(titles))

In [124]:
inputs = " ".join([t.lower() for t in titles])
tokenizer = RegexpTokenizer(r'\w+')
words = tokenizer.tokenize(inputs)

In [125]:
chars = sorted(list(set(" ".join(words))))
char_to_num = dict((c, i) for i, c in enumerate(chars))

In [126]:
words = words[-1000:]
len(words)

1000

In [127]:
inputs = " ".join(words)

input_len = len(inputs)
vocab_len = len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)


seq_length = 100
x_data = []
y_data = []

Total number of characters: 5501
Total vocab: 65


In [128]:
# loop through inputs, start at the beginning and go until we hit
# the final character we can create a sequence out of
for i in range(0, input_len - seq_length, 1):
    # Define input and output sequences
    # Input is the current character plus desired sequence length
    in_seq = inputs[i:i + seq_length]

    # Out sequence is the initial character plus total sequence length
    out_seq = inputs[i + seq_length]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [129]:
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 5401


In [130]:
X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

In [131]:
y = np_utils.to_categorical(y_data)

In [132]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [133]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [136]:
filepath = "title_model_weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [135]:
model.fit(X, y, epochs=4, batch_size=256, callbacks=desired_callbacks)

Epoch 1/4

Epoch 00001: loss improved from inf to 3.33884, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 3.33884 to 3.09448, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 3.09448 to 3.07483, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 3.07483 to 3.06890, saving model to model_weights_saved.hdf5


<keras.callbacks.callbacks.History at 0x16bbdabeef0>

In [138]:
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [139]:
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [147]:
start = numpy.random.randint(0, len(x_data) - 1)
pattern = x_data[start]
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" s reel the captain rock 2 bracken highland cutting bracken alte galopp c gl02207 ann s reel ein s us "
