In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Dropout
from tensorflow.keras.layers import CuDNNLSTM as LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
import tensorflow.keras.utils as ku 

import pandas as pd
import numpy as np

import tensorflow.keras as keras
import tensorflow as tf

import random
import math
import re

In [None]:
# Create a TF1-style session whose config lists a device count of 1 for GPU
# and 56 for CPU, then register it as the session used by the Keras backend.
# NOTE(review): device_count presumably caps the number of devices per type
# rather than CPU threads — confirm that 56 CPU devices is intended.
config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 56} ) 
sess = tf.Session(config=config) 
keras.backend.set_session(sess)

In [None]:
def get_sequence_of_tokens(text):
    """Fit the shared tokenizer on ``text`` and expand it into n-gram prefixes.

    The module-level ``tokenizer`` is fitted on ``text`` as a side effect.
    Every line is then encoded to token ids, and each prefix of length two or
    more (up to the full line) becomes one training sequence.

    Args:
        text: Iterable of strings, one entry per line/snippet.

    Returns:
        A ``(input_sequences, total_words)`` tuple: the list of prefix
        sequences, and the number of entries in the tokenizer's word index
        plus one.
    """
    tokenizer.fit_on_texts(text)
    vocab_size = len(tokenizer.word_index) + 1

    prefixes = []
    for line in text:
        encoded = tokenizer.texts_to_sequences([line])[0]
        # Prefixes start at length 2 so every sequence has at least one
        # context token ahead of the token to be predicted.
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes, vocab_size

def generate_padded_sequences(input_sequences):
    """Pad sequences to a common length and split them into inputs and labels.

    Sequences are left-padded ('pre') to the length of the longest one. The
    final token of each padded row becomes the label, one-hot encoded over
    ``total_words`` classes; the remaining tokens are the predictors.

    Note: ``total_words`` is read from module scope, so it must be defined
    before this function is called.

    Args:
        input_sequences: Non-empty list of token-id sequences.

    Returns:
        A ``(predictors, label, max_sequence_len)`` tuple.
    """
    longest = max(len(seq) for seq in input_sequences)
    padded = np.array(pad_sequences(input_sequences, maxlen=longest, padding='pre'))

    # Last column is the word to predict; everything before it is context.
    context = padded[:, :-1]
    one_hot_target = ku.to_categorical(padded[:, -1], num_classes=total_words)
    return context, one_hot_target, longest

def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by greedily predicting ``next_words`` more words.

    Each step encodes the text generated so far with the module-level
    ``tokenizer``, left-pads it to ``max_sequence_len - 1`` tokens, and
    appends the word whose index receives the highest predicted score.

    Args:
        seed_text: Initial text to continue.
        next_words: Number of words to append.
        model: Model whose ``predict`` returns one row of per-class scores
            for each input sequence.
        max_sequence_len: Sequence length including the label token; inputs
            are padded to one less than this.

    Returns:
        The seed text followed by the generated words, title-cased.
    """
    # Build the index -> word lookup once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # argmax over predict() is used rather than Sequential.predict_classes,
        # which newer TensorFlow releases no longer provide.
        scores = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        # An index that is absent from word_index contributes an empty word.
        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text.title()

def create_model(max_sequence_len, total_words, embedding_dim=10,
                 lstm_units=700, dropout_rate=0.2):
    """Build and compile the stacked-LSTM next-word classifier.

    The network is an embedding layer, three LSTM layers each followed by
    dropout, and a softmax layer over the vocabulary. It is compiled with
    categorical cross-entropy loss and the Adam optimizer.

    Args:
        max_sequence_len: Padded sequence length including the label token;
            the model input length is one less than this.
        total_words: Number of output classes (and embedding rows).
        embedding_dim: Size of each embedding vector.
        lstm_units: Number of units in every LSTM layer.
        dropout_rate: Dropout fraction applied after every LSTM layer.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``max_sequence_len`` is below 2, which would leave no
            input tokens once the label token is removed.
    """
    input_len = max_sequence_len - 1
    if input_len < 1:
        raise ValueError(
            "max_sequence_len must be at least 2, got %r" % (max_sequence_len,)
        )

    model = Sequential()
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # The first two LSTM layers emit full sequences for the next LSTM to
    # consume; the last one emits only its final output for the Dense layer.
    for return_sequences in (True, True, False):
        model.add(LSTM(lstm_units, return_sequences=return_sequences))
        model.add(Dropout(dropout_rate))

    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

In [None]:
# Shared Tokenizer instance: get_sequence_of_tokens and generate_text read it
# as a module-level global instead of receiving it as an argument.
tokenizer = Tokenizer()

In [None]:
# Read the corpus inside a context manager so the file handle is closed as
# soon as the contents are loaded, then lower-case it.
with open("dylanThomas.txt") as corpus_file:
    text = corpus_file.read()
text = text.lower()

# Split on single spaces and on newlines. The capture group keeps every "\n"
# as its own token; filter(None, ...) drops the empty strings and None
# placeholders that re.split produces.
words = list(filter(None,re.split('(\n)| ', text)))

# Build len(words) // 2 training snippets, each made of 4-7 consecutive tokens
# starting at a random position.
# NOTE(review): no random seed is set in this cell, so each run samples a
# different training set.
sample_count = len(words) // 2
words_list = [None] * sample_count

# Clamp the upper bound so a corpus shorter than seven tokens does not hand
# randint an empty range.
last_start = max(0, len(words) - 7)
for x in range(sample_count):
    ind_start = random.randint(0, last_start)
    ran = random.randint(4, 7)
    words_list[x] = ' '.join(words[ind_start:ind_start+ran])

In [None]:
# Fit the shared tokenizer on the sampled snippets and expand each snippet
# into its n-gram prefix sequences; total_words is the size of the tokenizer's
# word index plus one.
inp_sequences, total_words = get_sequence_of_tokens(words_list)

In [None]:
# Left-pad the sequences to a common length and split each one into its
# predictors (all but the last token) and a one-hot label for the last token.
# generate_padded_sequences reads total_words from module scope, so the cell
# that defines total_words must run before this one.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [None]:
# Build the compiled network for this sequence length and vocabulary size,
# then print its layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

In [None]:
# Train on all sequences for a fixed 200 epochs with a batch size of 1000.
# No validation data or callbacks are passed, so the imported EarlyStopping
# is not applied to this run.
model.fit(predictors, label, epochs=200, batch_size=1000)

In [None]:
# Write the trained weights to 'dylanThomasAdam.h5'.
# NOTE(review): save_weights presumably stores weights only, so reloading
# would need the model rebuilt via create_model first — confirm.
model.save_weights('dylanThomasAdam.h5')

In [None]:
# Generate 25 words continuing the seed "Poetry". generate_text returns
# title-cased text, which .lower() converts back to lower case for display.
generate_text("Poetry", 25, model, max_sequence_len).lower()