<a href="https://colab.research.google.com/github/nredick/mais-hacks-2021/blob/main/src/poem_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read and write the tokenizer
# and the trained model under /content/drive/MyDrive/data.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Download the poem corpus into the working directory.
# TLS certificate verification is left enabled, and -O pins the output name so
# re-running this cell overwrites the file instead of creating "_sortedpoems.txt.1".
!wget -O _sortedpoems.txt \
    https://raw.githubusercontent.com/nredick/mais-hacks-2021/textgen/data/poemdataset/_sortedpoems.txt

In [17]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import numpy as np 
import random

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

tokenizer = Tokenizer()

# Read-only mode is sufficient, and the context manager closes the handle
# (the previous 'r+' open was never closed and demanded write permission).
with open('_sortedpoems.txt', 'r') as f:
  data = f.readlines()

# Only the first 20,000 lines are used; slice before lower-casing so the
# remainder of the file is not processed for nothing.
corpus = [line.lower() for line in data[:20000]]

tokenizer.fit_on_texts(corpus)
# word_index starts at 1, so the vocabulary size needs one extra slot for index 0.
total_words = len(tokenizer.word_index) + 1

print(tokenizer.word_index)
print(total_words)

# Persist the fitted tokenizer so the prediction code can reuse the same word indices.
with open('/content/drive/MyDrive/data/tokenizer.pkl',"wb") as f:
  pickle.dump(tokenizer,f)

Get a sense of the corpus size and the lengths of its first few lines.

In [None]:
print(len(corpus))
# texts_to_sequences expects a list of texts. Passing a bare string makes it
# iterate over characters, which reports character counts instead of token
# counts, so each line is wrapped in a list and its single result unpacked.
# Slicing (rather than indexing 0..3) also tolerates a corpus shorter than four lines.
for line in corpus[:4]:
  print(len(tokenizer.texts_to_sequences([line])[0]))

Preprocessing the data and creating the training vectors.

In [39]:
# Expand every corpus line into all of its prefixes of length >= 2, so each
# prefix supplies "words so far" plus the next word to predict.
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:stop] for stop in range(2, len(token_list) + 1)
    )

# Left-pad every prefix to the length of the longest one.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# The last column is the word to predict; everything before it is the input.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One-hot encode the target word over the whole vocabulary.
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

See the vocabulary and maximum sentence length. All vectors are padded (from the left) to match this length.

In [None]:
# Vocabulary mapping first, then the padded sequence length, one per line.
print(tokenizer.word_index, max_sequence_len, sep='\n')

Create the model and train it.

In [None]:
# Next-word classifier: embedding -> bidirectional LSTM -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(80)))
model.add(Dense(total_words, activation='softmax'))
adam = Adam(learning_rate=0.001)
# Stop once validation accuracy has not improved for 10 consecutive epochs.
earlystop = EarlyStopping(monitor='val_accuracy',
                          min_delta=0,
                          patience=10,
                          verbose=0,
                          mode='auto')
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])
history = model.fit(xs,
                    ys,
                    epochs=50,
                    verbose=1,
                    validation_split=0.15,
                    callbacks=[earlystop], # stop after no improvement in validation accuracy
                    )
# Final-epoch metrics; they are embedded in the saved model's path.
acc = round(history.history['accuracy'][-1],4)
vacc = round(history.history['val_accuracy'][-1],4)
# history.params['epochs'] is the requested budget (50) even when EarlyStopping
# ends training sooner, so count the epochs that were actually recorded.
epochs = len(history.history['accuracy'])

Save the model

In [None]:
# Write the trained model to Drive; the path records train/validation accuracy and the epoch count.
tf.keras.models.save_model(
    model=model,
    filepath=f'/content/drive/MyDrive/data/model_tacc-{acc}_vacc-{vacc}_epochs-{epochs}',
)

Sample code to load the model

In [31]:
# Reload the model from the path built from acc, vacc and epochs.
loaded_model = tf.keras.models.load_model(
    filepath=f'/content/drive/MyDrive/data/model_tacc-{acc}_vacc-{vacc}_epochs-{epochs}'
)

In [32]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
  """Plot one per-epoch metric recorded during training.

  history: object whose `.history` dict maps metric names to per-epoch values.
  string:  metric name to plot; also used as the y-axis label.
  """
  series = history.history[string]
  plt.plot(series)
  plt.ylabel(string)
  plt.xlabel("Epochs")
  plt.show()

Visualize the training and validation accuracy tradeoffs.

In [None]:
# One figure per metric: training accuracy first, then validation accuracy.
for metric in ('accuracy', 'val_accuracy'):
  plot_graphs(history, metric)


# Predictions

In [None]:
# these repeated import statements are unnecessary, just to show what one would need
# when running it in Flask
import pickle  # needed for the tokenizer below; it was missing from this list
import tensorflow as tf
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


seed_text = "dream cloud cheese" # edit this to generate new poems!
# The f prefix must sit outside the quotes: previously it was part of the string,
# so the path was the literal text "f/content/...{acc}..." and nothing was
# interpolated. The directory name matches the one written by the save cell.
saved_model_path = f'/content/drive/MyDrive/data/model_tacc-{acc}_vacc-{vacc}_epochs-{epochs}'
loaded_model = tf.keras.models.load_model(saved_model_path)
# NOTE: unpickling can execute arbitrary code, so only load tokenizer files you created yourself.
with open('/content/drive/MyDrive/data/tokenizer.pkl','rb') as tokenizer_file:
  tokenizer = pickle.load(tokenizer_file)
max_sequence_len=75 # make sure to edit this if it changes

def line_breaker(s):
    """Break string s into lines of mostly random length.

    The words of ``s`` are dealt out into consecutive lines, each holding a
    random number of words between 1 and half of the words still remaining
    (plus one). A line is additionally broken between two identical adjacent
    words. The first character of the result is upper-cased and every line,
    including the last, ends with a newline.

    Returns an empty string when ``s`` contains no words.
    """
    words = s.split()
    if not words:
        # Nothing to lay out; indexing the first character below would fail.
        return ""

    # Deal the words out into randomly sized lines.
    lines = []
    while words:
        take = random.randint(1, len(words) // 2 + 1)
        lines.append(words[:take])
        words = words[take:]

    # Break between repeated words. Comparing real words (rather than
    # space-separated chunks that may carry an embedded newline) also catches
    # a repeat whose second word is the last one on its line.
    broken = []
    for line in lines:
        current = [line[0]]
        for previous, word in zip(line, line[1:]):
            if previous == word:
                broken.append(' '.join(current))
                current = [word]
            else:
                current.append(word)
        broken.append(' '.join(current))

    text = '\n'.join(broken) + '\n'
    return text[0].upper() + text[1:]

def pred_poem(seed_text, next_words=30, incl_title=True):
    """Generate a poem by repeatedly appending the model's most likely next word.

    seed_text:  starting words; used as the poem's opening when incl_title is True.
    next_words: number of words to generate.
    incl_title: when False, the seed text is stripped from the returned poem.

    Relies on the module-level ``tokenizer``, ``loaded_model`` and
    ``max_sequence_len``. Returns the text laid out by ``line_breaker``.
    """
    og_seed = seed_text
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # verbose=0 keeps predict from printing a progress bar for every generated word.
        probabilities = loaded_model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])
        # Direct index -> word lookup replaces a scan over the whole vocabulary;
        # an index with no word (such as 0) contributes an empty string, as before.
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " " + output_word
    if not incl_title:
        # Drop the seed and the single space that follows it.
        seed_text = seed_text[len(og_seed)+1:]
    return line_breaker(seed_text)

# Generate a poem from the seed text and show it.
print(pred_poem(seed_text=seed_text))