# Text Generation

In this example we will train a model using datasets from Kaggle.

# Imports

In [3]:

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np 

In [4]:
print(tf.__version__)


NameError: ignored

In [None]:
!pip install tensorflow==1.14

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow==1.14
  Downloading tensorflow-1.14.0-cp37-cp37m-manylinux1_x86_64.whl (109.3 MB)
[K     |████████████████████████████████| 109.3 MB 1.1 MB/s 
Collecting keras-applications>=1.0.6
  Downloading Keras_Applications-1.0.8-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 8.5 MB/s 
Collecting tensorboard<1.15.0,>=1.14.0
  Downloading tensorboard-1.14.0-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 67.0 MB/s 
Collecting tensorflow-estimator<1.15.0rc0,>=1.14.0rc0
  Downloading tensorflow_estimator-1.14.0-py2.py3-none-any.whl (488 kB)
[K     |████████████████████████████████| 488 kB 72.1 MB/s 
Installing collected packages: tensorflow-estimator, tensorboard, keras-applications, tensorflow
  Attempting uninstall: tensorflow-estimator
    Found existing installation: tensorflow-estimator 2.9.0
    Uninstalling tensorflow

In [None]:
import tensorflow as tf


In [None]:
print(tf.__version__)

# Datasets

Kaggle provides a multitude of usefull datasets that we can use for our models.

Some examples are:

[Modern Renaissance Poetry](https://www.kaggle.com/ultrajack/modern-renaissance-poetry): Containing a dataset of poetry from Renaissance.

[Poe Short Stories](https://www.kaggle.com/leangab/poe-short-stories-corpuscsv): Containing short stories from Edgar Allan Poe.

[Song Lyrics](https://www.kaggle.com/paultimothymooney/poetry): With various TXT files with lyrics from song of various authors.



### Data used in this example

In this example we will use [lyrics from lady gaga from the Song Lyrics Kaggle dataset](https://www.kaggle.com/paultimothymooney/poetry?select=lady-gaga.txt) that was just mentioned.

In [None]:
tokenizer = Tokenizer()

data = open('alicia-keys.txt').read()

corpus = data.lower().split("\n")

tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

print(tokenizer.word_index)
print(total_words)


In [None]:
print(total_words)

# Create Training Data

This will split the input text into input sequences. It does it by breaking it into n-grams.


In [None]:
input_sequences = []
for line in corpus:
	token_list = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(token_list)):
		n_gram_sequence = token_list[:i+1]
		input_sequences.append(n_gram_sequence)

# pad sequences 
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# create predictors and label
xs, labels = input_sequences[:,:-1],input_sequences[:,-1]

# created categorigal on-hot encoding labels
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [None]:
print(tokenizer.word_index)

In [None]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False
)


# Training the Model

In [None]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
layer = tf.keras.layers.Dropout(.2, input_shape=(2,))
model.add(Dense(total_words, activation='softmax'))
adam = Adam(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
#earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')
history = model.fit(xs, ys, epochs=40, verbose=1)
#print model.summary()
print(model)



In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.show()

In [None]:
plot_graphs(history, 'accuracy')


# Predict next words

Now the following function uses the trained model to predict the following word taking as seed the previous ones.

In [None]:
seed_text = "I want "
next_words = 50
  
for _ in range(next_words):
	token_list = tokenizer.texts_to_sequences([seed_text])[0]
	token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
	predicted = model.predict_classes(token_list, verbose=0)
	
	output_word = ""
	for word, index in tokenizer.word_index.items():
		if index == predicted:
			output_word = word
			break
	seed_text += " " + output_word

print(seed_text)

# Conclusion

Did you get a nice result from this training dataset? Why?
Take a look inside the dataset and try to see its format, and maybe infer the result meaning.
Would it work better with a dataset from a more comple writer (one that don't normally repeat words and beats inside its lyrics)?
Try it out with other datasets!