In [7]:
import random
import numpy as np 
import pandas as pd
from nltk.tokenize import RegexpTokenizer

In [8]:
import tensorflow
from tensorflow import keras
from keras.optimizers import RMSprop
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Activation

2023-04-28 13:52:29.570113: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# load the entire source text into memory
with open("/Users/jacobhuckleberry/Desktop/Code/Text Generation/data/gospel_of_buddha.txt", "r") as file:
    raw_text = file.read()

# split on whitespace to get the individual words (verse numbers still included)
text_list = raw_text.split()
print(text_list[-40:])
print(len(text_list))

# keep only the entries that are not purely digits, i.e. drop the verse numbers
only_text_list = [word for word in text_list if not word.isdigit()]

# sanity check: same tail as above minus the verse numbers, and a smaller count
print(only_text_list[-40:])
print(len(only_text_list))

# rebuild one continuous string with a single space between words
buddha_text_joined = " ".join(only_text_list)


['is', 'one', 'and', 'the', 'same', 'at', 'all', 'times', 'and', 'in', 'every', 'place.', '29', 'Truth', 'teaches', 'us', 'the', 'noble', 'eightfold', 'path', 'of', 'righteousness,', 'and', 'it', 'is', 'a', 'straight', 'path', 'easily', 'found', 'by', 'the', 'truth-loving.', 'Happy', 'are', 'those', 'who', 'walk', 'in', 'it.']
68744
['truth', 'is', 'one', 'and', 'the', 'same', 'at', 'all', 'times', 'and', 'in', 'every', 'place.', 'Truth', 'teaches', 'us', 'the', 'noble', 'eightfold', 'path', 'of', 'righteousness,', 'and', 'it', 'is', 'a', 'straight', 'path', 'easily', 'found', 'by', 'the', 'truth-loving.', 'Happy', 'are', 'those', 'who', 'walk', 'in', 'it.']
67193


<b>Building Training Data</b>

In [5]:
# create a smaller subset of the text: the first 34,000 characters of the joined string
# NOTE(review): partial_text is not referenced by the later cells visible here (they use the
# full buddha_text_joined) — confirm whether it is still needed
partial_text = buddha_text_joined[:34000]

In [10]:
"""
instantiate tokenizer
r for raw String
\w matchs alpanumeric characters
+ quantifier states the preceding character "\w" must match one or more occurences
""" 
tokenizer = RegexpTokenizer(r"\w+")
tokens = tokenizer.tokenize(buddha_text_joined.lower())

In [11]:
# distinct tokens of the corpus (np.unique returns them sorted)
unique_tokens = np.unique(tokens)
# lookup table token -> position of that token inside unique_tokens
unique_tokens_index = dict(zip(unique_tokens, range(len(unique_tokens))))

In [12]:
# number of preceding words the model gets as context for predicting the next word
n_words = 10

# slide a window over the token list: every sample is n_words consecutive tokens ...
window_starts = range(len(tokens) - n_words)
input_words = [tokens[start : start + n_words] for start in window_starts]
# ... and its label is the token that comes right after that window
next_words = [tokens[start + n_words] for start in window_starts]

In [13]:
"""
numpy arrays (matrices) are filled with zeros with the shape of the lists and specified data type
Utilizes to triain the model with binary data
X has 3D array with boolean data type
y has 2D array with boolean data type
"""
# initializing the structure or dimensions of the X and y training data
X = np.zeros((len(input_words), n_words, len(unique_tokens)), dtype=bool)
y = np.zeros((len(next_words), len(unique_tokens)), dtype=bool)

In [14]:
# one-hot encode the samples: the entry of each word becomes True, all others stay False
for sample_idx, (context, target) in enumerate(zip(input_words, next_words)):
    # one time step per word of the context window
    for step, token in enumerate(context):
        X[sample_idx, step, unique_tokens_index[token]] = True
    # the label row marks the word that follows the window
    y[sample_idx, unique_tokens_index[target]] = True


<b>Building Model</b>

In [15]:
""" 
start with Long-Short-Term Memory model as recurent neural network (RNN)
contains 128 neurons 
has a 3D shape -- timesteps (input-sequence) n_words and features (# of dimensions) len(unique_tokens)
return_sequences parameter is true since there is another LSTM layer next within RNN
"""
model = Sequential()
model.add(LSTM(128, input_shape=(n_words, len(unique_tokens)), return_sequences=True))

# next LSTM layer with 128 neurons
model.add(LSTM(128))

# next Dense fully-connected layer with len(unique_tokens) amount of neurons
model.add(Dense(len(unique_tokens)))

# last Activation layer is softmax to convert output to probability distribution 
model.add(Activation("softmax"))

2023-04-28 13:52:49.971439: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-28 13:52:49.973407: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-28 13:52:49.974880: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [16]:
# RMSprop optimizer with a learning rate of 0.01
rmsprop = tensorflow.keras.optimizers.RMSprop(learning_rate=0.01)

# model configuration: categorical cross-entropy as the loss for multi-class classification,
# accuracy as the metric evaluated during training
model.compile(loss="categorical_crossentropy", optimizer=rmsprop, metrics=["accuracy"])

# Train with X as input and y as target: batches of 128 samples, 30 epochs,
# shuffling the samples within each epoch.
# Left as the last expression of the cell so the returned History object is displayed.
model.fit(X, y, batch_size=128, epochs=30, shuffle=True)

Epoch 1/30


2023-04-28 13:52:50.710211: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-28 13:52:50.712076: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-28 13:52:50.713972: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f8d910cbfa0>

In [17]:
# write the trained model to "text_gen.h5" (relative path, so it lands in the current working directory)
model.save("text_gen.h5")

In [18]:
# reload the model from "text_gen.h5"; this rebinds the global `model` that predict_next_words calls
model = load_model("text_gen.h5")

2023-04-28 14:42:22.862312: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-28 14:42:22.864388: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-28 14:42:22.865896: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [19]:
def predict_next_words(input_text, n_best):
    """Return the vocabulary indices of the n_best highest-scoring next words.

    Args:
        input_text: Context string. It is lower-cased and split with the same
            ``tokenizer`` that produced the training tokens, so punctuation is
            handled exactly as in the training data. If it contains more than
            ``n_words`` tokens, only the last ``n_words`` are used.
        n_best: Number of candidate indices to return (expected to be >= 1).
            Values larger than the vocabulary are capped at the vocabulary size.

    Returns:
        1D numpy array of indices into ``unique_tokens``. It holds the n_best
        highest-scoring entries, but they are NOT ordered by probability
        (``np.argpartition`` only partitions, it does not sort).

    Raises:
        KeyError: if a token of ``input_text`` is not in ``unique_tokens_index``.
    """
    # same preprocessing as the training data; keep only the most recent n_words tokens so the
    # encoding below can never index past the time-step axis
    context = tokenizer.tokenize(input_text.lower())[-n_words:]

    # 3D array of zeros: 1 sample x n_words time steps x one slot per unique token
    # (named `encoded` so it does not shadow the global training array X)
    encoded = np.zeros((1, n_words, len(unique_tokens)))

    # one-hot encoding: set the slot of each context word to 1; shorter contexts leave the
    # remaining time steps all-zero
    for i, word in enumerate(context):
        encoded[0, i, unique_tokens_index[word]] = 1

    # model.predict returns one row per sample; take the row of our single sample
    predictions = model.predict(encoded)[0]
    # argpartition raises if asked for more elements than exist, so cap the request
    n_best = min(n_best, len(predictions))
    # partition so that the n_best largest scores end up in the last n_best positions
    return np.argpartition(predictions, -n_best)[-n_best:]

In [20]:
# request 5 candidate next words for a 10-word prompt; the result holds vocabulary indices, not words
possible = predict_next_words("Consciousness at its heart is a part of the living", 5)
# bare last expression so the notebook displays the returned array
possible

2023-04-28 14:42:23.704544: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-04-28 14:42:23.706184: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-04-28 14:42:23.707859: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



array([2809, 1769, 3645, 1151,  500])

In [21]:
# translate the predicted vocabulary indices back into their words
candidate_words = [unique_tokens[token_id] for token_id in possible]
print(candidate_words)

['kinds', 'evils', 'passion', 'creatures', 'beings']


In [22]:
def generate_text(input_text, text_length, creativity=3):
    """Extend input_text by text_length generated words.

    At every step the most recent ``n_words`` tokens of the text so far are
    passed to ``predict_next_words`` and the next word is picked at random from
    the ``creativity`` candidates it returns.

    Args:
        input_text: Prompt to continue.
        text_length: Number of words to generate.
        creativity: Number of candidates to choose from at each step; with 1
            the single returned candidate is always taken.

    Returns:
        The whitespace-split words of ``input_text`` followed by the generated
        words, joined with single spaces.
    """
    word_sequence = input_text.split()
    # Tokenize the prompt once instead of re-tokenizing the whole text on every step: each
    # generated word is a single lower-case vocabulary token, so the list can just be extended.
    context_tokens = tokenizer.tokenize(input_text.lower())
    for _ in range(text_length):
        # always condition on the latest n_words tokens, whatever the prompt length was
        sub_sequence = " ".join(context_tokens[-n_words:])
        try:
            candidates = predict_next_words(sub_sequence, creativity)
            choice = unique_tokens[random.choice(candidates)]
        except (KeyError, IndexError):
            # the context could not be encoded (e.g. a word outside the vocabulary):
            # fall back to a random vocabulary word and actually use it
            choice = random.choice(unique_tokens)
        word_sequence.append(choice)
        context_tokens.append(choice)
    return " ".join(word_sequence)

In [23]:
# generate 75 words after the prompt; creativity=1 asks for a single candidate per step, so the
# random choice inside generate_text has only one option to pick from
generate_text("Consciousness at its heart is a part of the living", 75, 1)



'Consciousness at its heart is a part of the living creatures that you have proved how control if they are not and blessed can be able to live in nature and day hence then two monkeys but the object of the most extremes after it happened that the king observed that of all the sand thinking of mind and the city were to be admitted to the buddha where the blessed one was there she had seen his see the buddha and they approached him'