# TEXT GENERATION

Import the required libraries

In [1]:
import numpy as np
import nltk
from nltk.corpus import genesis #Used to load the corpus
from nltk.tokenize import word_tokenize #
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import random
import string

Download the corpus

In [2]:
nltk.download('genesis')

[nltk_data] Downloading package genesis to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package genesis is already up-to-date!


True

Pre-processsing on the corpus

In [3]:
# Load the genesis corpus
genesis_raw = genesis.raw('english-kjv.txt')
print(genesis_raw)

In the beginning God created the heaven and the earth.
And the earth was without form, and void; and darkness was
upon the face of the deep. And the Spirit of God moved upon
the face of the waters.
And God said, Let there be light: and there was light.
And God saw the light, that it was good: and God divided the
light from the darkness.
And God called the light Day, and the darkness he called
Night. And the evening and the morning were the first day.
And God said, Let there be a firmament in the midst of the
waters, and let it divide the waters from the waters.
And God made the firmament, and divided the waters which were
under the firmament from the waters which were above the
firmame and it was so.
And God called the firmament Heaven. And the evening and the
morning were the second day.
And God said, Let the waters under the heaven be gathered
together unto one place, and let the dry land appe and it
was so.
And God called the dry land Earth; and the gathering together
of the waters 

In [4]:
with open("genesis.txt", "w") as output:
    output.write(str(genesis_raw))

In [5]:
print(len(genesis_raw))

195515


In [6]:
#Convert all the words in the corpus to lowercase (Preprocessing- Step1)
genesis_text = genesis_raw.lower()
print(genesis_text[:1000])

in the beginning god created the heaven and the earth.
and the earth was without form, and void; and darkness was
upon the face of the deep. and the spirit of god moved upon
the face of the waters.
and god said, let there be light: and there was light.
and god saw the light, that it was good: and god divided the
light from the darkness.
and god called the light day, and the darkness he called
night. and the evening and the morning were the first day.
and god said, let there be a firmament in the midst of the
waters, and let it divide the waters from the waters.
and god made the firmament, and divided the waters which were
under the firmament from the waters which were above the
firmame and it was so.
and god called the firmament heaven. and the evening and the
morning were the second day.
and god said, let the waters under the heaven be gathered
together unto one place, and let the dry land appe and it
was so.
and god called the dry land earth; and the gathering together
of the waters 

In [7]:
#Removing Punctuation marks from the corpus (Preprocessing- step 2)
text = ''.join(c for c in genesis_text if c not in '.,;:?!')
print(text)

in the beginning god created the heaven and the earth
and the earth was without form and void and darkness was
upon the face of the deep and the spirit of god moved upon
the face of the waters
and god said let there be light and there was light
and god saw the light that it was good and god divided the
light from the darkness
and god called the light day and the darkness he called
night and the evening and the morning were the first day
and god said let there be a firmament in the midst of the
waters and let it divide the waters from the waters
and god made the firmament and divided the waters which were
under the firmament from the waters which were above the
firmame and it was so
and god called the firmament heaven and the evening and the
morning were the second day
and god said let the waters under the heaven be gathered
together unto one place and let the dry land appe and it
was so
and god called the dry land earth and the gathering together
of the waters called he se and god saw 

In [8]:
with open("genesis_preprocessed.txt", "w") as output:
    output.write(str(text))

In [9]:
print(len(text))

189522


In [10]:
#Tokenize the text into words (Preprocessing- Step 3)
genesis_words = word_tokenize(text)
print(genesis_words)

['in', 'the', 'beginning', 'god', 'created', 'the', 'heaven', 'and', 'the', 'earth', 'and', 'the', 'earth', 'was', 'without', 'form', 'and', 'void', 'and', 'darkness', 'was', 'upon', 'the', 'face', 'of', 'the', 'deep', 'and', 'the', 'spirit', 'of', 'god', 'moved', 'upon', 'the', 'face', 'of', 'the', 'waters', 'and', 'god', 'said', 'let', 'there', 'be', 'light', 'and', 'there', 'was', 'light', 'and', 'god', 'saw', 'the', 'light', 'that', 'it', 'was', 'good', 'and', 'god', 'divided', 'the', 'light', 'from', 'the', 'darkness', 'and', 'god', 'called', 'the', 'light', 'day', 'and', 'the', 'darkness', 'he', 'called', 'night', 'and', 'the', 'evening', 'and', 'the', 'morning', 'were', 'the', 'first', 'day', 'and', 'god', 'said', 'let', 'there', 'be', 'a', 'firmament', 'in', 'the', 'midst', 'of', 'the', 'waters', 'and', 'let', 'it', 'divide', 'the', 'waters', 'from', 'the', 'waters', 'and', 'god', 'made', 'the', 'firmament', 'and', 'divided', 'the', 'waters', 'which', 'were', 'under', 'the', 'f

Creating the Markov Model for text-generation

In [11]:
# Define the order of the Markov model (1 = unigram, 2 = bigram, etc.)
order = 2

# Create the transition matrix for the Markov model
# A row of the transition matrix contains all the words with their frequencies which follows a particular word.
def create_transition_matrix(order, corpus):
    transition_matrix = {}
    for i in range(len(corpus) - order):
        context = tuple(corpus[i:i+order])
        next_word = corpus[i+order]
        if context in transition_matrix:
            transition_matrix[context][next_word] = transition_matrix[context].get(next_word, 0) + 1
        else:
            transition_matrix[context] = {next_word: 1}
    return transition_matrix

transition_matrix = create_transition_matrix(order, genesis_words)

with open("transition_matrix.txt", "w") as output:
    output.write(str(transition_matrix))

Creating the LSTM model for text-generation

In [12]:
# Create the LSTM model
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text]) #Assigns a token number for every word in the corpus 
corpus_seq = tokenizer.texts_to_sequences([text])[0] #Represents the corpus in form of token numbers

In [13]:
tokenizer.word_index

{'and': 1,
 'the': 2,
 'of': 3,
 'he': 4,
 'his': 5,
 'to': 6,
 'in': 7,
 'unto': 8,
 'that': 9,
 'i': 10,
 'said': 11,
 'him': 12,
 'a': 13,
 'my': 14,
 'for': 15,
 'was': 16,
 'it': 17,
 'with': 18,
 'thou': 19,
 'me': 20,
 'is': 21,
 'thy': 22,
 'shall': 23,
 'be': 24,
 'thee': 25,
 'they': 26,
 'all': 27,
 'them': 28,
 'god': 29,
 'not': 30,
 'lord': 31,
 'which': 32,
 'will': 33,
 'land': 34,
 'came': 35,
 'her': 36,
 'she': 37,
 'father': 38,
 'jacob': 39,
 'were': 40,
 'from': 41,
 'their': 42,
 'this': 43,
 'upon': 44,
 'son': 45,
 'had': 46,
 'joseph': 47,
 'sons': 48,
 'have': 49,
 'up': 50,
 'there': 51,
 'are': 52,
 'when': 53,
 'behold': 54,
 'abraham': 55,
 'as': 56,
 'earth': 57,
 'after': 58,
 'went': 59,
 'out': 60,
 'we': 61,
 'man': 62,
 'but': 63,
 'these': 64,
 'years': 65,
 'wife': 66,
 'name': 67,
 'called': 68,
 'ye': 69,
 'let': 70,
 'us': 71,
 'every': 72,
 'before': 73,
 'now': 74,
 'into': 75,
 'by': 76,
 'also': 77,
 'you': 78,
 'hand': 79,
 'pass': 80,
 'h

In [14]:
print(corpus_seq)

[7, 2, 657, 29, 376, 2, 186, 1, 2, 57, 1, 2, 57, 16, 437, 1531, 1, 1532, 1, 761, 16, 44, 2, 132, 3, 2, 591, 1, 2, 658, 3, 29, 1122, 44, 2, 132, 3, 2, 170, 1, 29, 11, 70, 51, 24, 377, 1, 51, 16, 377, 1, 29, 114, 2, 377, 9, 17, 16, 141, 1, 29, 406, 2, 377, 41, 2, 761, 1, 29, 68, 2, 377, 96, 1, 2, 761, 4, 68, 188, 1, 2, 407, 1, 2, 235, 40, 2, 278, 96, 1, 29, 11, 70, 51, 24, 13, 479, 7, 2, 592, 3, 2, 170, 1, 70, 17, 659, 2, 170, 41, 2, 170, 1, 29, 97, 2, 479, 1, 406, 2, 170, 32, 40, 264, 2, 479, 41, 2, 170, 32, 40, 378, 2, 1533, 1, 17, 16, 94, 1, 29, 68, 2, 479, 186, 1, 2, 407, 1, 2, 235, 40, 2, 306, 96, 1, 29, 11, 70, 2, 170, 264, 2, 186, 24, 297, 279, 8, 99, 135, 1, 70, 2, 762, 34, 1534, 1, 17, 16, 94, 1, 29, 68, 2, 762, 34, 57, 1, 2, 1123, 279, 3, 2, 170, 68, 4, 763, 1, 29, 114, 9, 17, 16, 141, 1, 29, 11, 70, 2, 57, 127, 146, 1124, 2, 535, 660, 112, 1, 2, 438, 230, 660, 438, 58, 5, 327, 242, 112, 21, 7, 1125, 44, 2, 439, 1, 17, 16, 94, 1, 2, 57, 109, 146, 1124, 1, 535, 660, 112, 58, 5, 

In [15]:
seq_length = 20
sequences = []

#Divides the corpus into sequences of length 20

for i in range(seq_length, len(corpus_seq)):
    seq = corpus_seq[i-seq_length:i+1]
    sequences.append(seq)
    
sequences = np.array(sequences)

In [16]:
print(sequences)

[[   7    2  657 ...    1  761   16]
 [   2  657   29 ...  761   16   44]
 [ 657   29  376 ...   16   44    2]
 ...
 [  94   47  169 ...    7   13 2680]
 [  47  169  516 ...   13 2680    7]
 [ 169  516   89 ... 2680    7   93]]


In [17]:
print(len(sequences))

38220


In [18]:
x, y = sequences[:,:-1], sequences[:,-1]
vocab_size = len(tokenizer.word_index) + 1 #returns the no of unique words in the corpus
print(vocab_size)
y = to_categorical(y, num_classes=vocab_size)

2681


In [19]:
print(x)

[[   7    2  657 ... 1532    1  761]
 [   2  657   29 ...    1  761   16]
 [ 657   29  376 ...  761   16   44]
 ...
 [  94   47  169 ...  149    7   13]
 [  47  169  516 ...    7   13 2680]
 [ 169  516   89 ...   13 2680    7]]


In [20]:
print(y)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [21]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [22]:
# Define a function to generate text using the LSTM model
def generate_text_lstm(model, tokenizer, seed_text, num_words):
    for _ in range(num_words):
        encoded = tokenizer.texts_to_sequences([seed_text])[0] #Encode the starting words to sequence
        print(encoded)
        
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre') 
        pred=model.predict(encoded) 
        y_pred=np.argmax(pred,axis=1) #Predict the next word and add it to the sequence
        print(y_pred)
        
        predicted_word = ''
        for word, index in tokenizer.word_index.items():
            if index == y_pred:
                predicted_word = word
                break
        seed_text += ' ' + predicted_word
    return seed_text

Generating the text

In [28]:
# Choose a random starting context
context = random.choice(list(transition_matrix.keys()))
print(len(context))
print(context)

2
('her', 'days')


In [29]:
# Define a function to generate text from the HMM
def generate_text(transition_matrix, order, length, context):
    # Generate the text
    text = list(context)
    print(text)
    for i in range(length):
        if context in transition_matrix:
            next_word = max(transition_matrix[context], key=transition_matrix[context].get)
            text.append(next_word)
            context = tuple(text[-order:])
    return ' '.join(text)

In [30]:
# Generate a sentence of 20 words each
sentence = generate_text(transition_matrix, order, 20, context)
print(sentence)

['her', 'days']
her days to be with you and ye shall not be numbered for multitude and the lord god of my father and


In [31]:
# Generate text using the LSTM model
seed_text = ''
for i in range(0, len(context)):
    seed_text = seed_text + context[i] + ' '

print(seed_text)
lstm_text = generate_text_lstm(model, tokenizer, seed_text, num_words=20)

her days 
[36, 110]
[700]
[36, 110, 700]
[700]
[36, 110, 700, 700]
[700]
[36, 110, 700, 700, 700]
[1786]
[36, 110, 700, 700, 700, 1786]
[1786]
[36, 110, 700, 700, 700, 1786, 1786]
[1934]
[36, 110, 700, 700, 700, 1786, 1786, 1934]
[1934]
[36, 110, 700, 700, 700, 1786, 1786, 1934, 1934]
[647]
[36, 110, 700, 700, 700, 1786, 1786, 1934, 1934, 647]
[647]
[36, 110, 700, 700, 700, 1786, 1786, 1934, 1934, 647, 647]
[647]
[36, 110, 700, 700, 700, 1786, 1786, 1934, 1934, 647, 647, 647]
[647]
[36, 110, 700, 700, 700, 1786, 1786, 1934, 1934, 647, 647, 647, 647]
[2571]
[36, 110, 700, 700, 700, 1786, 1786, 1934, 1934, 647, 647, 647, 647, 2571]
[2571]
[36, 110, 700, 700, 700, 1786, 1786, 1934, 1934, 647, 647, 647, 647, 2571, 2571]
[647]
[36, 110, 700, 700, 700, 1786, 1786, 1934, 1934, 647, 647, 647, 647, 2571, 2571, 647]
[1708]
[36, 110, 700, 700, 700, 1786, 1786, 1934, 1934, 647, 647, 647, 647, 2571, 2571, 647, 1708]
[1708]
[36, 110, 700, 700, 700, 1786, 1786, 1934, 1934, 647, 647, 647, 647, 2571, 2

In [32]:
print("LSTM Text:\n", lstm_text)

LSTM Text:
 her days  lands lands lands slimepits slimepits lovest lovest dinah dinah dinah dinah activity activity dinah pathrusim pathrusim pathrusim pathrusim pathrusim pathrusim
