# Text Generation with Python and Keras
    Goal :
    - Process Text
    - Clean Text
    - Tokenize the Text and create Sequences with Keras

In [2]:
def read_file(filepath):
    with open(filepath) as f:
      str_text = f.read()
        
    return str_text

In [3]:
# read_file('moby_dick_four_chapters.txt')

In [4]:
import spacy

In [5]:
nlp = spacy.load('en_core_web_sm',disable=['parser','tagger','ner'])

In [6]:
nlp.max_length = 1198623

In [7]:
def separate_punc(doc_text):
    exclude = {'#', '$', '\n', '.', ',', '!', '?', ':', ';', '(', ')', '[', ']', '{', '}', '"', "'", '-', '_', '/', '\\', '|', '@', '%', '^', '&', '*', '~', '`', '+'}
    return [token.text.lower() for token in nlp(doc_text) if token.text not in exclude]

In [8]:
d = read_file('moby_dick_four_chapters.txt')

In [9]:
tokens = separate_punc(d)



In [10]:
len(tokens)

11851

In [11]:
train_len = 25 + 1
text_sequences = []
for i in range(train_len,len(tokens)):
    seq= tokens[i-train_len:i]
    text_sequences.append(seq)

In [12]:
type(text_sequences)

list

In [13]:
# text_sequences[1]
' '.join(text_sequences[0])

'call me ishmael   some years ago -- never mind how long precisely -- having little or no money in my purse and nothing particular to'

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [15]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [16]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [17]:
# sequences[1]

In [18]:
# tokenizer.index_word

In [19]:
 # tokenizer.word_counts

In [20]:
vocabulary_size = len(tokenizer.word_counts)

In [21]:
vocabulary_size

2721

In [22]:
import numpy as np

In [23]:
sequences = np.array(sequences)

In [24]:
sequences

array([[ 959,   16,  265, ...,  152,  261,    7],
       [  16,  265,    4, ...,  261,    7,  960],
       [ 265,    4,   54, ...,    7,  960,   16],
       ...,
       [ 955,   13,  168, ...,  264,   56,    2],
       [  13,  168, 2716, ...,   56,    2, 2721],
       [ 168, 2716,    3, ...,    2, 2721,   29]])

### From this point, we are going to work on the next steps :
    - Create the LSTM-Based Model
    - Split the Data into Features and Labels
        - X Features (First n words of Sequence)
        - Y label (next word after the sequence)
    - Fit the Model

In [26]:
from tensorflow.keras.utils import to_categorical

In [27]:
X = sequences[:,:-1]

In [28]:
y = sequences[:,-1]

In [29]:
y =  to_categorical(y,num_classes=vocabulary_size+1)

In [30]:
seq_len = X.shape[1]

In [31]:
X.shape

(11825, 25)

In [32]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [33]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
   
    model.add(Embedding(input_dim=vocabulary_size, output_dim=100))  # Removed input_length
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50, activation='relu'))
    model.build(input_shape=(None, seq_len))  # Optional
    model.add(Dense(vocabulary_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()
  
    return model

In [34]:
# Correctly aligned call
model = create_model(vocabulary_size + 1, seq_len)



In [35]:
from pickle import dump,load

In [36]:
model.fit(X,y,batch_size=128,epochs=2,verbose=1)

Epoch 1/2
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 39ms/step - accuracy: 0.0231 - loss: 7.3814
Epoch 2/2
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 44ms/step - accuracy: 0.0495 - loss: 6.2970


<keras.src.callbacks.history.History at 0x21e0b7e95e0>

In [37]:
model.save('my_model.keras')

In [38]:
dump(tokenizer,open('my_simpletokenizer','wb'))

In [39]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [40]:
def generate_text(model,tokenizer,seq_len,seed_text,num_gen_words):
    output_text = []
    input_text =seed_text
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        pred_word_ind = model.predict(pad_encoded, verbose=0).argmax(axis=-1)[0]
        pred_word = tokenizer.index_word[pred_word_ind]
        input_text +=' '+pred_word
        output_text.append(pred_word)
    return ' '.join(output_text)

In [41]:
text_sequences[0]

['call',
 'me',
 'ishmael',
 ' ',
 'some',
 'years',
 'ago',
 '--',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 '--',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to']

In [44]:
import random
random.seed(101)
random_pick = random.randint(0, len(text_sequences) - 1)

In [46]:
random_seed_text = text_sequences[random_pick]

In [48]:
random_seed_text

['so',
 'long',
 'been',
 'bound',
 '\n\n',
 'but',
 'the',
 'interval',
 'i',
 'spent',
 'in',
 'deliberating',
 'what',
 'to',
 'say',
 'was',
 'a',
 'fatal',
 'one',
 ' ',
 'taking',
 'up',
 'his',
 'tomahawk',
 'from',
 'the']

In [50]:
seed_text = ' '.join(random_seed_text).replace('\n', '')

In [52]:
seed_text

'so long been bound  but the interval i spent in deliberating what to say was a fatal one   taking up his tomahawk from the'

In [54]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

'the the the the the the the the the the the the the the the the the the the the the the the the the'