<a href="https://colab.research.google.com/github/mr-alamdari/NLP-Text-Generation/blob/main/NLP_Text_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import spacy

In [2]:
def read_file(path, encoding='utf-8'):
  """Return the entire contents of the text file at ``path`` as one string.

  Args:
    path: Location of the file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding.

  Returns:
    The decoded file contents.
  """
  with open(path, encoding=encoding) as f:
    return f.read()

In [3]:
!wget https://raw.githubusercontent.com/mr-alamdari/NLP-Text-Generation/main/moby_dick_four_chapters.txt

--2022-05-02 04:47:50--  https://raw.githubusercontent.com/mr-alamdari/NLP-Text-Generation/main/moby_dick_four_chapters.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 62685 (61K) [text/plain]
Saving to: ‘moby_dick_four_chapters.txt’


2022-05-02 04:47:50 (15.7 MB/s) - ‘moby_dick_four_chapters.txt’ saved [62685/62685]



In [4]:
# Load the downloaded chapters into memory as a single string.
mobydic = read_file('moby_dick_four_chapters.txt')

In [5]:
# Load the English pipeline by its package name: the 'en' shortcut link was
# removed in spaCy v3, while 'en_core_web_sm' works on v2 and v3 alike (the
# package must be installed). Only tokenization is used in this notebook, so
# the parser, tagger and NER components are disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])

In [6]:
# Raise the maximum text length the pipeline will accept.
# NOTE(review): the downloaded file is only 62,685 bytes, so this value looks
# carried over from a larger corpus; confirm whether it is still needed.
nlp.max_length = 1198623

In [7]:
def seperate_punc(doc):
  """Tokenize ``doc`` with the global ``nlp`` pipeline and return lowercased tokens.

  A token is discarded when its text occurs as a substring of the filter
  string below (a substring test, not a per-character one). This drops
  punctuation marks, dashes and runs of whitespace/newlines.
  """
  unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n'
  kept = []
  for token in nlp(doc):
    if token.text in unwanted:
      continue
    kept.append(token.text.lower())
  return kept

In [8]:
# Tokenize the whole text: filtered tokens are dropped, the rest lowercased.
tokens = seperate_punc(mobydic)

In [9]:
# Peek at the first 20 tokens to sanity-check the tokenization.
tokens[:20]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and']

In [10]:
# Number of tokens that survived the filter.
len(tokens)

11338

In [11]:
train_len = 25 + 1 # 25 training words , then one target word

# Slide a window of train_len tokens over the corpus, one token at a time.
# The stop value is len(tokens) + 1 because range() excludes it: without the
# +1 the final window (the one ending on the last token) would be dropped.
# If the corpus is shorter than train_len the result is simply an empty list.
text_sequences = [tokens[i-train_len: i] for i in range(train_len, len(tokens) + 1)]

In [12]:
# First window (context words followed by the target word), shown as text.
' '.join(text_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [13]:
# Second window: the same text shifted forward by one token.
' '.join(text_sequences[1])

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [14]:
# Third window, shown as text.
' '.join(text_sequences[2])

'ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i'

In [15]:
# Fourth window, shown as text.
' '.join(text_sequences[3])

'some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore i thought'

In [16]:
import tensorflow as tf

In [17]:
# Keras tokenizer with default settings; it maps words to integer ids.
tokenizer = tf.keras.preprocessing.text.Tokenizer()

In [18]:
# Build the vocabulary from the token windows. The windows overlap, so a word
# is counted once for every window it appears in; word_counts is therefore much
# larger than the true corpus frequency of each word.
tokenizer.fit_on_texts(text_sequences)

In [19]:
# Replace every word in each window with its integer id.
sequences = tokenizer.texts_to_sequences(text_sequences)

In [20]:
# Convert to a NumPy array so the columns can be sliced into inputs/targets.
# Note: this rebinds `sequences` from a list of lists to an array.
sequences = np.array(sequences)

In [21]:
# Integer ids of the first window.
print(sequences[0])

[ 956   14  263   51  261  408   87  219  129  111  954  260   50   43
   38  315    7   23  546    3  150  259    6 2712   14   24]


In [22]:
# Integer ids of the second window (shifted by one token).
print(sequences[1])

[  14  263   51  261  408   87  219  129  111  954  260   50   43   38
  315    7   23  546    3  150  259    6 2712   14   24  957]


In [23]:
# Integer ids of the third window.
print(sequences[2])

[ 263   51  261  408   87  219  129  111  954  260   50   43   38  315
    7   23  546    3  150  259    6 2712   14   24  957    5]


In [24]:
# Integer ids of the fourth window.
print(sequences[3])

[  51  261  408   87  219  129  111  954  260   50   43   38  315    7
   23  546    3  150  259    6 2712   14   24  957    5   60]


In [25]:
# Show each id of the first window next to the word it stands for.
for word_id in sequences[0]:
  print(word_id, tokenizer.index_word[word_id])

956 call
14 me
263 ishmael
51 some
261 years
408 ago
87 never
219 mind
129 how
111 long
954 precisely
260 having
50 little
43 or
38 no
315 money
7 in
23 my
546 purse
3 and
150 nothing
259 particular
6 to
2712 interest
14 me
24 on


In [26]:
# Twenty most frequent words according to the tokenizer, highest count first.
ranked_counts = sorted(tokenizer.word_counts.items(), key=lambda item: -item[1])
for rank, word_and_count in enumerate(ranked_counts[:20]):
  print(rank, word_and_count)

0 ('the', 15540)
1 ('a', 10377)
2 ('and', 9646)
3 ('of', 8287)
4 ('i', 7150)
5 ('to', 6497)
6 ('in', 5647)
7 ('it', 4238)
8 ('that', 3770)
9 ('he', 3247)
10 ('his', 3139)
11 ('was', 2886)
12 ('but', 2652)
13 ('me', 2471)
14 ('with', 2392)
15 ('as', 2366)
16 ('at', 2184)
17 ('this', 2158)
18 ('you', 2158)
19 ('is', 1950)


In [27]:
# Number of distinct words the tokenizer has seen. The one-hot targets and the
# model below are built with vocab_size+1 classes.
vocab_size=len(tokenizer.word_counts)

In [28]:
# Display the vocabulary size.
vocab_size

2717

In [29]:
# Inputs: every column except the last one (the context word ids).
x = sequences[:, :-1]

In [30]:
# Targets: the last column (id of the word that follows the context).
y = sequences[:, -1]

In [31]:
# One-hot encode the targets over vocab_size+1 classes.
# NOTE: this overwrites `y`, so the cell is not idempotent. Re-run the previous
# cell first if this one has to be executed again.
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size+1)

In [32]:
# Context length fed to the model: the number of columns in x.
seq_len = x.shape[1]

In [33]:
def create_model(vocab_size, seq_len):
  """Build, compile and summarise the next-word prediction network.

  The stack is Embedding -> LSTM (returning sequences) -> LSTM -> Dense(relu)
  -> Dense(softmax). ``seq_len`` is used both as the input length and as the
  width of the embedding, LSTM and hidden dense layers.

  Args:
    vocab_size: Number of classes; size of the embedding table and of the
      softmax output layer.
    seq_len: Number of word ids in each input sequence.

  Returns:
    The compiled Keras model. Its summary is printed as a side effect.
  """
  layers = tf.keras.layers
  model = tf.keras.models.Sequential([
      layers.Embedding(vocab_size, seq_len, input_length=seq_len),
      layers.LSTM(seq_len, return_sequences=True),
      layers.LSTM(seq_len),
      layers.Dense(seq_len, activation='relu'),
      layers.Dense(vocab_size, activation='softmax'),
  ])
  model.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
  model.summary()
  return model

In [34]:
# vocab_size+1 classes, matching num_classes used for the one-hot targets.
model = create_model(vocab_size+1, seq_len)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 25)            67950     
                                                                 
 lstm (LSTM)                 (None, 25, 25)            5100      
                                                                 
 lstm_1 (LSTM)               (None, 25)                5100      
                                                                 
 dense (Dense)               (None, 25)                650       
                                                                 
 dense_1 (Dense)             (None, 2718)              70668     
                                                                 
Total params: 149,468
Trainable params: 149,468
Non-trainable params: 0
_________________________________________________________________


In [35]:
from pickle import dump, load

In [36]:
# First training run: 100 epochs in mini-batches of 32.
# verbose=2 prints one summary line per epoch.
model.fit(x, y, batch_size=32, epochs=100, verbose=2)

Epoch 1/100
354/354 - 12s - loss: 6.7936 - accuracy: 0.0474 - 12s/epoch - 35ms/step
Epoch 2/100
354/354 - 8s - loss: 6.2924 - accuracy: 0.0529 - 8s/epoch - 24ms/step
Epoch 3/100
354/354 - 8s - loss: 6.1731 - accuracy: 0.0529 - 8s/epoch - 24ms/step
Epoch 4/100
354/354 - 8s - loss: 6.0661 - accuracy: 0.0530 - 8s/epoch - 23ms/step
Epoch 5/100
354/354 - 9s - loss: 5.9521 - accuracy: 0.0582 - 9s/epoch - 24ms/step
Epoch 6/100
354/354 - 8s - loss: 5.8163 - accuracy: 0.0655 - 8s/epoch - 23ms/step
Epoch 7/100
354/354 - 8s - loss: 5.7093 - accuracy: 0.0659 - 8s/epoch - 24ms/step
Epoch 8/100
354/354 - 8s - loss: 5.6243 - accuracy: 0.0678 - 8s/epoch - 24ms/step
Epoch 9/100
354/354 - 8s - loss: 5.5544 - accuracy: 0.0690 - 8s/epoch - 24ms/step
Epoch 10/100
354/354 - 8s - loss: 5.4928 - accuracy: 0.0705 - 8s/epoch - 24ms/step
Epoch 11/100
354/354 - 8s - loss: 5.4256 - accuracy: 0.0703 - 8s/epoch - 24ms/step
Epoch 12/100
354/354 - 8s - loss: 5.3535 - accuracy: 0.0735 - 8s/epoch - 24ms/step
Epoch 13/10

<keras.callbacks.History at 0x7f3b106bd9d0>

In [42]:
# Save the trained model to disk so it can be reloaded without retraining.
model.save('mobydic_model.h5')

In [38]:
# Persist the fitted tokenizer next to the model. The context manager makes
# sure the file handle is flushed and closed even if pickling fails.
with open('simpleTokenizer', 'wb') as tokenizer_file:
  dump(tokenizer, tokenizer_file)

In [39]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
  """Greedily generate up to ``num_gen_words`` words that continue ``seed_text``.

  At every step the running text is encoded with ``tokenizer``, cut/padded at
  the front to ``seq_len`` ids, and passed to ``model``; the most probable word
  is appended to the running text and to the result.

  Args:
    model: Object with ``predict(batch, verbose=0)`` returning one row of
      class scores per input row.
    tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
      ``index_word`` id-to-word mapping.
    seq_len: Number of ids the model expects per input.
    seed_text: Space-separated text used as the initial context.
    num_gen_words: Maximum number of words to generate.

  Returns:
    The generated words joined by single spaces (the seed is not included).
    Fewer than ``num_gen_words`` words are returned only if no predicted
    index maps to a word.
  """
  output_text = []
  input_text = seed_text
  index_word = tokenizer.index_word
  for _ in range(num_gen_words):
    encoded_text = tokenizer.texts_to_sequences([input_text])[0]
    # Keep only the last seq_len ids; shorter inputs are left-padded with 0.
    pad_encoded = tf.keras.preprocessing.sequence.pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
    probs = model.predict(pad_encoded, verbose=0)[0]
    pred_word_ind = int(np.argmax(probs))
    if pred_word_ind not in index_word:
      # The top class can be an index with no word attached (e.g. the padding
      # index 0). Fall back to the most probable index that does map to a
      # word instead of raising KeyError.
      ranked = (int(i) for i in np.argsort(probs)[::-1])
      pred_word_ind = next((i for i in ranked if i in index_word), None)
      if pred_word_ind is None:
        break
    pred_word = index_word[pred_word_ind]
    input_text += ' ' + pred_word
    output_text.append(pred_word)
  return ' '.join(output_text)

In [40]:
# Generate 20 words from a randomly chosen training window used as the seed.
# The random index is not seeded, so the output differs from run to run.
num_gen_words = 20
n = np.random.randint(0, len(text_sequences))
generate_text(model, tokenizer, seq_len, ' '.join(text_sequences[n]), num_gen_words)

'alive to the comical predicament began or picked out to limping to the neighbors and at last now not do'

In [41]:
# Continue training the same model for another 100 epochs; the weights carry
# over from the previous fit call rather than being re-initialised.
model.fit(x, y, batch_size=32, epochs=100, verbose=2)

Epoch 1/100
354/354 - 8s - loss: 2.4358 - accuracy: 0.4231 - 8s/epoch - 24ms/step
Epoch 2/100
354/354 - 8s - loss: 2.4075 - accuracy: 0.4249 - 8s/epoch - 24ms/step
Epoch 3/100
354/354 - 8s - loss: 2.3861 - accuracy: 0.4316 - 8s/epoch - 24ms/step
Epoch 4/100
354/354 - 8s - loss: 2.3434 - accuracy: 0.4422 - 8s/epoch - 24ms/step
Epoch 5/100
354/354 - 8s - loss: 2.3309 - accuracy: 0.4417 - 8s/epoch - 24ms/step
Epoch 6/100
354/354 - 8s - loss: 2.3072 - accuracy: 0.4516 - 8s/epoch - 24ms/step
Epoch 7/100
354/354 - 8s - loss: 2.3001 - accuracy: 0.4500 - 8s/epoch - 24ms/step
Epoch 8/100
354/354 - 8s - loss: 2.2970 - accuracy: 0.4460 - 8s/epoch - 24ms/step
Epoch 9/100
354/354 - 8s - loss: 2.2842 - accuracy: 0.4538 - 8s/epoch - 24ms/step
Epoch 10/100
354/354 - 8s - loss: 2.2797 - accuracy: 0.4533 - 8s/epoch - 24ms/step
Epoch 11/100
354/354 - 8s - loss: 2.2579 - accuracy: 0.4556 - 8s/epoch - 24ms/step
Epoch 12/100
354/354 - 8s - loss: 2.2407 - accuracy: 0.4646 - 8s/epoch - 24ms/step
Epoch 13/100


<keras.callbacks.History at 0x7f3b07beb890>

In [44]:
# Generate 40 words from a new random seed window to compare with the
# earlier sample after the additional training.
num_gen_words = 40
n = np.random.randint(0, len(text_sequences))
generate_text(model, tokenizer, seq_len, ' '.join(text_sequences[n]), num_gen_words)

'other things at the same head do the before the most leviathan orchard will lead like my eyes who almost money would properly flukes that was a sort of glass bought up a newfoundland dog just and queequeg was found'

In [45]:
# Third training run: another 100 epochs on top of the current weights.
model.fit(x, y, batch_size=32, epochs=100, verbose=2)

Epoch 1/100
354/354 - 9s - loss: 1.3127 - accuracy: 0.6707 - 9s/epoch - 25ms/step
Epoch 2/100
354/354 - 9s - loss: 1.2780 - accuracy: 0.6813 - 9s/epoch - 24ms/step
Epoch 3/100
354/354 - 9s - loss: 1.2495 - accuracy: 0.6890 - 9s/epoch - 24ms/step
Epoch 4/100
354/354 - 8s - loss: 1.2574 - accuracy: 0.6873 - 8s/epoch - 24ms/step
Epoch 5/100
354/354 - 9s - loss: 1.2514 - accuracy: 0.6861 - 9s/epoch - 24ms/step
Epoch 6/100
354/354 - 9s - loss: 1.2328 - accuracy: 0.6909 - 9s/epoch - 24ms/step
Epoch 7/100
354/354 - 8s - loss: 1.2229 - accuracy: 0.6917 - 8s/epoch - 24ms/step
Epoch 8/100
354/354 - 9s - loss: 1.2226 - accuracy: 0.6921 - 9s/epoch - 24ms/step
Epoch 9/100
354/354 - 9s - loss: 1.2343 - accuracy: 0.6914 - 9s/epoch - 24ms/step
Epoch 10/100
354/354 - 9s - loss: 1.2036 - accuracy: 0.7026 - 9s/epoch - 24ms/step
Epoch 11/100
354/354 - 9s - loss: 1.2078 - accuracy: 0.6954 - 9s/epoch - 24ms/step
Epoch 12/100
354/354 - 9s - loss: 1.2246 - accuracy: 0.6925 - 9s/epoch - 25ms/step
Epoch 13/100


<keras.callbacks.History at 0x7f3b07b22050>

In [46]:
# Overwrite the saved model with the latest weights, then sample 40 words
# from a random seed window.
model.save('mobydic_model.h5')
num_gen_words = 40
n = np.random.randint(0, len(text_sequences))
generate_text(model, tokenizer, seq_len, ' '.join(text_sequences[n]), num_gen_words)

"altogether unwarranted but what most puzzled and confounded you was a fine tar family in the a act of impaling himself at the second mystery 's the harpoon when i ashes all with myself i lay perfectly other and think"

In [47]:
# Fourth training run: another 100 epochs on top of the current weights.
# There is no validation split, so the reported accuracy is on training data.
model.fit(x, y, batch_size=32, epochs=100, verbose=2)

Epoch 1/100
354/354 - 9s - loss: 0.7334 - accuracy: 0.8170 - 9s/epoch - 26ms/step
Epoch 2/100
354/354 - 9s - loss: 0.7685 - accuracy: 0.8062 - 9s/epoch - 26ms/step
Epoch 3/100
354/354 - 9s - loss: 0.7148 - accuracy: 0.8165 - 9s/epoch - 25ms/step
Epoch 4/100
354/354 - 9s - loss: 0.6765 - accuracy: 0.8288 - 9s/epoch - 25ms/step
Epoch 5/100
354/354 - 9s - loss: 0.6535 - accuracy: 0.8378 - 9s/epoch - 25ms/step
Epoch 6/100
354/354 - 9s - loss: 0.6855 - accuracy: 0.8276 - 9s/epoch - 25ms/step
Epoch 7/100
354/354 - 9s - loss: 0.6966 - accuracy: 0.8198 - 9s/epoch - 25ms/step
Epoch 8/100
354/354 - 9s - loss: 0.6760 - accuracy: 0.8313 - 9s/epoch - 25ms/step
Epoch 9/100
354/354 - 9s - loss: 0.6917 - accuracy: 0.8277 - 9s/epoch - 25ms/step
Epoch 10/100
354/354 - 9s - loss: 0.6907 - accuracy: 0.8227 - 9s/epoch - 25ms/step
Epoch 11/100
354/354 - 9s - loss: 0.6699 - accuracy: 0.8266 - 9s/epoch - 25ms/step
Epoch 12/100
354/354 - 9s - loss: 0.6602 - accuracy: 0.8344 - 9s/epoch - 25ms/step
Epoch 13/100


<keras.callbacks.History at 0x7f3b0c1d5350>

In [49]:
# Overwrite the saved model with the final weights, then sample 80 words
# from a random seed window.
model.save('mobydic_model.h5')
num_gen_words = 80
n = np.random.randint(0, len(text_sequences))
generate_text(model, tokenizer, seq_len, ' '.join(text_sequences[n]), num_gen_words)

"a mind was concerned he sell to him i'm sleep crush himself on the corn cob of in scores but jump at unlacing the light monday sleep sick queequeg horn or sadly abreast out in a bit of mirror there of making getting presently as the forlorn room i then placed they for curious an guttural place with so bitter all up well that the terrible persians however the stranger and the third building i lay off what then there"