In [1]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
    return str_text

### tokenize and clean text

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm',disable=['parser','tagger','ner'])

In [3]:
nlp.max_length= 1198623

In [4]:
def seprate_fun(t):
    return [token.text.lower() for token in nlp(t) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [5]:
doc = read_file('UPDATED_NLP_COURSE/06-Deep-Learning/moby_dick_four_chapters.txt')

In [6]:
tokens = seprate_fun(doc)

In [7]:
len(tokens)

11338

### Sequence of token

In [8]:
train_len = 25 + 1  # 25 words for training and last word for prediction
    
text_seq = []

for i in range(train_len , len(tokens)):
    
    seq = tokens[i-train_len : i]
    
    text_seq.append(seq)
    

In [9]:
text_seq[0]  # first sequence

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

### Keras Tokenization

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [11]:
# encode token into number of array list
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(text_seq)
sequence = tokenizer.texts_to_sequences(text_seq)

In [12]:
sequence[0]

[956,
 14,
 263,
 51,
 261,
 408,
 87,
 219,
 129,
 111,
 954,
 260,
 50,
 43,
 38,
 315,
 7,
 23,
 546,
 3,
 150,
 259,
 6,
 2712,
 14,
 24]

In [14]:
for i in sequence[0]:
    print(f"{i}:{tokenizer.index_word[i]}") # same sequence converted to encode numbers...

956:call
14:me
263:ishmael
51:some
261:years
408:ago
87:never
219:mind
129:how
111:long
954:precisely
260:having
50:little
43:or
38:no
315:money
7:in
23:my
546:purse
3:and
150:nothing
259:particular
6:to
2712:interest
14:me
24:on


In [15]:
len(sequence)

11312

In [None]:
tokenizer.word_counts # how many times each word occur in given text file

In [17]:
vocab_size = len(tokenizer.word_counts)

In [18]:
vocab_size

2717

### Convert list to numpy matrix

In [19]:
import numpy as np

In [20]:
sequence_array = np.array(sequence)

In [21]:
sequence_array[0] # first sequence in nd.array format

array([ 956,   14,  263,   51,  261,  408,   87,  219,  129,  111,  954,
        260,   50,   43,   38,  315,    7,   23,  546,    3,  150,  259,
          6, 2712,   14,   24])

In [22]:
seq_len = len(sequence_array)

In [23]:
sequence_array.shape

(11312, 26)

### LSTM

In [24]:
from keras.models import Sequential
from keras.layers import Dense , LSTM , Embedding

In [25]:
def create_model(vocab_size , seq_len):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size ,output_dim = 25 ,input_length = seq_len))
    model.add(LSTM(128 , return_sequences = True))
    model.add(LSTM(128 , return_sequences = False))
    model.add(Dense(256 , activation = 'relu'))
    model.add(Dense(vocab_size , activation='softmax'))
    
    model.compile(loss='categorical_crossentropy' , optimizer='adam' , metrics=['accuracy'])
    
    model.summary()
    
    return model

### Train Test Split

In [26]:
from keras.utils import to_categorical


In [33]:
sequence_array

array([[ 956,   14,  263, ..., 2712,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2711, ...,   53,    2, 2717],
       [ 166, 2711,    3, ...,    2, 2717,   26]])

In [34]:
sequence_array[:, :-1] # except last column

array([[ 956,   14,  263, ...,    6, 2712,   14],
       [  14,  263,   51, ..., 2712,   14,   24],
       [ 263,   51,  261, ...,   14,   24,  957],
       ...,
       [ 952,   12,  166, ...,   11,  262,   53],
       [  12,  166, 2711, ...,  262,   53,    2],
       [ 166, 2711,    3, ...,   53,    2, 2717]])

In [35]:
sequence_array[: , -1:] # last column which we have to predict

array([[  24],
       [ 957],
       [   5],
       ...,
       [   2],
       [2717],
       [  26]])

In [36]:
X = sequence_array[: , :-1]
y = sequence_array[: , -1:]

In [37]:
y = to_categorical(y , num_classes=vocab_size + 1) # we add 1 because for unknown word which index is zero

In [38]:
seq_len = X.shape[1]

### Fit the Model

In [39]:
model = create_model(vocab_size + 1 , seq_len=seq_len)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            67950     
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 128)           78848     
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_2 (Dense)              (None, 2718)              698526    
Total params: 1,009,932
Trainable params: 1,009,932
Non-trainable params: 0
_________________________________________________________________


In [40]:
model.fit(X , y ,batch_size= 128,epochs = 250,verbose=1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250


Epoch 77/250
Epoch 78/250
Epoch 79/250
Epoch 80/250
Epoch 81/250
Epoch 82/250
Epoch 83/250
Epoch 84/250
Epoch 85/250
Epoch 86/250
Epoch 87/250
Epoch 88/250
Epoch 89/250
Epoch 90/250
Epoch 91/250
Epoch 92/250
Epoch 93/250
Epoch 94/250
Epoch 95/250
Epoch 96/250
Epoch 97/250
Epoch 98/250
Epoch 99/250
Epoch 100/250
Epoch 101/250
Epoch 102/250
Epoch 103/250
Epoch 104/250
Epoch 105/250
Epoch 106/250
Epoch 107/250
Epoch 108/250
Epoch 109/250
Epoch 110/250
Epoch 111/250
Epoch 112/250
Epoch 113/250
Epoch 114/250
Epoch 115/250
Epoch 116/250
Epoch 117/250
Epoch 118/250
Epoch 119/250
Epoch 120/250
Epoch 121/250
Epoch 122/250
Epoch 123/250
Epoch 124/250
Epoch 125/250
Epoch 126/250
Epoch 127/250
Epoch 128/250
Epoch 129/250
Epoch 130/250
Epoch 131/250
Epoch 132/250
Epoch 133/250
Epoch 134/250
Epoch 135/250
Epoch 136/250
Epoch 137/250
Epoch 138/250
Epoch 139/250
Epoch 140/250
Epoch 141/250
Epoch 142/250
Epoch 143/250
Epoch 144/250
Epoch 145/250
Epoch 146/250
Epoch 147/250
Epoch 148/250
Epoch 149/250
E

Epoch 151/250
Epoch 152/250
Epoch 153/250
Epoch 154/250
Epoch 155/250
Epoch 156/250
Epoch 157/250
Epoch 158/250
Epoch 159/250
Epoch 160/250
Epoch 161/250
Epoch 162/250
Epoch 163/250
Epoch 164/250
Epoch 165/250
Epoch 166/250
Epoch 167/250
Epoch 168/250
Epoch 169/250
Epoch 170/250
Epoch 171/250
Epoch 172/250
Epoch 173/250
Epoch 174/250
Epoch 175/250
Epoch 176/250
Epoch 177/250
Epoch 178/250
Epoch 179/250
Epoch 180/250
Epoch 181/250
Epoch 182/250
Epoch 183/250
Epoch 184/250
Epoch 185/250
Epoch 186/250
Epoch 187/250
Epoch 188/250
Epoch 189/250
Epoch 190/250
Epoch 191/250
Epoch 192/250
Epoch 193/250
Epoch 194/250
Epoch 195/250
Epoch 196/250
Epoch 197/250
Epoch 198/250
Epoch 199/250
Epoch 200/250
Epoch 201/250
Epoch 202/250
Epoch 203/250
Epoch 204/250
Epoch 205/250
Epoch 206/250
Epoch 207/250
Epoch 208/250
Epoch 209/250
Epoch 210/250
Epoch 211/250
Epoch 212/250
Epoch 213/250
Epoch 214/250
Epoch 215/250
Epoch 216/250
Epoch 217/250
Epoch 218/250
Epoch 219/250
Epoch 220/250
Epoch 221/250
Epoch 

Epoch 225/250
Epoch 226/250
Epoch 227/250
Epoch 228/250
Epoch 229/250
Epoch 230/250
Epoch 231/250
Epoch 232/250
Epoch 233/250
Epoch 234/250
Epoch 235/250
Epoch 236/250
Epoch 237/250
Epoch 238/250
Epoch 239/250
Epoch 240/250
Epoch 241/250
Epoch 242/250
Epoch 243/250
Epoch 244/250
Epoch 245/250
Epoch 246/250
Epoch 247/250
Epoch 248/250
Epoch 249/250
Epoch 250/250


<keras.callbacks.callbacks.History at 0x22230db2dc8>

In [None]:
from pickle import dump,load

### Genrating Text

In [41]:
from random import randint
from keras.preprocessing.sequence import pad_sequences

In [42]:
def genrate_text(tokenizer,seed_text , seq_len , model , num_gen_words):
    
    output_text = []
    input_text = seed_text
    
    for i in range(num_gen_words):
        
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        pad_encoded = pad_sequences([encoded_text] , maxlen=seq_len , truncating='pre')
        
        pred_word_ind = model.predict_classes(pad_encoded , verbose = 0)[0]
        
        pred_word = tokenizer.index_word[pred_word_ind]
        
        input_text += ' '+pred_word
        
        output_text.append(pred_word)
        
    return ' '.join(output_text)

In [43]:
len(text_seq)

11312

In [44]:
import random

In [63]:
random.seed()
random_pick = random.randint(0 , len(text_seq))
seed_text = ' '.join(text_seq[random_pick])

In [64]:
seed_text

"again and seeing no possible chance of spending a sufferable night unless in some other person 's bed i began to think that after all i"

In [65]:
genrate_text(tokenizer=tokenizer , seed_text=seed_text , seq_len = seq_len , model=model ,num_gen_words=27)

'might be cherishing unwarrantable prejudices against this unknown harpooneer thinks i the bar room when knowing all it was a very dubious looking nay a very dark'