In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
def read_files(filepath):
    with open(filepath) as f:
        str_txt = f.read()
        
    return str_txt

In [3]:
read_files('../input/moby_dick_four_chapters.txt')

'Call me Ishmael.  Some years ago--never mind how long\nprecisely--having little or no money in my purse, and nothing\nparticular to interest me on shore, I thought I would sail about a\nlittle and see the watery part of the world.  It is a way I have of\ndriving off the spleen and regulating the circulation.  Whenever I\nfind myself growing grim about the mouth; whenever it is a damp,\ndrizzly November in my soul; whenever I find myself involuntarily\npausing before coffin warehouses, and bringing up the rear of every\nfuneral I meet; and especially whenever my hypos get such an upper\nhand of me, that it requires a strong moral principle to prevent me\nfrom deliberately stepping into the street, and methodically knocking\npeople\'s hats off--then, I account it high time to get to sea as soon\nas I can.  This is my substitute for pistol and ball.  With a\nphilosophical flourish Cato throws himself upon his sword; I quietly\ntake to the ship.  There is nothing surprising in this.  If t

In [4]:
import spacy
nlp = spacy.load('en', disable=['parser','tagger','ner'])
nlp.max_length = 1198623

In [5]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\ \n \n\\n \\n\\n\\n!\"-#$%&()--.*+,-/:;<=>?@[\\\\]^_`{|}~\\t\\n ']

In [6]:
d = read_files('../input/moby_dick_four_chapters.txt')
tokens = separate_punc(d)
len(tokens)

11445

25 words ---> network predict #26

In [7]:
train_len = 25+1
text_sequences = []

for i in range(train_len,len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)

In [8]:
' '.join(text_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

In [9]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [11]:
for i in sequences[0]:
    print(f"{i} : {tokenizer.index_word[i]}")

958 : call
15 : me
264 : ishmael
52 : some
262 : years
409 : ago
88 : never
220 : mind
130 : how
112 : long
956 : precisely
261 : having
51 : little
44 : or
39 : no
316 : money
7 : in
24 : my
547 : purse
3 : and
151 : nothing
260 : particular
6 : to
2714 : interest
15 : me
25 : on


In [12]:
tokenizer.word_counts

OrderedDict([('call', 27),
             ('me', 2471),
             ('ishmael', 133),
             ('some', 758),
             ('years', 135),
             ('ago', 84),
             ('never', 449),
             ('mind', 164),
             ('how', 321),
             ('long', 374),
             ('precisely', 37),
             ('having', 142),
             ('little', 767),
             ('or', 950),
             ('no', 1003),
             ('money', 120),
             ('in', 5647),
             ('my', 1786),
             ('purse', 71),
             ('and', 9646),
             ('nothing', 281),
             ('particular', 152),
             ('to', 6497),
             ('interest', 24),
             ('on', 1716),
             ('shore', 26),
             ('i', 7150),
             ('thought', 676),
             ('would', 702),
             ('sail', 104),
             ('about', 1014),
             ('a', 10377),
             ('see', 416),
             ('the', 15540),
             ('watery', 26),
  

In [13]:
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

2719

In [14]:
type(sequences)

list

In [15]:
import numpy as np

In [16]:
sequences = np.array(sequences)
sequences

array([[ 958,   15,  264, ..., 2714,   15,   25],
       [  15,  264,   52, ...,   15,   25,  959],
       [ 264,   52,  262, ...,   25,  959,    5],
       ...,
       [ 954,   12,  167, ...,  263,   54,    2],
       [  12,  167, 2713, ...,   54,    2, 2719],
       [ 167, 2713,    3, ...,    2, 2719,   27]])

In [17]:
from keras.utils import to_categorical

In [18]:
X = sequences[:,:-1]
y = sequences[:,-1]
y

array([  25,  959,    5, ...,    2, 2719,   27])

In [19]:
y = to_categorical(y,num_classes=vocabulary_size+1)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [20]:
seq_len = X.shape[1]

In [21]:
seq_len

25

In [22]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

In [23]:
def create_model(vocabulary_size,seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, seq_len, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(vocabulary_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    model.summary()
    
    return model

In [24]:
model = create_model(vocabulary_size+1, seq_len)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 25, 25)            68000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 25, 150)           105600    
_________________________________________________________________
lstm_2 (LSTM)                (None, 150)               180600    
_________________________________________________________________
dense_1 (Dense)              (None, 2720)              410720    
Total params: 764,920
Trainable params: 764,920
Non-trainable params: 0
_________________________________________________________________


In [25]:
from pickle import dump, load

In [26]:
model.fit(X, y, batch_size=150, epochs=200, verbose=1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200

In [27]:
model.save('my_mobydick_model.h5')
dump(tokenizer, open('my_simpletokenizer','wb'))

In [28]:
from keras.preprocessing.sequence import pad_sequences

In [29]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    output_text = [] 
    input_text = seed_text
    
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        pre_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        pred_word = tokenizer.index_word[pre_word_ind]
        input_text += ' '+pred_word
        output_text.append(pred_word)
        
    return ' '.join(output_text) 

In [30]:
text_sequences[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [31]:
import random
random.seed(101)
random_pick = random.randint(0, len(text_sequences))

In [32]:
random_seed_text = text_sequences[random_pick]

In [33]:
random_seed_text

['queequeg',
 'look',
 'here',
 'you',
 'sabbee',
 'me',
 'i',
 'sabbee',
 'you',
 'this',
 'man',
 'sleepe',
 'you',
 'you',
 'sabbee',
 '\n\n',
 'me',
 'sabbee',
 'plenty"--grunted',
 'queequeg',
 'puffing',
 'away',
 'at',
 'his',
 'pipe',
 'and']

In [34]:
seed_text = ' '.join(random_seed_text)

In [35]:
seed_text

'queequeg look here you sabbee me i sabbee you this man sleepe you you sabbee \n\n me sabbee plenty"--grunted queequeg puffing away at his pipe and'

In [36]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

'the the the the the the the the the the the the the the the the the the the the the the the the the'