In [0]:
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.preprocessing.text import Tokenizer
import spacy
import numpy as np
import sys
import io

import warnings
warnings.filterwarnings('ignore')

In [3]:
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4MB)
[K     |████████████████████████████████| 96.4MB 693kB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-cp36-none-any.whl size=98051305 sha256=e1808f1f37ab5067753177a70f54faa5c6de4db3557437bd5e794707530d3746
  Stored in directory: /tmp/pip-ephem-wheel-cache-p47b2eu8/wheels/df/94/ad/f5cf59224cea6b5686ac4fd1ad19c8a07bc026e13c36502d81
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [0]:
import en_core_web_md
nlp = en_core_web_md.load()


In [0]:
nlp.max_length = 16582681

In [7]:
# glove upload
from google.colab import files
uploaded = files.upload()

Saving glove.6B.50d.txt to glove.6B.50d.txt


In [8]:
# text file upload
from google.colab import files
uploaded = files.upload()

Saving text.txt to text.txt


In [9]:
embeddings = {}
f = open('glove.6B.50d.txt', encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings[word] = coefs
f.close()

print(f'Found {len(embeddings)} word vectors.')

Found 400000 word vectors.


In [10]:
embeddings['sad'] 

array([ 0.18822 ,  0.52772 , -0.80729 , -0.18974 ,  0.73361 , -0.52599 ,
        0.73379 ,  1.151   , -1.0057  ,  0.53222 , -0.53503 , -0.46232 ,
       -0.35761 , -0.089558,  1.1745  ,  0.025105, -0.26076 ,  0.57176 ,
        0.51661 ,  0.39261 , -1.2262  ,  0.96739 ,  0.14591 ,  0.67439 ,
        1.0324  , -0.9346  , -1.8862  ,  1.2702  ,  1.0383  , -0.093612,
        1.7631  ,  0.13482 ,  0.6586  ,  0.017446, -0.23751 ,  0.080928,
        0.40966 , -0.56527 ,  0.43035 , -0.30735 , -0.6366  ,  0.042546,
       -0.23112 , -0.46408 , -0.04127 ,  0.86248 , -0.31139 ,  0.37836 ,
        0.037122,  0.74944 ], dtype=float32)

In [0]:
with open('text.txt', 'r') as f:
  text = f.read()

In [0]:
nlp.add_pipe(nlp.create_pipe('sentencizer'))
doc = nlp(text, disable=['ner', 'parser'])

In [0]:
sentences = [sent.string.strip() for sent in doc.sents]

In [14]:
sentences[0:20]

['Transcribed from the 1915 Martin Secker edition by David Price, email ccx074@pglaf.org                            [Picture: Book cover]                                    THE LESSON OF                                 THE MASTER                                 BY HENRY JAMES                        [Picture: Decorative graphic]                                  * * * * *                            LONDON: MARTIN SECKER                      NUMBER FIVE JOHN STREET ADELPHI                                  * * * * *                       This edition first printed 1915                                  * * * * *     I   HE had been told the ladies were at church, but this was corrected by what he saw from the top of the steps—they descended from a great height in two arms, with a circular sweep of the most charming effect—at the threshold of the door which, from the long bright gallery, overlooked the immense lawn.',
 'Three gentlemen, on the grass, at a distance, sat under the great trees,

In [0]:
maxlen = 20
step = 1
max_num_word = 10000

samples_sentences = []
next_word = []

In [16]:
# tokenize and transform each sentence into a sequence of ints
tokenizer = Tokenizer(num_words=max_num_word)
tokenizer.fit_on_texts(sentences)
list_tokenized_train = tokenizer.texts_to_sequences(sentences)

if len(tokenizer.word_index) < max_num_word:
    max_num_word = len(tokenizer.word_index)
    
print('Number of words:', max_num_word)

Number of words: 10000


In [17]:
len(tokenizer.word_index)

45332

In [18]:
len(sentences)

150009

In [19]:
len(list_tokenized_train)

150009

In [0]:
token_word = []
for line in range(0, len(sentences)):
  that_sentences = list_tokenized_train[line]
  for i in range(0, len(that_sentences)):
    token_word.append(that_sentences[i])

In [21]:
len(token_word)

2890983

In [22]:
for i in range(0, len(token_word) - maxlen, step):
  samples_sentences.append(token_word[i:i + maxlen])
  next_word.append(token_word[i + maxlen])
print('Number of sentences:', len(samples_sentences))

Number of sentences: 2890963


In [0]:
subset = samples_sentences[:100000]

In [0]:
# normalize sequences
x = np.asarray(subset).astype('float32')/max_num_word


In [0]:
# one hot encode y
y = np.zeros((len(subset), max_num_word), dtype=np.bool)

for i in range(0, len(subset)):
  for j in range(0, maxlen):
    y[i, next_word[j]] = 1

Building Neural Net

In [26]:
model = Sequential()

model.add(layers.Embedding(max_num_word, 200, input_length=maxlen))
model.add(layers.LSTM(256))
model.add(layers.Dense(max_num_word, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 200)           2000000   
_________________________________________________________________
lstm (LSTM)                  (None, 256)               467968    
_________________________________________________________________
dense (Dense)                (None, 10000)             2570000   
Total params: 5,037,968
Trainable params: 5,037,968
Non-trainable params: 0
_________________________________________________________________


In [27]:
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.fit(x, y, batch_size=256, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x7f2eefecd780>

In [0]:
# setting custom temperature for getting a prediction distribution of 
# the next word
def sample(preds, temperature=0.1):
  preds = np.asarray(preds).astype('float64')
  exp_preds = preds - np.exp(temperature)
  preds = exp_preds / np.sum(exp_preds)
  probas = np.random.multinomial(1, preds, 1)
  return np.argmax(probas)



In [0]:
# this will map ints back to words
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))

In [0]:
np.random.seed(1234)
start_index = np.random.randint(0, len(token_word) - maxlen - 1)
generated_seed = token_word[start_index: start_index + maxlen]

In [35]:
generated_text = ' '.join([reverse_word_map.get(i) for i in generated_seed])
print('~~~ Generating with seed ~~~')
print(generated_text)
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

for i in range(40):

    array_seed = np.zeros((maxlen,1))
    array_seed[:,0] = np.asarray(generated_seed).astype('float32')/max_num_word
    
    preds = model.predict(array_seed.transpose(), verbose=0)[0]
    next_index = sample(preds)
    next_word = reverse_word_map.get(next_index)

    generated_seed.append(next_index)       
    generated_seed = generated_seed[1:]
    generated_text = generated_text + ' ' + next_word

print('~~~ Generated text ~~~')
print(generated_text)
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~')




~~~ Generating with seed ~~~
adds idiotic joining wounded amount peasant remarkable possesses dim shape objective system à continues moaned mature piazza mystifying beaumont rescued
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~ Generated text ~~~
adds idiotic joining wounded amount peasant remarkable possesses dim shape objective system à continues moaned mature piazza mystifying beaumont rescued there's rusty reminder fires coated cited presumptuous constancy away arrayed smoked ingenious grimly gorged aided anywhere prattle affectionately more states fully harry aversion visitors research subjection shirt massed isabel's immeasurably basis wasn't newton laying incoherent plain physiognomy offend deplored good
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
