Reference: https://gist.github.com/maxim5/c35ef2238ae708ccb0e55624e9e0252b

In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import print_function

__author__ = 'maxim'

import numpy as np
import gensim
import string

from keras.callbacks import LambdaCallback
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.utils.data_utils import get_file

print('\nFetching the text...')
url = 'https://raw.githubusercontent.com/maxim5/stanford-tensorflow-tutorials/master/data/arxiv_abstracts.txt'
# `path` is whatever get_file hands back; it is opened as a local text file below.
path = get_file('arxiv_abstracts.txt', origin=url)

print('\nPreparing the sentences...')
# Upper bound on the number of tokens kept per line of the corpus.
max_sentence_len = 40
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale default encoding (e.g. cp1252 on Windows).
with open(path, encoding='utf-8') as file_:
    docs = file_.readlines()

# Translation table that deletes every character listed in string.punctuation.
translator = str.maketrans('', '', string.punctuation)

# One token list per input line: lowercase, strip punctuation, split on
# whitespace, then keep at most max_sentence_len tokens.
sentences = [doc.lower().translate(translator).split()[:max_sentence_len] for doc in docs]
print('Num sentences:', len(sentences))



Fetching the text...

Preparing the sentences...
Num sentences: 7200


In [4]:
# Peek at the first tokenised sentence: lowercased, punctuation stripped,
# and cut to at most max_sentence_len tokens.
sentences[0]

['in',
 'science',
 'and',
 'engineering',
 'intelligent',
 'processing',
 'of',
 'complex',
 'signals',
 'such',
 'as',
 'images',
 'sound',
 'or',
 'language',
 'is',
 'often',
 'performed',
 'by',
 'a',
 'parameterized',
 'hierarchy',
 'of',
 'nonlinear',
 'processing',
 'layers',
 'sometimes',
 'biologically',
 'inspired',
 'hierarchical',
 'systems',
 'or',
 'more',
 'generally',
 'nested',
 'systems',
 'offer',
 'a',
 'way',
 'to']

In [5]:

print('\nTraining word2vec...')
# NOTE(review): `size=` / `iter=` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size=` / `epochs=` — adjust if the environment is upgraded.
word_model = gensim.models.Word2Vec(sentences, size=100, min_count=1, window=5, iter=100)
# `wv.vectors` is the non-deprecated accessor for the embedding matrix
# (`wv.syn0` emits a DeprecationWarning).
pretrained_weights = word_model.wv.vectors
# The misspelt name `emdedding_size` is read by the model-building cell, so it is kept as is.
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)
print('Checking similar words:')
for word in ['model', 'network', 'train', 'learn']:
    # Query through `.wv` (calling most_similar on the model itself is deprecated)
    # and ask for exactly the 8 neighbours that get printed.
    neighbours = word_model.wv.most_similar(word, topn=8)
    most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in neighbours)
    print('  %s -> %s' % (word, most_similar))



Training word2vec...
Result embedding shape: (1166, 100)
Checking similar words:
  model -> comprise (0.37), via (0.31), lp (0.30), contain (0.30), subclass (0.30), connected (0.28), extend (0.28), context (0.28)
  network -> networks (0.35), given (0.30), constrained (0.27), trained (0.25), lies (0.24), near (0.22), algorithm (0.22), represent (0.21)
  train -> based (0.40), eigendecompositions (0.33), average (0.30), derive (0.28), then (0.28), performing (0.27), improvement (0.27), construct (0.27)
  learn -> automatically (0.36), relevant (0.35), realize (0.35), units (0.34), consistency (0.34), upper (0.33), respect (0.33), lower (0.32)


  after removing the cwd from sys.path.
  if __name__ == '__main__':


In [20]:
# `sentence` is the loop variable left behind by the "Preparing the data for LSTM"
# cell, so after that loop it holds the last entry of `sentences`. On a fresh
# kernel this cell only works once that cell has been run.
sentence

['we',
 'introduce',
 'a',
 'new',
 'representation',
 'learning',
 'algorithm',
 'suited',
 'to',
 'the',
 'context',
 'of',
 'domain',
 'adaptation',
 'in',
 'which',
 'data',
 'at',
 'training',
 'and',
 'test',
 'time',
 'come',
 'from',
 'similar',
 'but',
 'different',
 'distributions',
 'our',
 'algorithm',
 'is',
 'directly',
 'inspired',
 'by',
 'theory',
 'on',
 'domain',
 'adaptation',
 'suggesting',
 'that']

In [19]:
# Everything except the final token: this is the slice the data-preparation loop
# encodes into train_x, while the final token becomes the train_y target.
sentence[:-1]

['we',
 'introduce',
 'a',
 'new',
 'representation',
 'learning',
 'algorithm',
 'suited',
 'to',
 'the',
 'context',
 'of',
 'domain',
 'adaptation',
 'in',
 'which',
 'data',
 'at',
 'training',
 'and',
 'test',
 'time',
 'come',
 'from',
 'similar',
 'but',
 'different',
 'distributions',
 'our',
 'algorithm',
 'is',
 'directly',
 'inspired',
 'by',
 'theory',
 'on',
 'domain',
 'adaptation',
 'suggesting']

In [21]:
len(sentences)

7200

In [6]:
def word2idx(word):
    """Return the integer index the trained word2vec model assigns to `word`.

    The word is looked up in `word_model.wv.vocab`; a word that is not present
    there makes the lookup fail, and that error is propagated unchanged.
    """
    vocab_entry = word_model.wv.vocab[word]
    return vocab_entry.index
def idx2word(idx):
    """Return the word stored at position `idx` of `word_model.wv.index2word`."""
    index_to_word = word_model.wv.index2word
    return index_to_word[idx]

print('\nPreparing the data for LSTM...')
num_sentences = len(sentences)
# Both arrays start out as zeros; any slot not written below keeps that 0.
train_x = np.zeros([num_sentences, max_sentence_len], dtype=np.int32)
train_y = np.zeros([num_sentences], dtype=np.int32)
# The loop variable is deliberately called `sentence`: other cells of the
# notebook inspect it after this loop has run.
for row, sentence in enumerate(sentences):
    # Inputs are all tokens but the last; the last token is the prediction target.
    context, target = sentence[:-1], sentence[-1]
    for col, token in enumerate(context):
        train_x[row, col] = word2idx(token)
    train_y[row] = word2idx(target)
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)



Preparing the data for LSTM...
train_x shape: (7200, 40)
train_y shape: (7200,)


In [12]:
type(train_x)

numpy.ndarray

In [13]:
max_sentence_len

40

In [17]:
len(train_x[0])

40

In [18]:
len(train_x[1])

40

In [9]:
# First encoded row. Slots the preparation loop never writes keep the initial 0
# from np.zeros; since only sentence[:-1] is written, the last position stays 0.
train_x[0]

array([  4, 275,   5, 476, 477, 122,   1, 123, 144,  29,  19, 145, 478,
        39, 276,  12, 124, 277,  18,   2, 479, 188,   1,  93, 122,  44,
       480, 481, 146, 189, 190,  39,  62, 482, 278, 190, 483,   2, 279,
         0], dtype=int32)

In [14]:
train_x[1]

array([191, 484, 192, 280, 147,  10, 485,  20, 193,   4,  34,  11,   4,
        45, 486,   0,  63,   1, 487,   5, 488, 489,  64,  30,  76,  12,
       281,   3,  25, 194,  10, 490, 280, 147, 125, 195,  94,  25, 194,
         0], dtype=int32)

In [15]:
train_x[2]

array([491, 492,  40, 493,  10,  35, 194, 494,   1, 196, 495,   3,  35,
       197,  52, 124, 282, 283,   4,  95, 284, 148,   0,  41,  12, 496,
         3, 497, 198, 498,   0,  96,  77,   3, 499,  21,  53, 199, 500,
         0], dtype=int32)

In [16]:
train_x[3]

array([  9,  65,  54, 502,  55,   5, 503,  64, 504, 505,  30, 285,  42,
        10,  97, 285,   5, 506,  42,  17,  46, 507, 200, 508,  10, 149,
        37, 150,  66, 509, 151,  64,  30,  76, 201, 125,  66, 202, 286,
         0], dtype=int32)

In [11]:
print('\nTraining LSTM...')
# Layer stack, in order: embedding initialised from the word2vec matrix,
# one LSTM layer, a dense projection to vocabulary size, then softmax.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights]),
    LSTM(units=emdedding_size),
    Dense(units=vocab_size),
    Activation('softmax'),
])
# Targets are integer word indices, hence the sparse variant of the loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

def sample(preds, temperature=1.0):
    """Pick one index from the probability vector `preds`.

    Args:
        preds: 1-D sequence of probabilities (one entry per candidate index).
        temperature: Re-weighting factor. Values <= 0 disable sampling and
            return the arg-max; small positive values sharpen the distribution,
            larger values flatten it.

    Returns:
        The selected index (a NumPy integer).
    """
    if temperature <= 0:
        return np.argmax(preds)
    probs = np.asarray(preds).astype('float64')
    # Zero probabilities legitimately map to -inf here; silence the warning.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature
    # Shift so the largest logit is 0. This leaves the normalised result
    # unchanged but keeps exp() from underflowing to all zeros (-> 0/0 = NaN,
    # which multinomial rejects) when the temperature is very small.
    logits -= np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)
    # One multinomial trial yields a one-hot row; its arg-max is the drawn index.
    draw = np.random.multinomial(1, probs, 1)
    return np.argmax(draw)

def generate_next(text, num_generated=10):
    """Extend `text` by `num_generated` words sampled from the LSTM.

    Args:
        text: Seed phrase. It is lowercased and split on whitespace; every
            resulting word is passed to `word2idx`, so a word that lookup
            cannot resolve makes this function fail.
        num_generated: Number of words to append.

    Returns:
        The seed words followed by the generated words, joined by single spaces.

    Raises:
        ValueError: If `text` contains no words.
    """
    word_idxs = [word2idx(word) for word in text.lower().split()]
    if not word_idxs:
        raise ValueError('generate_next needs at least one seed word')
    for _ in range(num_generated):
        # Feed the whole running sequence as ONE sample of shape (1, len).
        # A 1-D array would be interpreted as `len` separate one-word
        # sequences, so the prediction would ignore all preceding context.
        batch = np.array([word_idxs])
        prediction = model.predict(x=batch)
        idx = sample(prediction[0], temperature=0.7)
        word_idxs.append(idx)
    return ' '.join(idx2word(idx) for idx in word_idxs)

def on_epoch_end(epoch, _):
    """Print a generated continuation for each of a fixed set of seed phrases.

    Handed to LambdaCallback as its `on_epoch_end` hook; the second positional
    argument is ignored.
    """
    print('\nGenerating text after epoch: %d' % epoch)
    seed_phrases = (
        'deep convolutional',
        'simple and effective',
        'a nonconvex',
        'a',
    )
    for phrase in seed_phrases:
        generated = generate_next(phrase)
        print('%s... -> %s' % (phrase, generated))

# Runs on_epoch_end after every epoch so sample generations are printed during training.
text_preview_callback = LambdaCallback(on_epoch_end=on_epoch_end)
model.fit(
    train_x,
    train_y,
    batch_size=128,
    epochs=20,
    callbacks=[text_preview_callback],
)



Training LSTM...
Epoch 1/20

Generating text after epoch: 0
deep convolutional... -> deep convolutional optimization anrat on correct these asynchronous has resnet regularizing effect
simple and effective... -> simple and effective computations largest efficiently consistently family decay autoencoders universal adopted evolving
a nonconvex... -> a nonconvex potential number networks gprop symmetries arguing descent expensive enough operates
a... -> a parametrized compiler adversarial parameters better traditional code improves various many
Epoch 2/20

Generating text after epoch: 1
deep convolutional... -> deep convolutional including pseudoensemble prominent been code efficient cost knowledge synaptic from
simple and effective... -> simple and effective behavioral unseen gives larger protocol potential better emits of foundation
a nonconvex... -> a nonconvex protocol solving in h theoretical unfortunately source use error unseen
a... -> a can architectures insight compatible success


Generating text after epoch: 14
deep convolutional... -> deep convolutional hypotheses depends science within language crafted extends intimately stacking projections
simple and effective... -> simple and effective important based spurious offers directly pretraining improvements variation sampling intensive
a nonconvex... -> a nonconvex only 2012 been intimately previously essay efficient comprise operations computable
a... -> a lateral its aims batch ffns this et operations variety belief
Epoch 16/20

Generating text after epoch: 15
deep convolutional... -> deep convolutional their signals norm achievable popular boltzmann pattern depthdependency flowing aspects
simple and effective... -> simple and effective consider domain of update svrg expense child suffer use multilayer
a nonconvex... -> a nonconvex various decisions poor areas widelypopular deterministic phonemes gives with whereas
a... -> a of richer commonly understand promise algebra continuous probabilistic suggesting stan

<keras.callbacks.History at 0x1a2deee5c0>