In [1]:
from __future__ import print_function

import numpy as np
import gensim
import string

from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import get_file

print('\nFetching the text...')
url = 'https://raw.githubusercontent.com/maxim5/stanford-tensorflow-tutorials/master/data/arxiv_abstracts.txt'
path = get_file('arxiv_abstracts.txt', origin=url)

print('\nPreparing the sentences...')
max_sentence_len = 40
with open(path) as file_:
    docs = file_.readlines()
# file = open("long.txt").read()
# docs=file.split('\n')
# docs=list(file)

sentences = [[word for word in doc.lower().translate(string.punctuation).split()[:max_sentence_len]] for doc in docs]
print('Num sentences:', len(sentences))

print('\nTraining word2vec...')
word_model = gensim.models.Word2Vec(sentences, size=100, min_count=1, window=5, iter=100)
pretrained_weights = word_model.wv.syn0
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)
print('Checking similar words:')
for word in ['model', 'network', 'train', 'learn']:
    most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in word_model.most_similar(word)[:8])
    print('  %s -> %s' % (word, most_similar))

def word2idx(word):
    return word_model.wv.vocab[word].index
def idx2word(idx):
    return word_model.wv.index2word[idx]

print('\nPreparing the data for LSTM...')
train_x = np.zeros([len(sentences), max_sentence_len], dtype=np.int32)
train_y = np.zeros([len(sentences)], dtype=np.int32)
for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence[:-1]):
        train_x[i, t] = word2idx(word)
        train_y[i] = word2idx(sentence[-1])
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)

print('\nTraining LSTM...')
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights]))
model.add(LSTM(units=emdedding_size))
model.add(Dense(units=vocab_size))
model.add(Activation('softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

def sample(preds, temperature=1.0):
    if temperature <= 0:
        return np.argmax(preds)
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate_next(text, num_generated=10):
    word_idxs = [word2idx(word) for word in text.lower().split()]
    for i in range(num_generated):
        prediction = model.predict(x=np.array(word_idxs))
        idx = sample(prediction[-1], temperature=0.7)
        word_idxs.append(idx)
    return ' '.join(idx2word(idx) for idx in word_idxs)

def on_epoch_end(epoch, _):
    print('\nGenerating text after epoch: %d' % epoch)
    texts = [
    'deep convolutional',
    'simple and effective',
    'a nonconvex',
    'a'
    ]
    break_s='\n'+'-'*50+'\n'
    for text in texts:
        sample = generate_next(text)
        print('%s... -> %s%s' % (text, sample,break_s))

model.fit(train_x, train_y,
          batch_size=128,
          epochs=2,
          callbacks=[LambdaCallback(on_epoch_end=on_epoch_end)])


Fetching the text...

Preparing the sentences...
Num sentences: 7200

Training word2vec...




Result embedding shape: (1351, 100)
Checking similar words:
  model -> technique (0.36), $l_p$ (0.36), architecture. (0.34), extend (0.34), trains (0.33), approach (0.32), of (0.30), rnn, (0.29)
  network -> networks (0.41), constrained (0.24), architecture (0.24), given (0.23), trained (0.23), there (0.21), help (0.21), accepted (0.21)
  train -> based (0.42), eigendecompositions (0.34), classical (0.33), improve (0.31), construct (0.30), then (0.29), average (0.29), extend (0.28)
  learn -> automatically (0.40), tend (0.37), adopted (0.37), remain (0.35), effectively (0.35), adapt (0.35), relevant (0.34), lower (0.34)

Preparing the data for LSTM...
train_x shape: (7200, 40)
train_y shape: (7200,)

Training LSTM...
Train on 7200 samples
Epoch 1/2
Generating text after epoch: 0
deep convolutional... -> deep convolutional (dbn) now discuss back-propagated datasets (rnn) distributions. labels theoretical experimentally,
--------------------------------------------------

simple and effe

<tensorflow.python.keras.callbacks.History at 0x7efdb2243350>

In [8]:
predict=model.predict(train_x)