In [1]:
import tensorflow as tf
from tensorflow.contrib import seq2seq

In [2]:
class Model:
    """Sentence encoder trained to reconstruct the neighbouring sentences (skip-thought style).

    The graph (TensorFlow 1.x, graph mode) encodes the sentence fed to ``INPUT``
    with a bidirectional GRU into a single vector (``get_thought``) and uses that
    vector as the initial state of two GRU decoders, one trained on the sentence
    fed to ``AFTER`` and one on the sentence fed to ``BEFORE``.

    Token id conventions used by the graph:
      * id 0 is treated as padding (it is excluded from the sequence length and
        from the loss mask, both of which are computed with ``tf.sign``);
      * id 2 is prepended to the decoder inputs as the start token;
      * the first 4 rows of the embedding matrix are reserved for special ids,
        the pretrained word vectors follow.

    Args:
        w2v: object exposing ``vectors``; its first ``vocabulary_size`` rows
            initialise the (frozen) word embeddings.
        maxlen: fixed length of every id sequence fed to the placeholders.
        vocabulary_size: number of pretrained vectors used, and number of
            output classes of the projection layer.
        output_size: number of units of every GRU cell.
        learning_rate: learning rate passed to the Adam optimizer.
        embedding_size: width of the embedding vectors.
        batch_size: kept for backward compatibility and stored on the instance;
            the graph itself takes the batch size from the fed tensors.
        max_grad_norm: global-norm threshold used for gradient clipping.
    """

    def __init__(self,w2v,maxlen=50, 
                 vocabulary_size=20000,
                 output_size=512, 
                 learning_rate=1e-3,
                 embedding_size = 256,
                 batch_size=16,
                 max_grad_norm=10):
        # Keep the hyper-parameters on the instance: the graph-building methods
        # below must use the values given to *this* constructor, not whatever
        # globals of the same name happen to exist in the notebook.
        self.maxlen = maxlen
        self.vocabulary_size = vocabulary_size
        self.output_size = output_size
        self.embedding_size = embedding_size
        self.batch_size = batch_size

        # Plain-Python sqrt(3) so the class does not depend on numpy having been
        # imported by a later notebook cell.
        init_bound = 3 ** 0.5
        special_embeddings = tf.get_variable(
            'special_embeddings',
            shape=[4, embedding_size],
            initializer=tf.initializers.random_uniform(-init_bound, init_bound),
            trainable=False)
        word_embeddings = tf.get_variable(
            "word_embeddings",
            shape=[vocabulary_size, embedding_size],
            initializer=tf.initializers.constant(w2v.vectors[:vocabulary_size]),
            trainable=False)
        self.global_step = tf.get_variable(
            "global_step", shape=[], trainable=False,
            initializer=tf.initializers.zeros())
        # Rows 0-3: special ids, rows 4+: pretrained word vectors.
        self.embeddings = tf.concat([special_embeddings, word_embeddings], 0)
        # One projection layer shared by both decoders.
        self.output_layer = tf.layers.Dense(vocabulary_size, name="output_layer")
        self.output_layer.build(output_size)

        # Id sequences of the previous, current and next sentence.
        self.BEFORE = tf.placeholder(tf.int32,[None,maxlen])
        self.INPUT = tf.placeholder(tf.int32,[None,maxlen])
        self.AFTER = tf.placeholder(tf.int32,[None,maxlen])

        self.get_thought = self.thought(self.INPUT)
        fw_logits = self.decoder(self.get_thought, self.AFTER)
        bw_logits = self.decoder(self.get_thought, self.BEFORE)
        self.loss = (self.calculate_loss(fw_logits, self.AFTER)
                     + self.calculate_loss(bw_logits, self.BEFORE))
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars), max_grad_norm)

        # Despite the name this is the training op (Adam update + global_step increment).
        self.optimizer = tf.train.AdamOptimizer(learning_rate).apply_gradients(
            zip(grads, tvars), global_step=self.global_step)

    def get_embedding(self, inputs):
        """Look up the embedding vectors for a tensor of token ids."""
        return tf.nn.embedding_lookup(self.embeddings, inputs)

    def thought(self, inputs):
        """Encode id sequences into one vector per sentence.

        Runs a bidirectional GRU over the embedded ``inputs`` and returns the sum
        of the final forward and backward states.
        """
        encoder_in = self.get_embedding(inputs)
        fw_cell = tf.nn.rnn_cell.GRUCell(self.output_size)
        bw_cell = tf.nn.rnn_cell.GRUCell(self.output_size)
        # Ids are non-negative and 0 is padding, so sign() counts the real tokens.
        sequence_length = tf.reduce_sum(tf.sign(inputs), axis=1)
        fw_state, bw_state = tf.nn.bidirectional_dynamic_rnn(
            fw_cell, bw_cell, encoder_in, sequence_length=sequence_length,
            dtype=tf.float32)[1]
        return fw_state + bw_state

    def decoder(self, thought, labels):
        """Build a teacher-forced GRU decoder and return its per-step outputs.

        ``thought`` is used as the initial decoder state; the decoder inputs are
        ``labels`` shifted right by one step with start-token id 2 in front.
        """
        # Take the batch size from the fed tensor so the graph works for any
        # number of sentences, not only for one hard-coded batch size.
        batch = tf.shape(labels)[0]
        sos_tokens = tf.fill(tf.stack([batch, 1]), 2)
        shifted_labels = tf.concat([sos_tokens, labels[:,:-1]], 1)
        decoder_in = self.get_embedding(shifted_labels)
        cell = tf.nn.rnn_cell.GRUCell(self.output_size)
        # Decode all maxlen steps for every row; padding is masked out in the loss.
        max_seq_lengths = tf.fill(tf.stack([batch]), self.maxlen)
        helper = seq2seq.TrainingHelper(decoder_in, max_seq_lengths, time_major=False)
        decoder = seq2seq.BasicDecoder(cell, helper, thought)
        decoder_out = seq2seq.dynamic_decode(decoder)[0].rnn_output
        return decoder_out

    def calculate_loss(self, outputs, labels):
        """Project decoder outputs to vocabulary logits and return the masked sequence loss."""
        # Positions whose label is 0 (padding) get weight 0.
        mask = tf.cast(tf.sign(labels), tf.float32)
        logits = self.output_layer(outputs)
        return seq2seq.sequence_loss(logits, labels, mask)

In [3]:
import re
from tqdm import tqdm
import random
import numpy as np

In [4]:
def sentences(s):
    """Split raw text into a list of cleaned sentences.

    The text is split on '.', every character other than ASCII letters, digits,
    spaces and apostrophes is replaced by a space, and runs of spaces are
    collapsed. Fragments that end up empty (trailing period, '...', fragments
    made only of punctuation or whitespace) are dropped so they do not enter the
    corpus as fake sentences.

    Args:
        s: the raw text.

    Returns:
        List of non-empty cleaned sentence strings, in document order.
    """
    result = []
    for sentence in s.split('.'):
        sentence = re.sub(r"[^A-Za-z0-9 ']", " ", sentence)
        sentence = re.sub(r"[ ]+", " ", sentence).strip()
        if sentence:
            result.append(sentence)
    return result

def sequence(s, w2v_model, maxlen, vocabulary_size):
    """Convert a sentence into a fixed-length vector of token ids.

    At most ``maxlen - 2`` whitespace-separated words are kept. A word found in
    ``w2v_model`` maps to its vocabulary index + 4 (the first four ids are
    reserved) when that id is below ``vocabulary_size``; any other word maps to
    id 1. Id 3 is written directly after the last word and the remaining
    positions stay 0.

    Args:
        s: the sentence text.
        w2v_model: object supporting ``word in w2v_model`` and
            ``w2v_model.vocab[word].index``.
        maxlen: length of the returned vector.
        vocabulary_size: exclusive upper bound for usable word ids.

    Returns:
        ``numpy.int32`` array of length ``maxlen``.
    """
    # max(..., 0) keeps a tiny maxlen from turning the slice bound negative.
    words = s.split()[:max(maxlen - 2, 0)]
    np_array = np.zeros((maxlen), dtype=np.int32)
    for no, word in enumerate(words):
        id_to_append = 1
        if word in w2v_model:
            word_id = w2v_model.vocab[word].index + 4
            if word_id < vocabulary_size:
                id_to_append = word_id
        np_array[no] = id_to_append
    # The end marker goes right after the last word. Using len(words) also puts
    # it at index 0 for an empty sentence instead of leaving a padding id
    # in front of it.
    np_array[len(words)] = 3
    return np_array

def generate_batch(sentences,batch_size,w2v_model,maxlen,vocabulary_size):
    """Sample a random run of consecutive sentences and return (previous, current, next) batches.

    A window of ``batch_size + 2`` consecutive sentences is picked at a random
    offset and each sentence is converted with ``sequence``. Every group of
    three neighbouring rows then forms one training example.

    Returns:
        Tuple of three arrays with one row per example: the preceding
        sentences, the centre sentences and the following sentences.
    """
    span = batch_size + 2
    start = random.randint(0, len(sentences) - span)
    encoded = np.array([
        sequence(text, w2v_model, maxlen, vocabulary_size)
        for text in sentences[start:start + span]
    ])
    # triples[k] holds rows k, k+1, k+2 of the window.
    triples = np.array([encoded[offset:offset + 3] for offset in range(batch_size)])
    previous, current, following = triples[:, 0], triples[:, 1], triples[:, 2]
    return previous, current, following

In [5]:
import os
contents = []
# os.listdir returns entries in arbitrary order; sort them so the corpus (and
# therefore which sentences are neighbours across book boundaries) is the same
# on every run, and skip anything that is not a regular file so open() cannot
# fail on a sub-directory.
book_names = sorted(
    name for name in os.listdir('books')
    if os.path.isfile(os.path.join('books', name))
)
for filename in tqdm(book_names):
    with open(os.path.join('books', filename)) as f:
        contents.extend(sentences(f.read()))

100%|██████████| 27/27 [00:00<00:00, 28.66it/s]


In [6]:
from gensim.models import KeyedVectors
# Load the pretrained word2vec vectors from the binary GoogleNews file into a
# gensim KeyedVectors object.
w2v_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [7]:
# Sanity check: shape of the first ten word vectors (rows, embedding width).
w2v_model.vectors[:10].shape

(10, 300)

In [8]:
# Preprocessing limits: fixed sequence length and exclusive upper bound on word ids.
maxlen, vocabulary_size = 50, 20000

# Model / optimisation settings.
output_size = 300
learning_rate = 1e-3
batch_size = 16

# The embedding width is taken from the loaded word2vec matrix.
embedding_size = w2v_model.vectors.shape[1]

In [9]:
# Start from an empty graph so re-running this cell does not collide with
# variables created by a previous run.
tf.reset_default_graph()
sess = tf.InteractiveSession()
# Pass every hyper-parameter from the configuration cell explicitly; otherwise
# edits to maxlen, vocabulary_size, learning_rate or batch_size there would be
# silently ignored in favour of the constructor defaults.
model = Model(
    w2v_model,
    maxlen=maxlen,
    vocabulary_size=vocabulary_size,
    output_size=output_size,
    learning_rate=learning_rate,
    embedding_size=embedding_size,
    batch_size=batch_size,
)
sess.run(tf.global_variables_initializer())

In [10]:
# Ten passes of 5000 randomly sampled minibatches each; the progress bar shows
# the loss of the most recent minibatch.
for i in range(10):
    pbar = tqdm(range(5000), desc='train minibatch loop')
    for p in pbar:
        bw_input, current_input, fw_input = generate_batch(
            contents, batch_size, w2v_model, maxlen, vocabulary_size)
        # Previous / current / next sentence go to the matching placeholders.
        feed = {
            model.BEFORE: bw_input,
            model.INPUT: current_input,
            model.AFTER: fw_input,
        }
        loss, _ = sess.run([model.loss, model.optimizer], feed_dict=feed)
        pbar.set_postfix(cost=loss)

train minibatch loop: 100%|██████████| 5000/5000 [11:44<00:00,  7.42it/s, cost=8.77] 
train minibatch loop: 100%|██████████| 5000/5000 [11:44<00:00,  7.06it/s, cost=8.69]    
train minibatch loop: 100%|██████████| 5000/5000 [11:45<00:00,  6.88it/s, cost=7.81]    
train minibatch loop: 100%|██████████| 5000/5000 [11:47<00:00,  6.75it/s, cost=9.93]    
train minibatch loop: 100%|██████████| 5000/5000 [11:43<00:00,  6.83it/s, cost=7.54]    
train minibatch loop: 100%|██████████| 5000/5000 [11:56<00:00,  6.45it/s, cost=6.9]     
train minibatch loop: 100%|██████████| 5000/5000 [12:05<00:00,  7.09it/s, cost=8.92]    
train minibatch loop: 100%|██████████| 5000/5000 [12:09<00:00,  6.54it/s, cost=7.74]    
train minibatch loop: 100%|██████████| 5000/5000 [12:01<00:00,  7.24it/s, cost=7.79]    
train minibatch loop: 100%|██████████| 5000/5000 [11:51<00:00,  6.98it/s, cost=8.4]     


In [11]:
# Read a single book and split it into cleaned sentences.
with open('books/Blood_Born') as f:
    book = sentences(f.read())

# Turn every sentence into a fixed-length id vector ...
book_sequences = [
    sequence(sentence, w2v_model, maxlen, vocabulary_size)
    for sentence in book
]
# ... and run only the encoder to get one vector per sentence.
encoded = sess.run(
    model.get_thought,
    feed_dict={model.INPUT: np.array(book_sequences)},
)

In [17]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

# Cluster the sentence vectors into ceil(sqrt(N)) groups.
n_clusters = int(np.ceil(len(encoded)**0.5))
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(encoded)

# Mean sentence position of each cluster's members, used to order the clusters
# by where they occur in the book.
avg = [np.mean(np.where(kmeans.labels_ == j)[0]) for j in range(n_clusters)]

# Index of the sentence nearest to each cluster centre.
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, encoded)

# Print one representative sentence per cluster, in book order.
ordering = sorted(range(n_clusters), key=avg.__getitem__)
print('. '.join(book[closest[idx]] for idx in ordering))

