In [1]:
import json
import numpy as np
import tensorflow as tf
import collections
from sklearn.cross_validation import train_test_split
from tensor2tensor.utils import beam_search



In [2]:
with open('ctexts.json','r') as fopen:
    ctexts = json.load(fopen)
    
with open('headlines.json','r') as fopen:
    headlines = json.load(fopen)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def topic_modelling(string, n = 500):
    vectorizer = TfidfVectorizer()
    tf = vectorizer.fit_transform([string])
    tf_features = vectorizer.get_feature_names()
    compose = TruncatedSVD(1).fit(tf)
    return ' '.join([tf_features[i] for i in compose.components_[0].argsort()[: -n - 1 : -1]])

In [4]:
%%time
h, c = [], []
for i in range(len(ctexts)):
    try:
        c.append(topic_modelling(ctexts[i]))
        h.append(headlines[i])
    except:
        pass

  self.explained_variance_ratio_ = exp_var / full_var


CPU times: user 18.7 s, sys: 24.5 s, total: 43.2 s
Wall time: 10.9 s


In [5]:
def build_dataset(words, n_words):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    count.extend(collections.Counter(words).most_common(n_words))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [6]:
concat = ' '.join(c).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])
print('filtered vocab size:',len(dictionary))
print("% of vocab used: {}%".format(round(len(dictionary)/vocabulary_size,4)*100))

vocab from size: 49585
Most common words [('dot', 4394), ('the', 4379), ('comma', 4349), ('to', 4280), ('in', 4268), ('of', 4262)]
Sample data [5, 7, 4, 6, 10, 9, 11, 8, 15, 2062] ['the', 'to', 'dot', 'comma', 'and', 'of', 'on', 'in', 'was', 'festival']
filtered vocab size: 49589
% of vocab used: 100.01%


In [7]:
for i in range(len(h)):
    h[i] = h[i] + ' EOS'
h[0]

'daman and diu revokes mandatory rakshabandhan in offices order EOS'

In [8]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

In [9]:
def str_idx(corpus, dic, UNK=3):
    X = []
    for i in corpus:
        ints = []
        for k in i.split():
            ints.append(dic.get(k, UNK))
        X.append(ints)
    return X

In [10]:
X = str_idx(c, dictionary)
Y = str_idx(h, dictionary)

In [11]:
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = 0.05)

In [12]:
def embed_seq(x, vocab_sz, embed_dim, name, zero_pad=True): 
    embedding = tf.get_variable(name, [vocab_sz, embed_dim]) 
    if zero_pad:
        embedding = tf.concat([tf.zeros([1, embed_dim]), embedding[1:, :]], 0) 
    x = tf.nn.embedding_lookup(embedding, x)
    return x

def position_encoding(inputs):
    T = tf.shape(inputs)[1]
    repr_dim = inputs.get_shape()[-1].value
    pos = tf.reshape(tf.range(0.0, tf.to_float(T), dtype=tf.float32), [-1, 1])
    i = np.arange(0, repr_dim, 2, np.float32)
    denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])
    enc = tf.expand_dims(tf.concat([tf.sin(pos / denom), tf.cos(pos / denom)], 1), 0)
    return tf.tile(enc, [tf.shape(inputs)[0], 1, 1])

def layer_norm(inputs, epsilon=1e-8):
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    normalized = (inputs - mean) / (tf.sqrt(variance + epsilon))
    params_shape = inputs.get_shape()[-1:]
    gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer())
    return gamma * normalized + beta


def cnn_block(x, dilation_rate, pad_sz, hidden_dim, kernel_size):
    x = layer_norm(x)
    pad = tf.zeros([tf.shape(x)[0], pad_sz, hidden_dim])
    x =  tf.layers.conv1d(inputs = tf.concat([pad, x, pad], 1),
                          filters = hidden_dim,
                          kernel_size = kernel_size,
                          dilation_rate = dilation_rate)
    x = x[:, :-pad_sz, :]
    x = tf.nn.relu(x)
    return x

class Summarization:
    def __init__(self, size_layer, num_layers, embedded_size, 
                 dict_size, learning_rate, 
                 kernel_size = 2, n_attn_heads = 16):

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])

        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype = tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype = tf.int32)
        batch_size = tf.shape(self.X)[0]
        self.batch_size = batch_size
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        
        self.embedding = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        
        self.num_layers = num_layers
        self.kernel_size = kernel_size
        self.size_layer = size_layer
        self.n_attn_heads = n_attn_heads
        self.dict_size = dict_size
        
        self.training_logits = self.forward(self.X, decoder_input)

        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
    def forward(self, x, y, reuse = False):
        with tf.variable_scope('forward',reuse=reuse):
            with tf.variable_scope('forward',reuse=reuse):
                encoder_embedded = tf.nn.embedding_lookup(self.embedding, x)
                decoder_embedded = tf.nn.embedding_lookup(self.embedding, y)
                encoder_embedded += position_encoding(encoder_embedded)

                for i in range(self.num_layers): 
                    dilation_rate = 2 ** i
                    pad_sz = (self.kernel_size - 1) * dilation_rate 
                    with tf.variable_scope('block_%d'%i,reuse=reuse):
                        encoder_embedded += cnn_block(encoder_embedded, dilation_rate, 
                                                      pad_sz, self.size_layer, self.kernel_size)

                g = tf.identity(decoder_embedded)
                for i in range(self.num_layers):
                    dilation_rate = 2 ** i
                    pad_sz = (self.kernel_size - 1) * dilation_rate
                    with tf.variable_scope('decode_%d'%i,reuse=reuse):
                        attn_res = h = cnn_block(decoder_embedded, dilation_rate, 
                                                 pad_sz, self.size_layer, self.kernel_size)
                        C = []
                        for j in range(self.n_attn_heads):
                            h_ = tf.layers.dense(h, self.size_layer//self.n_attn_heads)
                            g_ = tf.layers.dense(g, self.size_layer//self.n_attn_heads)
                            zu_ = tf.layers.dense(encoder_embedded, self.size_layer//self.n_attn_heads)
                            ze_ = tf.layers.dense(encoder_embedded, self.size_layer//self.n_attn_heads)

                            d = tf.layers.dense(h_, self.size_layer//self.n_attn_heads) + g_
                            dz = tf.matmul(d, tf.transpose(zu_, [0, 2, 1]))
                            a = tf.nn.softmax(dz)
                            c_ = tf.matmul(a, ze_)
                            C.append(c_)

                        c = tf.concat(C, 2)
                        h = tf.layers.dense(attn_res + c, self.size_layer)
                        decoder_embedded += h

                return tf.layers.dense(decoder_embedded, self.dict_size)

In [13]:
size_layer = 128
num_layers = 4
embedded_size = 128
learning_rate = 1e-3
batch_size = 16
epoch = 20

In [14]:
def beam_search_decoding(length = 20, beam_width = 5):
    initial_ids = tf.fill([model.batch_size], GO)
    
    def symbols_to_logits(ids):
        x = tf.contrib.seq2seq.tile_batch(model.X, beam_width)
        logits = model.forward(x, ids, reuse = True)
        return logits[:, tf.shape(ids)[1]-1, :]

    final_ids, final_probs = beam_search.beam_search(
        symbols_to_logits,
        initial_ids,
        beam_width,
        length,
        len(dictionary),
        0.0,
        eos_id = EOS)
    
    return final_ids

In [15]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Summarization(size_layer, num_layers, embedded_size, 
                len(dictionary), learning_rate)
model.generate = beam_search_decoding()
sess.run(tf.global_variables_initializer())

In [16]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

In [17]:
from tqdm import tqdm
from sklearn.utils import shuffle
import time

for EPOCH in range(10):
    lasttime = time.time()
    total_loss, total_accuracy, total_loss_test, total_accuracy_test = 0, 0, 0, 0
    train_X, train_Y = shuffle(train_X, train_Y)
    test_X, test_Y = shuffle(test_X, test_Y)
    pbar = tqdm(range(0, len(train_X), batch_size), desc='train minibatch loop')
    for k in pbar:
        batch_x, _ = pad_sentence_batch(train_X[k: min(k+batch_size,len(train_X))], PAD)
        batch_y, _ = pad_sentence_batch(train_Y[k: min(k+batch_size,len(train_X))], PAD)
        acc, loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                                      feed_dict={model.X:batch_x,
                                                model.Y:batch_y})
        total_loss += loss
        total_accuracy += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for k in pbar:
        batch_x, _ = pad_sentence_batch(test_X[k: min(k+batch_size,len(test_X))], PAD)
        batch_y, _ = pad_sentence_batch(test_Y[k: min(k+batch_size,len(test_X))], PAD)
        acc, loss = sess.run([model.accuracy, model.cost], 
                                      feed_dict={model.X:batch_x,
                                                model.Y:batch_y})
        total_loss_test += loss
        total_accuracy_test += acc
        pbar.set_postfix(cost=loss, accuracy = acc)
        
    total_loss /= (len(train_X) / batch_size)
    total_accuracy /= (len(train_X) / batch_size)
    total_loss_test /= (len(test_X) / batch_size)
    total_accuracy_test /= (len(test_X) / batch_size)
        
    print('epoch: %d, avg loss: %f, avg accuracy: %f'%(EPOCH, total_loss, total_accuracy))
    print('epoch: %d, avg loss test: %f, avg accuracy test: %f'%(EPOCH, total_loss_test, total_accuracy_test))

train minibatch loop: 100%|██████████| 261/261 [00:50<00:00,  4.80it/s, accuracy=0.107, cost=7.3]  
test minibatch loop: 100%|██████████| 14/14 [00:02<00:00,  5.16it/s, accuracy=0.119, cost=6.92]
train minibatch loop:   0%|          | 1/261 [00:00<00:38,  6.67it/s, accuracy=0.136, cost=6.18]

epoch: 0, avg loss: 7.555350, avg accuracy: 0.115685
epoch: 0, avg loss test: 7.388109, avg accuracy test: 0.125205


train minibatch loop: 100%|██████████| 261/261 [00:42<00:00,  4.88it/s, accuracy=0.142, cost=6.35]
test minibatch loop: 100%|██████████| 14/14 [00:01<00:00, 12.58it/s, accuracy=0.135, cost=6.58]
train minibatch loop:   0%|          | 1/261 [00:00<00:42,  6.12it/s, accuracy=0.179, cost=5.33]

epoch: 1, avg loss: 6.336922, avg accuracy: 0.137254
epoch: 1, avg loss test: 7.359129, avg accuracy test: 0.134677


train minibatch loop: 100%|██████████| 261/261 [00:42<00:00,  4.89it/s, accuracy=0.156, cost=5.53]
test minibatch loop: 100%|██████████| 14/14 [00:01<00:00, 12.38it/s, accuracy=0.124, cost=7.51]
train minibatch loop:   0%|          | 1/261 [00:00<00:34,  7.45it/s, accuracy=0.232, cost=4.51]

epoch: 2, avg loss: 5.481218, avg accuracy: 0.156632
epoch: 2, avg loss test: 7.638296, avg accuracy test: 0.139878


train minibatch loop: 100%|██████████| 261/261 [00:42<00:00,  4.91it/s, accuracy=0.213, cost=4.91]
test minibatch loop: 100%|██████████| 14/14 [00:01<00:00, 12.84it/s, accuracy=0.136, cost=8.32]
train minibatch loop:   0%|          | 1/261 [00:00<00:37,  6.92it/s, accuracy=0.427, cost=2.97]

epoch: 3, avg loss: 4.461336, avg accuracy: 0.220716
epoch: 3, avg loss test: 8.149182, avg accuracy test: 0.129966


train minibatch loop: 100%|██████████| 261/261 [00:42<00:00,  4.45it/s, accuracy=0.302, cost=3.87]
test minibatch loop: 100%|██████████| 14/14 [00:01<00:00, 13.83it/s, accuracy=0.145, cost=8.05]
train minibatch loop:   0%|          | 1/261 [00:00<00:37,  6.89it/s, accuracy=0.533, cost=2.42]

epoch: 4, avg loss: 3.430040, avg accuracy: 0.337620
epoch: 4, avg loss test: 8.745578, avg accuracy test: 0.121561


train minibatch loop: 100%|██████████| 261/261 [00:41<00:00,  5.25it/s, accuracy=0.416, cost=3]   
test minibatch loop: 100%|██████████| 14/14 [00:01<00:00, 12.58it/s, accuracy=0.0876, cost=9.74]
train minibatch loop:   0%|          | 1/261 [00:00<00:46,  5.57it/s, accuracy=0.726, cost=1.33]

epoch: 5, avg loss: 2.539967, avg accuracy: 0.464034
epoch: 5, avg loss test: 9.646397, avg accuracy test: 0.102522


train minibatch loop: 100%|██████████| 261/261 [00:42<00:00,  4.94it/s, accuracy=0.519, cost=2.06]
test minibatch loop: 100%|██████████| 14/14 [00:01<00:00, 13.78it/s, accuracy=0.101, cost=9.13] 
train minibatch loop:   0%|          | 1/261 [00:00<00:44,  5.80it/s, accuracy=0.72, cost=1.23]

epoch: 6, avg loss: 1.788288, avg accuracy: 0.590552
epoch: 6, avg loss test: 10.409043, avg accuracy test: 0.103095


train minibatch loop: 100%|██████████| 261/261 [00:42<00:00,  4.99it/s, accuracy=0.646, cost=1.53] 
test minibatch loop: 100%|██████████| 14/14 [00:01<00:00, 13.53it/s, accuracy=0.0859, cost=11]  
train minibatch loop:   0%|          | 1/261 [00:00<00:47,  5.48it/s, accuracy=0.833, cost=0.662]

epoch: 7, avg loss: 1.217691, avg accuracy: 0.703954
epoch: 7, avg loss test: 11.339424, avg accuracy test: 0.109497


train minibatch loop: 100%|██████████| 261/261 [00:42<00:00,  5.50it/s, accuracy=0.599, cost=1.47] 
test minibatch loop: 100%|██████████| 14/14 [00:01<00:00, 12.60it/s, accuracy=0.088, cost=12.3] 
train minibatch loop:   0%|          | 1/261 [00:00<00:34,  7.52it/s, accuracy=0.864, cost=0.515]

epoch: 8, avg loss: 0.820378, avg accuracy: 0.790957
epoch: 8, avg loss test: 12.057604, avg accuracy test: 0.106906


train minibatch loop: 100%|██████████| 261/261 [00:42<00:00,  5.10it/s, accuracy=0.85, cost=0.62]  
test minibatch loop: 100%|██████████| 14/14 [00:01<00:00, 13.61it/s, accuracy=0.0846, cost=13.2]

epoch: 9, avg loss: 0.542591, avg accuracy: 0.855785
epoch: 9, avg loss test: 12.876972, avg accuracy test: 0.108851





In [18]:
generated = [rev_dictionary[i] for i in sess.run(model.generate, feed_dict = {model.X: [test_X[0]]})[0,0,:]]
' '.join(generated)

'GO modi girl schools at grounds EOS PAD PAD PAD PAD PAD'

In [19]:
' '.join([rev_dictionary[i] for i in test_Y[0]])

'du may introduce entrance tests for ba comma UNK courses EOS'