In [1]:
import os
import re
import tensorflow as tf
import numpy as np
import json

In [2]:
from tensorflow.nn.rnn_cell import RNNCell, MultiRNNCell

In [3]:
labels = os.listdir('news')
news = ['news/' + i for i in labels if '.json' in i]
labels = [i.replace('.json','') for i in labels]
len(news)

123

In [4]:
import malaya

In [5]:
tokenizer = malaya.preprocessing._SocialTokenizer().tokenize

In [6]:
accept_tokens = ',-.()"\''

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    tokenized = tokenizer(string)
    tokenized = [w.lower() for w in tokenized if len(w) > 1 or w in accept_tokens]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

def clean_label(label):
    string = re.sub('[^A-Za-z\- ]+', ' ', label)
    return re.sub(r'[ ]+', ' ', string.lower()).strip()

In [7]:
labels = [clean_label(label) for label in labels]

In [8]:
from sklearn.utils import shuffle

maxlen = 150
maxlen_topic = 10
min_len = 20

x, y = [], []
for no, n in enumerate(news):
    with open(n) as fopen: 
        news_ = json.load(fopen)
    for row in news_:
        if len(row['text'].split()) > min_len:
            p = preprocessing(row['text'])
            p = p[:maxlen]
            x.append(p)
            y.append(labels[no])
            
x, y = shuffle(x, y)

In [9]:
len(x), len(y)

(14471, 14471)

In [10]:
import collections

def build_dataset(words, n_words, atleast=2):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    counter = [i for i in counter if i[1] >= atleast]
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [11]:
import itertools

concat = list(itertools.chain(*x)) + ' '.join(labels).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])
print('filtered vocab size:',len(dictionary))
print("% of vocab used: {}%".format(round(len(dictionary)/vocabulary_size,4)*100))

vocab from size: 63032
Most common words [(',', 90498), ('.', 80674), ('yang', 32102), ('-', 29965), ('the', 29732), ('dan', 28171)]
Sample data [255, 24344, 458, 8433, 131, 5, 400, 2310, 522, 1142] ['produk', 'albothyl', 'tengah', 'meresahkan', 'masyarakat', '.', 'karena', 'mengandung', 'bahan', 'berbahaya']
filtered vocab size: 38621
% of vocab used: 61.27%


In [12]:
for i in range(len(x)):
    x.append('EOS')

In [13]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

In [14]:
from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import gen_array_ops
from tensorflow.python.layers import core as layers_core
import tensorflow as tf


class MTAWrapper(RNNCell):
    def __init__(self, cell, memory, v, uf, query_layer, memory_layer, mask=None, 
                 max_len=100, attention_size=128, state_is_tuple=True
                 ):
        if not isinstance(cell, RNNCell):
            raise TypeError("The parameter cell is not RNNCell.")
        self._cell = cell
        self.memory = memory
        self._state_is_tuple = state_is_tuple
        self.attention_size = attention_size
        
        self.batch_size = tf.shape(self.memory)[0]
        self.num_keywords = tf.shape(self.memory)[1]
        self.embedding_size = tf.shape(self.memory)[2]

        self.coverage_vector = tf.fill([self.batch_size, self.num_keywords], 1.0)
        
        if mask is None:
            self.seq_len = tf.fill([self.batch_size, 1], 1.0)  # inference
        else:
            self.seq_len = math_ops.reduce_sum(mask, axis=1, keepdims=True)  # training

        self.v = v
        self.query_layer = query_layer

        self.memory_layer = memory_layer
        self.u_f = uf
        res1 = tf.sigmoid(
            tf.matmul(tf.reshape(self.memory, [self.batch_size, -1]), self.u_f))
        self.phi_res = self.seq_len * res1
    @property
    def state_size(self):
        return self._cell.state_size

    @property
    def output_size(self):
        return self._cell.output_size

    def __call__(self, inputs, state, scope=None):
        c_t, h_t = state
        dtype = inputs.dtype

        with vs.variable_scope("topic_attention"):
            keys = self.memory_layer(self.memory)
            processed_query = array_ops.expand_dims(self.query_layer(h_t), 1) 
            score = self.coverage_vector * math_ops.reduce_sum(self.v * math_ops.tanh(keys + processed_query), [2])
            score = nn_ops.softmax(score, axis=1)
            score_tile = gen_array_ops.tile(array_ops.expand_dims(score, -1), [1, 1, self.embedding_size],
                                            name="weight")
            mt = math_ops.reduce_sum(self.memory * score_tile, axis=1)
            self.coverage_vector = self.coverage_vector - score / self.phi_res
        return self._cell(tf.concat([inputs, mt], axis=1), state)

In [15]:
class Generator:
    def __init__(self, size_layer, num_layers, embedded_size, 
                 dict_size, learning_rate, beam_width = 5):
        
        def lstm_cell(size, reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size, initializer=tf.orthogonal_initializer(),
                                           reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]
        
        embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        topic_embedded = tf.nn.embedding_lookup(embeddings, self.X)
        topic_average = tf.reduce_mean(topic_embedded, axis=1)
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        
        decoder_cells = lstm_cell(size_layer)
        self.decoder_cells = decoder_cells
        dense_layer = tf.layers.Dense(dict_size)
        
        self.encoder_state = decoder_cells.zero_state(batch_size=batch_size,
                                                                  dtype=tf.float32)
        
        self.v = tf.get_variable("attention_v", [size_layer])
        self.query_layer = tf.layers.Dense(size_layer)
        self.memory_layer = tf.layers.Dense(size_layer)
        self.uf = tf.get_variable("u_f", [maxlen_topic * embedded_size, maxlen_topic])
        
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        
        with tf.variable_scope('decode'):
            decoder_cells = MTAWrapper(self.decoder_cells, topic_embedded,
                                       self.v, self.uf, self.query_layer, self.memory_layer, mask=masks,
                                      max_len = maxlen)

            training_helper = tf.contrib.seq2seq.TrainingHelper(
                    inputs = tf.nn.embedding_lookup(embeddings, decoder_input),
                    sequence_length = self.Y_seq_len,
                    time_major = False)

            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell = decoder_cells,
                    helper = training_helper,
                    initial_state = self.encoder_state,
                    output_layer = dense_layer)
            training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder = training_decoder,
                    impute_finished = True,
                    maximum_iterations = tf.reduce_max(self.Y_seq_len))
        
        with tf.variable_scope('decode', reuse=True):
            tiled_uf = tf.contrib.seq2seq.tile_batch(self.uf, beam_width)
            tiled_topic_embedded = tf.contrib.seq2seq.tile_batch(topic_embedded, beam_width)
            decoder_cells = MTAWrapper(self.decoder_cells, tiled_topic_embedded,
                                       self.v, tiled_uf, self.query_layer, self.memory_layer,
                                      max_len = maxlen)
        
            predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell = decoder_cells,
                    embedding = embeddings,
                    start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                    end_token = EOS,
                    initial_state = tf.contrib.seq2seq.tile_batch(self.encoder_state, beam_width),
                    beam_width = beam_width,
                    output_layer = dense_layer,
                    length_penalty_weight = 0.0)
        
            predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder = predicting_decoder,
                    impute_finished = False,
                    maximum_iterations = maxlen)
        
        self.training_logits = training_decoder_output.rnn_output
        self.predicting_ids = predicting_decoder_output.predicted_ids[:, :, 0]
        
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [16]:
size_layer = 256
num_layers = 2
embedded_size = 128
learning_rate = 0.001
batch_size = 32
epoch = 20

In [17]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Generator(size_layer, num_layers, embedded_size, len(dictionary), 
                learning_rate)
sess.run(tf.global_variables_initializer())

In [18]:
def str_idx(corpus, dic):
    X = []
    for i in corpus:
        ints = []
        for k in i:
            ints.append(dic.get(k,UNK))
        X.append(ints)
    return X

In [19]:
y = [i.split() for i in y]

In [20]:
X = str_idx(y, dictionary)
Y = str_idx(x, dictionary)

In [21]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

def pad_sentence_batch_static(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = maxlen
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(maxlen)
    return padded_seqs, seq_lens

In [22]:
batch_x, _ = pad_sentence_batch(X[:1], PAD)
sess.run(model.predicting_ids, feed_dict = {model.X: batch_x})

array([[11385,  1941, 17886, 17886, 17886, 17886, 17886, 17886, 17886,
        29260, 31328, 12976, 17886, 17886, 12976, 17886,  8681, 27175,
        17886, 17886, 17886, 17886, 12976,  8681, 27175, 11988, 24553,
        11988, 11988, 24553, 11988, 24553, 11988, 23563, 15066, 36818,
        36818, 36818,  6224, 16241, 11988, 11988, 11988, 11988, 11988,
         6224, 11988, 11988, 11988,  6579,  6579, 37130, 11988, 26193,
        26193, 26193, 37130, 26193, 37130, 37130, 11988, 11988, 11988,
         6224, 11988, 11988, 11988,  6224, 11988,  8826, 11988,  8826,
        37552, 36818, 36818, 36818, 30803, 30803,  6224, 34199, 36818,
        36818, 36818, 36818, 30803,  6224,  6224,  6765,  6765,  6224,
         6765,  6765,  6765,  6765,  5278,  1304, 10806, 29149,  9831,
         9831,  9831,  9831,  9831,  9831, 30803, 30803, 30803, 30803,
         6230, 29149, 30803, 30803,  6230, 29149, 30803, 10806, 29149,
        10806, 29149,  9831,  9831,  9831,  9831, 30803, 30803, 30803,
      

In [23]:
from tqdm import tqdm

In [24]:
for i in range(epoch):
    total_loss, total_accuracy = 0, 0
    pbar = tqdm(
        range(0, len(X), batch_size), desc = 'minibatch loop')
    for k in pbar:
        index = min(k+batch_size, len(X))
        batch_x, seq_x = pad_sentence_batch(X[k: index], PAD)
        batch_y, seq_y = pad_sentence_batch(Y[k: index], PAD)
        accuracy,loss, _ = sess.run([model.accuracy, model.cost, model.optimizer], 
                                      feed_dict={model.X:batch_x,
                                                model.Y:batch_y})
        total_loss += loss
        total_accuracy += accuracy
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    total_loss /= (len(X) / batch_size)
    total_accuracy /= (len(X) / batch_size)
    print('epoch: %d, avg loss: %f, avg accuracy: %f'%(i+1, total_loss, total_accuracy))

minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.0962, cost=6.73]
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 1, avg loss: 7.429072, avg accuracy: 0.059118


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.135, cost=6.03] 
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 2, avg loss: 6.443972, avg accuracy: 0.114713


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.172, cost=5.5] 
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 3, avg loss: 5.930789, avg accuracy: 0.153914


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.204, cost=5.1] 
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 4, avg loss: 5.576925, avg accuracy: 0.179522


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.218, cost=4.78]
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 5, avg loss: 5.312454, avg accuracy: 0.196892


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.23, cost=4.5]  
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 6, avg loss: 5.097266, avg accuracy: 0.210725


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.25, cost=4.26] 
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 7, avg loss: 4.914798, avg accuracy: 0.222112


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.267, cost=4.04]
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 8, avg loss: 4.756433, avg accuracy: 0.232037


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.287, cost=3.83]
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 9, avg loss: 4.614790, avg accuracy: 0.241598


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.314, cost=3.64]
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 10, avg loss: 4.488174, avg accuracy: 0.250645


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.331, cost=3.47]
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 11, avg loss: 4.373607, avg accuracy: 0.259206


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.35, cost=3.31] 
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 12, avg loss: 4.270588, avg accuracy: 0.267425


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.377, cost=3.17]
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 13, avg loss: 4.175698, avg accuracy: 0.275535


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.00it/s, accuracy=0.399, cost=3.05]
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 14, avg loss: 4.087636, avg accuracy: 0.283630


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.426, cost=2.93]
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 15, avg loss: 4.007741, avg accuracy: 0.291590


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.441, cost=2.81]
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 16, avg loss: 3.930929, avg accuracy: 0.299675


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.465, cost=2.71]
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 17, avg loss: 3.859192, avg accuracy: 0.307621


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.479, cost=2.62]
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 18, avg loss: 3.793177, avg accuracy: 0.315043


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.494, cost=2.53]
minibatch loop:   0%|          | 0/453 [00:00<?, ?it/s]

epoch: 19, avg loss: 3.731221, avg accuracy: 0.322467


minibatch loop: 100%|██████████| 453/453 [08:03<00:00,  1.01it/s, accuracy=0.512, cost=2.45]

epoch: 20, avg loss: 3.672711, avg accuracy: 0.329646





In [25]:
test_topic = 'isu najib razak mahathir'

In [26]:
batch_x, _ = pad_sentence_batch(X[:1], PAD)

In [27]:
test_topic_idx = str_idx([test_topic.split()], dictionary)
batch_test, _ = pad_sentence_batch(test_topic_idx, PAD)
batch_test

[[28, 134, 253, 112]]

In [28]:
predict_test = sess.run(model.predicting_ids, feed_dict = {model.X: batch_test})[0]
' '.join([rev_dictionary[i] for i in predict_test])

'- ( ubah saiz teks ) kuala lumpur - perdana menteri , datuk seri najib tun razak hari ini , perdana menteri , datuk seri najib tun razak dan perdana menteri , datuk seri najib tun razak dan perdana menteri , datuk seri najib tun razak dan perdana menteri , datuk seri najib tun razak dan perdana menteri , datuk seri najib tun razak dan perdana menteri , datuk seri najib tun razak dan perdana menteri , datuk seri najib tun razak dan perdana menteri , datuk seri najib tun razak dan perdana menteri , datuk seri najib tun razak dan perdana menteri , datuk seri najib tun razak dan perdana menteri , datuk seri najib tun razak dan perdana menteri , datuk seri najib tun razak dan perdana menteri , datuk seri najib tun razak dan perdana menteri , datuk seri najib tun razak dan perdana menteri datuk seri'