In [1]:
import os
import re
import tensorflow as tf
import numpy as np
import json

In [2]:
labels = os.listdir('news')
news = ['news/' + i for i in labels if '.json' in i]
labels = [i.replace('.json','') for i in labels]
len(news)

123

In [3]:
with open(news[0]) as fopen:
    first_news = json.load(fopen)

In [4]:
first_news[0]['text']

'A+ A-\n\nKetua Umum PKR Datuk Seri Anwar Ibrahim membidas kenyataan anggota Majlis Pimpinan Tertinggi BERSATU Datuk Abdul Kadir Jasin berhubung peruntukan kerajaan kepada Yang di-Pertuan Agong (YDPA).\n\nBeliau berkata walaupun hak untuk menyuarakan pandangan perlu dihormati, wartawan veteran itu perlu menunjukkan penghormatan dan ketertiban (decorum) dalam hal ini.\n\nKatanya, kritikan itu juga dibuat tanpa memberi peluang kepada Institusi Raja-raja Melayu untuk memberi respons.\n\n"Sekiranya buat kritikan sebegini tanpa memberi peluang kepada Raja-Raja Melayu untuk jelaskan jawapan, ia tidak sihat terutamanya menggunakan kedudukan yang ada.\n\n"Saya juga lihat sejumlah penekanan, dan ada fakta (yang) dipertikai, saya rasa kita perlu lebih berwaspada," katanya pada sidang media selepas pertemuan dengan wakil rakyat PKR di sebuah hotel di Selangor, hari ini.\n\nBeliau diminta mengulas artikel Abdul Kadir yang juga ketua media dan komunikasi Majlis Penasihat Kerajaan di blognya hari in

In [5]:
import malaya

In [6]:
tokenizer = malaya.preprocessing._SocialTokenizer().tokenize

In [7]:
accept_tokens = ',-.()"\''

def is_number_regex(s):
    if re.match("^\d+?\.\d+?$", s) is None:
        return s.isdigit()
    return True

def detect_money(word):
    if word[:2] == 'rm' and is_number_regex(word[2:]):
        return True
    else:
        return False

def preprocessing(string):
    tokenized = tokenizer(string)
    tokenized = [w.lower() for w in tokenized if len(w) > 1 or w in accept_tokens]
    tokenized = ['<NUM>' if is_number_regex(w) else w for w in tokenized]
    tokenized = ['<MONEY>' if detect_money(w) else w for w in tokenized]
    return tokenized

def clean_label(label):
    string = re.sub('[^A-Za-z\- ]+', ' ', label)
    return re.sub(r'[ ]+', ' ', string.lower()).strip()

In [8]:
labels = [clean_label(label) for label in labels]

In [9]:
from sklearn.utils import shuffle

maxlen = 150
min_len = 20

x, y = [], []
for no, n in enumerate(news):
    with open(n) as fopen: 
        news_ = json.load(fopen)
    for row in news_:
        if len(row['text'].split()) > min_len:
            p = preprocessing(row['text'])
            p = p[:maxlen]
            x.append(p)
            y.append(labels[no])
            
x, y = shuffle(x, y)

In [10]:
len(x), len(y)

(14471, 14471)

In [11]:
import collections

def build_dataset(words, n_words, atleast=2):
    count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    counter = [i for i in counter if i[1] >= atleast]
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [12]:
import itertools

concat = list(itertools.chain(*x)) + ' '.join(labels).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])
print('filtered vocab size:',len(dictionary))
print("% of vocab used: {}%".format(round(len(dictionary)/vocabulary_size,4)*100))

vocab from size: 63032
Most common words [(',', 90498), ('.', 80674), ('yang', 32102), ('-', 29965), ('the', 29732), ('dan', 28171)]
Sample data [1659, 5, 237, 7, 7, 756, 716, 594, 1239, 12] ['tribunnews', '.', 'com', '-', '-', 'pt', 'xl', 'axiata', 'tbk', '(']
filtered vocab size: 38621
% of vocab used: 61.27%


In [13]:
for i in range(len(x)):
    x.append('EOS')

In [14]:
GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']

In [15]:
class Generator:
    def __init__(self, size_layer, num_layers, embedded_size, 
                 dict_size, learning_rate, beam_width = 5):
        
        def lstm_cell(size, reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size, initializer=tf.orthogonal_initializer(),
                                           reuse=reuse)
        
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        batch_size = tf.shape(self.X)[0]
        
        embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        topic_embedded = tf.nn.embedding_lookup(embeddings, self.X)
        topic_average = tf.reduce_mean(topic_embedded, axis=1)
        topic_state = tf.layers.dense(topic_average, size_layer)
        
        lstm_state = tf.nn.rnn_cell.LSTMStateTuple(c=topic_state, h=topic_state)
        self.encoder_state = tuple([lstm_state] * num_layers)
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        
        decoder_cells = tf.nn.rnn_cell.MultiRNNCell([lstm_cell(size_layer) for _ in range(num_layers)])
        dense_layer = tf.layers.Dense(dict_size)
        
        training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs = tf.nn.embedding_lookup(embeddings, decoder_input),
                sequence_length = self.Y_seq_len,
                time_major = False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cells,
                helper = training_helper,
                initial_state = self.encoder_state,
                output_layer = dense_layer)
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(self.Y_seq_len))
        
        predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                cell = decoder_cells,
                embedding = embeddings,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS,
                initial_state = tf.contrib.seq2seq.tile_batch(self.encoder_state, beam_width),
                beam_width = beam_width,
                output_layer = dense_layer,
                length_penalty_weight = 0.0)
        
        predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = False,
                maximum_iterations = maxlen)
        
        self.training_logits = training_decoder_output.rnn_output
        self.predicting_ids = predicting_decoder_output.predicted_ids[:, :, 0]
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [16]:
size_layer = 256
num_layers = 2
embedded_size = 128
learning_rate = 0.001
batch_size = 8
epoch = 20

In [17]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Generator(size_layer, num_layers, embedded_size, len(dictionary), 
                learning_rate)
sess.run(tf.global_variables_initializer())

In [18]:
def str_idx(corpus, dic):
    X = []
    for i in corpus:
        ints = []
        for k in i:
            ints.append(dic.get(k,UNK))
        X.append(ints)
    return X

In [19]:
y = [i.split() for i in y]

In [20]:
X = str_idx(y, dictionary)
Y = str_idx(x, dictionary)

In [21]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

In [22]:
from tqdm import tqdm

In [24]:
for i in range(epoch):
    total_loss, total_accuracy = 0, 0
    pbar = tqdm(
        range(0, len(X), batch_size), desc = 'minibatch loop')
    for k in pbar:
        index = min(k+batch_size, len(X))
        batch_x, seq_x = pad_sentence_batch(X[k: index], PAD)
        batch_y, seq_y = pad_sentence_batch(Y[k: index], PAD)
        predicted, accuracy,loss, _ = sess.run([model.predicting_ids, 
                                                model.accuracy, model.cost, model.optimizer], 
                                      feed_dict={model.X:batch_x,
                                                model.Y:batch_y})
        total_loss += loss
        total_accuracy += accuracy
        pbar.set_postfix(cost = loss, accuracy = accuracy)
    total_loss /= (len(X) / batch_size)
    total_accuracy /= (len(X) / batch_size)
    print('epoch: %d, avg loss: %f, avg accuracy: %f'%(i+1, total_loss, total_accuracy))

minibatch loop: 100%|██████████| 1809/1809 [35:10<00:00,  1.15s/it, accuracy=0.131, cost=6.35] 
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 1, avg loss: 7.127246, avg accuracy: 0.074340


minibatch loop: 100%|██████████| 1809/1809 [35:12<00:00,  1.16s/it, accuracy=0.175, cost=5.52]
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 2, avg loss: 5.861953, avg accuracy: 0.156625


minibatch loop: 100%|██████████| 1809/1809 [35:13<00:00,  1.16s/it, accuracy=0.192, cost=5.09]
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 3, avg loss: 5.299645, avg accuracy: 0.193236


minibatch loop: 100%|██████████| 1809/1809 [35:15<00:00,  1.16s/it, accuracy=0.218, cost=4.77]
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 4, avg loss: 4.936064, avg accuracy: 0.215528


minibatch loop: 100%|██████████| 1809/1809 [35:15<00:00,  1.16s/it, accuracy=0.233, cost=4.51]
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 5, avg loss: 4.655775, avg accuracy: 0.233165


minibatch loop: 100%|██████████| 1809/1809 [35:15<00:00,  1.16s/it, accuracy=0.254, cost=4.29]
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 6, avg loss: 4.428398, avg accuracy: 0.249557


minibatch loop: 100%|██████████| 1809/1809 [35:14<00:00,  1.16s/it, accuracy=0.26, cost=4.1]  
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 7, avg loss: 4.241566, avg accuracy: 0.264621


minibatch loop: 100%|██████████| 1809/1809 [35:14<00:00,  1.16s/it, accuracy=0.272, cost=3.94]
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 8, avg loss: 4.083615, avg accuracy: 0.279595


minibatch loop: 100%|██████████| 1809/1809 [35:15<00:00,  1.16s/it, accuracy=0.297, cost=3.81]
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 9, avg loss: 3.948315, avg accuracy: 0.294211


minibatch loop: 100%|██████████| 1809/1809 [35:15<00:00,  1.16s/it, accuracy=0.307, cost=3.69]
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 10, avg loss: 3.830216, avg accuracy: 0.307300


minibatch loop: 100%|██████████| 1809/1809 [35:15<00:00,  1.16s/it, accuracy=0.328, cost=3.58]
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 11, avg loss: 3.725735, avg accuracy: 0.319824


minibatch loop: 100%|██████████| 1809/1809 [35:14<00:00,  1.16s/it, accuracy=0.337, cost=3.48]
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 12, avg loss: 3.631060, avg accuracy: 0.331617


minibatch loop: 100%|██████████| 1809/1809 [35:14<00:00,  1.16s/it, accuracy=0.347, cost=3.38]
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 13, avg loss: 3.543331, avg accuracy: 0.342568


minibatch loop: 100%|██████████| 1809/1809 [35:14<00:00,  1.16s/it, accuracy=0.366, cost=3.28]
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 14, avg loss: 3.462204, avg accuracy: 0.353233


minibatch loop: 100%|██████████| 1809/1809 [35:14<00:00,  1.16s/it, accuracy=0.371, cost=3.2] 
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 15, avg loss: 3.387544, avg accuracy: 0.363161


minibatch loop: 100%|██████████| 1809/1809 [35:15<00:00,  1.16s/it, accuracy=0.38, cost=3.15] 
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 16, avg loss: 3.317969, avg accuracy: 0.372744


minibatch loop: 100%|██████████| 1809/1809 [35:14<00:00,  1.16s/it, accuracy=0.401, cost=3.09]
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 17, avg loss: 3.251911, avg accuracy: 0.381816


minibatch loop: 100%|██████████| 1809/1809 [35:14<00:00,  1.15s/it, accuracy=0.404, cost=3.02]
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 18, avg loss: 3.189790, avg accuracy: 0.390488


minibatch loop: 100%|██████████| 1809/1809 [35:14<00:00,  1.16s/it, accuracy=0.42, cost=2.96] 
minibatch loop:   0%|          | 0/1809 [00:00<?, ?it/s]

epoch: 19, avg loss: 3.130544, avg accuracy: 0.398999


minibatch loop: 100%|██████████| 1809/1809 [35:14<00:00,  1.16s/it, accuracy=0.426, cost=2.91]

epoch: 20, avg loss: 3.074743, avg accuracy: 0.406972





In [25]:
test_topic = 'isu najib razak mahathir'

In [26]:
batch_x, _ = pad_sentence_batch(X[:1], PAD)

In [27]:
test_topic_idx = str_idx([test_topic.split()], dictionary)
batch_test, _ = pad_sentence_batch(test_topic_idx, PAD)
batch_test

[[28, 134, 253, 112]]

In [28]:
predict_test = sess.run(model.predicting_ids, feed_dict = {model.X: batch_test})[0]
' '.join([rev_dictionary[i] for i in predict_test])

'looks like javascript is disabled in your browser . malaysiakini requires javascript to run normally . click here to enable javascript in your browser . malaysiakini requires javascript to run normally . click here to enable javascript in your browser . malaysiakini requires javascript to run normally . click here to enable javascript in your browser . malaysiakini requires javascript to run normally . click here to enable javascript in your browser . click here to enable portfolio baharu , tan sri dr . wan azizah wan ismail sebagai pengerusi pakatan harapan , tun dr . mahathir mohamad dan naib presiden pkr , datuk seri dr . wan azizah wan ismail sebagai pengerusi pakatan harapan , tun dr . mahathir mohamad dan naib presiden pkr , datuk seri dr . wan azizah wan ismail sebagai timbalan perdana menteri , tan sri dr . wan azizah wan ismail sebagai timbalan perdana'