In [1]:
import os

# Set CUDA_VISIBLE_DEVICES to an empty string before TensorFlow is imported in
# the next cell; presumably this hides every GPU so inference runs on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import tensorflow as tf
import sentencepiece as spm

# Path of the SentencePiece model file loaded into the processor below.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
vocab = '/home/husein/b2b/sp10m.cased.t5.model'
sp = spm.SentencePieceProcessor()
sp.Load(vocab)

class Encoder:
    """Thin adapter exposing ``encode``/``decode`` on top of a SentencePiece-style processor."""

    def __init__(self, sp):
        """Keep a reference to ``sp`` and record the enlarged vocabulary size."""
        piece_count = sp.GetPieceSize()
        # 100 ids are added on top of the piece count; presumably reserved
        # for extra sentinel tokens -- TODO confirm.
        self.vocab_size = piece_count + 100
        self.sp = sp

    def encode(self, s):
        """Return the ids the processor produces for the string ``s``."""
        token_ids = self.sp.EncodeAsIds(s)
        return token_ids

    def decode(self, ids, strip_extraneous=False):
        """Turn an iterable of ids back into text.

        ``strip_extraneous`` is accepted but not used.
        """
        id_list = [*ids]
        return self.sp.DecodeIds(id_list)
    
# Module-level encoder wrapping the processor `sp`; the inference loop uses it
# to encode inputs and decode model outputs.
encoder = Encoder(sp)

In [3]:
def load_graph(frozen_graph_filename):
    """Read a serialized GraphDef from disk and import it into a new tf.Graph.

    Args:
        frozen_graph_filename: path of the frozen ``.pb`` file to read.

    Returns:
        The freshly created ``tf.Graph`` holding the imported definition.
    """
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as handle:
        serialized = handle.read()

    graph_def = tf.GraphDef()
    graph_def.ParseFromString(serialized)

    # Import into a dedicated graph rather than the process-wide default one.
    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(graph_def)
    return graph

In [4]:
# Load the frozen graph and look up the tensors used for inference. All names
# carry the 'import/' prefix because load_graph imports the GraphDef without
# an explicit name.
g = load_graph('output/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')  # fed with the padded token-id batch
top_p = g.get_tensor_by_name('import/top_p:0')
temperature = g.get_tensor_by_name('import/temperature:0')
# NOTE(review): despite the name, the values fetched from this tensor are passed
# to encoder.decode in the inference loop, so it presumably yields generated
# token ids -- confirm.
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.Session(graph = g)

In [5]:
# Download the test set (uncomment to run once):
# !wget https://f000.backblazeb2.com/file/malay-dataset/testset/test-set-cnn.json

In [6]:
import json

# Load the test set; later cells read the 'X' and 'Y' entries of this object.
# NOTE(review): absolute, machine-specific path.
with open('/home/husein/b2b/test-set-cnn.json') as fopen:
    test = json.load(fopen)

In [7]:
import re
from unidecode import unidecode
from malaya.text.rules import normalized_chars

def filter_news(string):
    """Return True when the lower-cased text contains any of the blocked markers."""
    lowered = string.lower()
    markers = (
        'javascript is disabled',
        'requires javascript',
        'javascript',
        'président',
    )
    return any(marker in lowered for marker in markers)

def make_cleaning(s, c_dict):
    """Apply the translation table ``c_dict`` to ``s`` and return the result."""
    return s.translate(c_dict)

def transformer_textcleaning(string):
    """
    Clean raw text before it is tokenized for a transformer model.

    Steps, in order:
      1. transliterate with ``unidecode``;
      2. translate every whitespace-separated word through ``normalized_chars``;
      3. turn ``(dot)`` into ``.``;
      4. if the first ``<a ...>`` tag has attributes containing ``href``,
         remove that attribute text;
      5. replace URLs with a space;
      6. collapse newlines / repeated spaces and drop words starting with ``@``.

    Returns the cleaned text as one space-joined string.
    """
    string = unidecode(string)
    string = ' '.join(
        [make_cleaning(w, normalized_chars) for w in string.split()]
    )
    string = re.sub(r'\(dot\)', '.', string)
    # Capture the attribute text of anchor tags (leading space included) with a
    # single pattern, so the guard and the removal always look at the same tag.
    anchor_attrs = re.findall(r'<a( .*?)>', string)
    if anchor_attrs and 'href' in anchor_attrs[0]:
        # Remove the captured text literally: it is arbitrary page content, and
        # using it as a regex pattern fails on characters such as '?', '(' or '+'.
        string = string.replace(anchor_attrs[0], '')
    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    string = string.replace('\n', ' ')
    words = re.sub(r'[ ]+', ' ', string).strip().split()
    # split() never yields empty strings, so indexing w[0] is safe.
    words = [w for w in words if w[0] != '@']
    return ' '.join(words)

In [8]:
# Short alias for the Keras padding helper used to batch token ids in the
# inference loop.
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

In [None]:
from tqdm import tqdm

# Fixed input length fed to the graph; every example is cut to MAX_LEN - 1
# tokens so the appended end token always fits.
MAX_LEN = 2048
# Id appended as the last token of every input.
# NOTE(review): presumably the end-of-sequence id of the vocabulary -- confirm.
END_TOKEN_ID = 1

batch_size = 10
results = []
for i in tqdm(range(0, len(test['X']), batch_size)):
    batch_x = test['X'][i: i + batch_size]
    batch_ids = []
    for text in batch_x:
        encoded = encoder.encode(f'ringkasan: {transformer_textcleaning(text)}')
        encoded = encoded[:MAX_LEN - 1] + [END_TOKEN_ID]
        batch_ids.append(encoded)
    padded_batch = pad_sequences(batch_ids, padding='post', maxlen = MAX_LEN)
    # Keep the run output in its own name: `g` is the loaded graph, and
    # rebinding it here would break any cell that still expects the graph.
    generated = test_sess.run(
        logits, feed_dict = {x: padded_batch, top_p: 0.0, temperature: 0.0}
    )
    for row in generated:
        results.append(encoder.decode(row.tolist()))

 14%|█▎        | 136/1000 [2:22:24<24:24:20, 101.69s/it]

In [18]:
from tensor2tensor.utils import rouge
from tensorflow.keras.preprocessing import sequence

In [20]:
import numpy as np

# Separate SentencePiece processor, loaded from the same `vocab` file, that the
# ROUGE helpers use to turn text into token ids.
sp_model = spm.SentencePieceProcessor()
sp_model.Load(vocab)

def _masked_id_pairs(predicted, batch_y):
    """Tokenize both sides and return one (reference_ids, prediction_ids) pair per reference.

    Both lists of strings are encoded with ``sp_model`` and post-padded with
    zeros to a common length. For each example, ``n`` is the number of non-zero
    ids in the reference row, and both rows are cut to their first ``n``
    positions.

    NOTE(review): the prediction is therefore truncated (or zero-padded) to the
    reference's token count before scoring, so the scores are not ROUGE over the
    full prediction. Kept as-is so previously reported numbers stay comparable.

    Raises:
        ValueError: if ``batch_y`` is empty or has more rows than ``predicted``.
    """
    if len(batch_y) == 0:
        raise ValueError('need at least one reference/prediction pair')
    if len(predicted) < len(batch_y):
        raise ValueError(
            f'got {len(predicted)} predictions for {len(batch_y)} references'
        )

    ref_ids = [sp_model.EncodeAsIds(row) for row in batch_y]
    pred_ids = [sp_model.EncodeAsIds(row) for row in predicted]
    maxlen = max(max(len(row) for row in ref_ids), max(len(row) for row in pred_ids))
    ref_ids = sequence.pad_sequences(ref_ids, padding = 'post', maxlen = maxlen)
    pred_ids = sequence.pad_sequences(pred_ids, padding = 'post', maxlen = maxlen)

    # Slicing to the first n positions selects the same elements as a boolean
    # mask of n leading True values, without needing the removed `np.bool` alias.
    ref_lengths = np.count_nonzero(ref_ids, axis = 1)
    return [(ref_ids[i][:n], pred_ids[i][:n]) for i, n in enumerate(ref_lengths)]


def calculate_rouges(predicted, batch_y, n_size = 2):
    """Mean ``rouge.rouge_n`` score over all examples.

    Args:
        predicted: list of predicted strings.
        batch_y: list of reference strings; extra predictions beyond its length
            are ignored.
        n_size: n-gram order passed to ``rouge.rouge_n`` as ``n``.

    Returns:
        ``np.mean`` of the per-example scores.

    Raises:
        ValueError: if ``batch_y`` is empty or longer than ``predicted``.
    """
    # Reference ids go in as the first argument and prediction ids as the second.
    # NOTE(review): check this against the order tensor2tensor's rouge expects.
    scores = [
        rouge.rouge_n([ref], [pred], n = n_size)
        for ref, pred in _masked_id_pairs(predicted, batch_y)
    ]
    return np.mean(scores)


def calculate_rouge_l(predicted, batch_y):
    """Mean ``rouge.rouge_l_sentence_level`` score over all examples.

    Args:
        predicted: list of predicted strings.
        batch_y: list of reference strings; extra predictions beyond its length
            are ignored.

    Returns:
        ``np.mean`` of the per-example scores.

    Raises:
        ValueError: if ``batch_y`` is empty or longer than ``predicted``.
    """
    # Same argument order as calculate_rouges: reference first, prediction second.
    scores = [
        rouge.rouge_l_sentence_level([ref], [pred])
        for ref, pred in _masked_id_pairs(predicted, batch_y)
    ]
    return np.mean(scores)

In [21]:
# Unigram score (n_size = 1); the references are cut to the number of
# predictions actually collected in `results`.
calculate_rouges(results, test['Y'][:len(results)], n_size = 1)

0.27545756

In [22]:
# Bigram score (n_size = 2) over the same slice of references.
calculate_rouges(results, test['Y'][:len(results)], n_size = 2)

0.08543168

In [23]:
# Sentence-level ROUGE-L score over the same slice of references.
calculate_rouge_l(results, test['Y'][:len(results)])

0.18890981