In [1]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/summary/results-semi-0.json
# !wget https://f000.backblazeb2.com/file/malay-dataset/summary/results-semi-1.json
# !wget https://f000.backblazeb2.com/file/malay-dataset/summary/results-semi-2.json

In [2]:
# !pip3 install malaya
# !wget https://raw.githubusercontent.com/huseinzol05/Malaya/master/malaya/text/bahasa/lapor.py
from lapor import lapor as _lapor_words
# Extend the imported phrase list with one extra entry. The membership guard
# keeps the cell idempotent: the list lives in the imported module, so an
# unconditional append would add a duplicate on every re-run of this cell.
if 'lapor Afp' not in _lapor_words:
    _lapor_words.append('lapor Afp')

In [3]:
import re

def _get_ngrams(n, text):
    ngram_set = set()
    text_length = len(text)
    max_index_ngram_start = text_length - n
    for i in range(max_index_ngram_start + 1):
        ngram_set.add(tuple(text[i : i + n]))
    return ngram_set


def _get_word_ngrams(n, sentences):
    """Collect the distinct word n-grams across tokenized sentences.

    The sentences are concatenated into one flat word list before the n-grams
    are extracted, so an n-gram may span a sentence boundary.

    Args:
        n: Size of each n-gram; must be positive.
        sentences: Non-empty sequence of sentences, each an iterable of words.

    Returns:
        The set of n-gram tuples produced by ``_get_ngrams``.

    Raises:
        AssertionError: If ``sentences`` is empty or ``n`` is not positive.
    """
    assert len(sentences) > 0
    assert n > 0

    # Flatten in a single pass. ``sum(sentences, [])`` re-copies the
    # accumulated list for every sentence (quadratic) and only works when
    # every sentence is a list.
    words = [word for sentence in sentences for word in sentence]
    return _get_ngrams(n, words)


def cal_rouge(evaluated_ngrams, reference_ngrams):
    """Score a candidate n-gram set against a reference n-gram set.

    Args:
        evaluated_ngrams: Set of n-grams from the text being scored.
        reference_ngrams: Set of n-grams from the reference text.

    Returns:
        Dict with keys ``'f'`` (F1), ``'p'`` (precision) and ``'r'`` (recall).
        Precision is 0.0 when ``evaluated_ngrams`` is empty and recall is 0.0
        when ``reference_ngrams`` is empty.
    """
    n_evaluated = len(evaluated_ngrams)
    n_reference = len(reference_ngrams)
    n_shared = len(evaluated_ngrams.intersection(reference_ngrams))

    precision = n_shared / n_evaluated if n_evaluated != 0 else 0.0
    recall = n_shared / n_reference if n_reference != 0 else 0.0

    # The 1e-8 term keeps the denominator non-zero when both scores are 0.
    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))
    return {'f': f1_score, 'p': precision, 'r': recall}

def _rouge_clean(s):
    s = re.sub(r'[^a-zA-Z0-9 ]', '', s)
    return re.sub(r'[ ]+', ' ', s).strip().lower()

In [4]:
import malaya
# model = malaya.true_case.transformer()

In [5]:
import json

# Result shards; the names match the files fetched by the commented-out wget
# commands in the first cell.
SHARD_PATHS = (
    'results-semi-0.json',
    'results-semi-1.json',
    'results-semi-2.json',
)

# Concatenate the top-level list of every shard into ``data``. The encoding is
# given explicitly because ``open`` otherwise falls back to the platform
# locale, which is not UTF-8 everywhere.
data = []
for shard_path in SHARD_PATHS:
    with open(shard_path, encoding='utf-8') as fopen:
        data += json.load(fopen)

In [6]:
import itertools
# Flatten one level: every element of ``data`` is itself iterable, and
# ``merged`` is the flat list of their items in order.
merged = [item for group in data for item in group]

['pukul 09.', '00am.']

In [77]:
from malaya.text.function import split_into_sentences

def filter_rouge(article, summary, n = 2, threshold = 0.1, min_length = 15, **kwargs):
    """Keep only the summary sentences whose n-grams are found in the article.

    The summary is split into sentences. Sentences shorter than ``min_length``
    characters are kept unconditionally; every other sentence is kept only if
    its n-gram precision against the article reaches ``threshold``.

    Args:
        article: Source text the summary is checked against.
        summary: Summary text to filter.
        n: N-gram size used for the overlap score.
        threshold: Minimum precision (inclusive) for a sentence to be kept.
        min_length: Sentences with fewer characters than this skip the
            overlap check. Defaults to 15, the previously hard-coded cutoff.
        **kwargs: Accepted and ignored.

    Returns:
        The surviving sentences joined with single spaces.
    """
    sents = split_into_sentences(summary)
    reference = _get_word_ngrams(n, [_rouge_clean(article).split()])

    def _keep(sentence):
        # Very short fragments are passed through without scoring.
        if len(sentence) < min_length:
            return True
        evaluated = _get_word_ngrams(n, [_rouge_clean(sentence).split()])
        return cal_rouge(evaluated, reference)['p'] >= threshold

    return ' '.join(sentence for sentence in sents if _keep(sentence))

In [103]:
from unidecode import unidecode

def postprocessing_summarization(string, lapors = _lapor_words):
    """Strip reporting phrases from a summary and pass it through unidecode.

    For every phrase in ``lapors`` that occurs in ``string``, each occurrence
    is replaced by a single space, together with any whitespace and an
    optional comma or full stop directly in front of it. Runs of spaces are
    then collapsed and both ends trimmed.

    Args:
        string: Text to clean.
        lapors: Iterable of literal phrases to remove. Defaults to the
            module-level ``_lapor_words`` list.

    Returns:
        The cleaned text as returned by ``unidecode``.
    """
    for phrase in lapors:
        if phrase in string:
            # The phrase is a literal (it is tested with ``in`` above), so it
            # is escaped before being embedded in the pattern. Phrases lifted
            # from free text may contain regex metacharacters such as '(',
            # which would otherwise change the match or raise ``re.error``.
            pattern = r'\s*[,.]?\s*' + re.escape(phrase)
            string = re.sub(pattern, ' ', string)

    string = re.sub(r'[ ]+', ' ', string).strip()
    return unidecode(string)

In [79]:
import itertools

def _pad_sequence(
    sequence,
    n,
    pad_left = False,
    pad_right = False,
    left_pad_symbol = None,
    right_pad_symbol = None,
):
    sequence = iter(sequence)
    if pad_left:
        sequence = itertools.chain((left_pad_symbol,) * (n - 1), sequence)
    if pad_right:
        sequence = itertools.chain(sequence, (right_pad_symbol,) * (n - 1))
    return sequence

def ngrams(
    sequence,
    n: int,
    pad_left = False,
    pad_right = False,
    left_pad_symbol = None,
    right_pad_symbol = None,
):
    """Lazily yield the n-grams of ``sequence`` as tuples.

    The input is first passed through ``_pad_sequence`` with the same padding
    arguments. Nothing is yielded when the (padded) input holds fewer than
    ``n`` items.

    Args:
        sequence: Iterable of items.
        n: N-gram size.
        pad_left: Forwarded to ``_pad_sequence``.
        pad_right: Forwarded to ``_pad_sequence``.
        left_pad_symbol: Forwarded to ``_pad_sequence``.
        right_pad_symbol: Forwarded to ``_pad_sequence``.

    Yields:
        One tuple per sliding window over the (padded) input.
    """
    stream = _pad_sequence(
        sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol
    )

    # Prime the window with the first n - 1 items; stop quietly if the
    # stream runs dry before that.
    window = []
    for _ in range(n - 1):
        try:
            window.append(next(stream))
        except StopIteration:
            return

    # Slide: push the new item, emit a snapshot, drop the oldest item.
    for token in stream:
        window.append(token)
        yield tuple(window)
        window.pop(0)

In [104]:
def find_lapor_and_remove(article, summary):
    """Strip 'lapor ...' attributions from the summary that the article lacks.

    Each regex match of a word ending in 'lapor' followed by a space and a
    word is extended to the end of its sentence (the next '.', or the end of
    the summary) and cut at the first ',', ';', ':' or '-'. When the resulting
    phrase has fewer than 8 words and its words (with 'lapor ' removed,
    lowercased) do not occur as a contiguous word sequence in the lowercased,
    whitespace-split article, the phrase is removed from the summary via
    ``postprocessing_summarization``.

    Args:
        article: Source text the attribution is checked against.
        summary: Summary text to clean.

    Returns:
        The summary after ``postprocessing_summarization`` has removed the
        unsupported phrases.
    """
    article_words = article.lower().split()
    # n-gram size -> set of article n-grams, built once per size instead of
    # once per match.
    article_ngrams = {}
    unsupported = []

    # finditer gives the real position of every match; looking the matched
    # text up again with str.find would always land on its first occurrence.
    for match in re.finditer(r'\w*lapor \w*', summary):
        start = match.start()
        end = summary.find('.', start)
        if end == -1:
            # No closing full stop: take the rest of the summary rather than
            # letting -1 silently drop the last character.
            end = len(summary)

        # Keep only the text before the first clause delimiter.
        phrase = re.split(r'[,;:\-]', summary[start:end], maxsplit=1)[0]
        if len(phrase.split()) >= 8:
            continue

        # ``ngrams`` yields tuples, so the candidate must be a tuple too;
        # comparing a list against them never matches.
        words = tuple(phrase.replace('lapor ', '').lower().split())
        size = len(words)
        if size not in article_ngrams:
            article_ngrams[size] = set(ngrams(article_words, size))
        if words not in article_ngrams[size]:
            unsupported.append(phrase)

    return postprocessing_summarization(summary, unsupported)

In [106]:
from tqdm import tqdm

# Clean every pair in ``merged`` (element 0 is the article, element 1 the
# summary): filter sentences by overlap, strip the default reporting phrases,
# then strip 'lapor ...' phrases checked against the article.
results = []
for pair in tqdm(merged):
    article, summary = pair[0], pair[1]
    summary = filter_rouge(article, summary)
    summary = postprocessing_summarization(summary)
    summary = find_lapor_and_remove(article, summary)
    results.append([article, summary])

100%|██████████| 107472/107472 [03:50<00:00, 465.49it/s]


In [108]:
# Persist the cleaned [article, summary] pairs as a single JSON document.
output_path = 'filtered-100k-semisupervised-summary.json'
with open(output_path, 'w') as fopen:
    json.dump(results, fopen)