In [1]:
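# One-time download of the translated DailyMail and CNN training sets.
# Uncomment to fetch them, then make sure the paths in `files` below point
# to wherever the downloaded copies were saved.
# !wget https://f000.backblazeb2.com/file/malay-dataset/summarization/dailymail/translated-dailymail-train.json
# !wget https://f000.backblazeb2.com/file/malay-dataset/summarization/cnn-news/translated-cnn-train.json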

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin
import sentencepiece as spm
from glob import glob
import json
import os

# Path of the SentencePiece model; reused further down when the task is
# registered.
vocab = 'sp10m.cased.ms-en.model'

# Apply the gin bindings stored in the operative config file.
gin.parse_config_file('pretrained_models_base_operative_config.gin')

# Load the tokenizer. Load() stays the last expression of the cell so that
# its return value is what the notebook displays.
sp = spm.SentencePieceProcessor()
sp.Load(vocab)

True

In [3]:
from tqdm import tqdm
import re

def cleaning(string):
    """Flatten a text onto a single line separated by single spaces.

    Newlines and tabs are turned into spaces, every run of spaces is
    collapsed to one space, and leading/trailing whitespace is stripped.
    This keeps a text from introducing extra rows or columns when it is
    written as one field of a tab-separated line.
    """
    for separator in ('\n', '\t'):
        string = string.replace(separator, ' ')
    collapsed = re.sub(r'[ ]+', ' ', string)
    return collapsed.strip()

In [4]:
# Directory holding the translated training sets. Set the NEWS_DATA_DIR
# environment variable to point elsewhere; the default is the original
# location, so existing setups keep working.
DATA_DIR = os.environ.get('NEWS_DATA_DIR', '/home/husein/news')

# Order matters: index 0 is the DailyMail set, index 1 the CNN set.
files = [
    os.path.join(DATA_DIR, 'translated-dailymail-train.json'),
    os.path.join(DATA_DIR, 'translated-cnn-train.json'),
]

In [5]:
# Load the second entry of `files` (the CNN set) so its records can be
# inspected. The file is decoded as UTF-8 explicitly so the result does not
# depend on the platform's default locale encoding.
with open(files[1], encoding = 'utf-8') as fopen:
    data = json.load(fopen)

In [9]:
# Preview the abstract of the first loaded record: 'ms_abstract' holds a
# sequence of strings, which are joined here with single spaces.
' '.join(data[0]['ms_abstract'])

'Lebih 200 kilogram coklat dimakan di Cafe Fleuri\'s buffet Chocolate Bar setiap Sabtu. Coklat dipaparkan dalam setiap kursus makan malam dan sarapan pagi untuk pakej indulgensi coklat Three Way House Hotel. The Hotel Hershey of Hersheypark menjadi tuan rumah kepada pemburuan telur Easter tahunan yang dikatakan "legenda".'

In [10]:
# Build 'summarization.tsv' with one "<article>\t<abstract>" line per record
# of every file in `files`. Both fields go through cleaning(), which removes
# the tabs and newlines that would otherwise break the row/column layout.
with tf.io.gfile.GFile('summarization.tsv', "w") as outfile:
    for file in files:
        print(file)
        # Decode as UTF-8 explicitly instead of relying on the locale default.
        with open(file, encoding = 'utf-8') as fopen:
            data = json.load(fopen)
        # Iterate over the records directly; tqdm still reports the total
        # because the loaded list has a length.
        for row in tqdm(data):
            article = ' '.join(row['ms_article'])
            abstract = ' '.join(row['ms_abstract'])
            outfile.write("%s\t%s\n" % (cleaning(article), cleaning(abstract)))

/home/husein/news/translated-dailymail-train.json


100%|██████████| 209506/209506 [01:08<00:00, 3061.87it/s]


/home/husein/news/translated-cnn-train.json


100%|██████████| 90579/90579 [00:28<00:00, 3153.04it/s]


In [11]:
def summarization_dataset(split, shuffle_files = False):
    """Dataset function that reads 'summarization.tsv' into example dicts.

    Args:
        split: accepted for interface compatibility but never read; every
            split name yields the same data.
        shuffle_files: discarded immediately; no shuffling is done here.

    Returns:
        A tf.data.Dataset whose elements are dicts with the keys 'question'
        (first TSV column) and 'answer' (second TSV column).
    """
    del shuffle_files

    # Split each line on tabs into exactly two fields; quote characters are
    # treated as ordinary text and missing fields default to ''.
    parse_line = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def to_example(*fields):
        # Pair the parsed columns with their feature names.
        return dict(zip(['question', 'answer'], fields))

    lines = tf.data.TextLineDataset(['summarization.tsv'])
    columns = lines.map(
        parse_line,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return columns.map(to_example)

def summarization_preprocessor(ds):
    """Turn {'question', 'answer'} examples into {'inputs', 'targets'}.

    'inputs' is the 'question' text prefixed with 'ringkasan: ', and
    'targets' is the 'answer' text unchanged.

    Args:
        ds: dataset of dicts with 'question' and 'answer' entries.

    Returns:
        The mapped dataset of dicts with 'inputs' and 'targets' entries.
    """
    def to_inputs_and_targets(ex):
        prefixed = tf.strings.join(['ringkasan: ', ex['question']])
        return {'inputs': prefixed, 'targets': ex['answer']}

    autotune = tf.data.experimental.AUTOTUNE
    return ds.map(to_inputs_and_targets, num_parallel_calls = autotune)

In [12]:
# Drop any earlier registration under this name before adding it again, so
# the cell can be re-run in the same kernel (presumably add() rejects a name
# that is already registered - confirm against the installed t5 version).
t5.data.TaskRegistry.remove('summarization_dataset')
# Register the task: examples come from summarization_dataset, are rewritten
# by summarization_preprocessor, and are tokenized with the SentencePiece
# model at `vocab`. Only a 'train' split is declared.
# NOTE(review): the evaluation metric is accuracy; confirm this is the
# intended metric for generated summaries.
t5.data.TaskRegistry.add(
    'summarization_dataset',
    dataset_fn = summarization_dataset,
    splits = ['train'],
    text_preprocessor = [summarization_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)


In [13]:
# Fetch the registered task and build its tokenized dataset.
nq_task = t5.data.TaskRegistry.get("summarization_dataset")
# Ask for 'train', the only split the task was registered with. The earlier
# value ('summarization.tsv') is a file name, not a split; it only worked
# because the dataset function ignores its `split` argument.
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 1024, "targets": 1024})
# Expose the dataset as NumPy values for quick inspection.
r = tfds.as_numpy(ds)

In [14]:
# Show one preprocessed example. iter() is a no-op when `r` is already an
# iterator/generator, and makes the call work when tfds.as_numpy returns a
# plain iterable instead.
next(iter(r))

{'inputs_plaintext': b"ringkasan: Oleh . Bukit Suzannah. PENERBITAN: . 02:56 EST, 18 September 2013. |. DIKEMASKINI: . 04:19 EST, 18 September 2013. Penjara: Robert McLaughlin, 52, dipenjara 16 minggu selepas tersepit di belakang roda lima kali melebihi had dan berpakaian sebagai WPC. Seorang pemandu minuman yang ditemui tergelincir di belakang roda ketika berpakaian seragam pegawai polis wanita telah dipenjarakan. Robert McLaughlin, 52, was five times the drink-drive limit when he was discovered at the wheel of a car wearing a female police top, black trousers and boots, a court heard. Dia juga mempunyai alat dengar komunikasi, dengan wayar masuk ke kolarnya, dan topi keledar polis yang menjadi milik anak tirinya di tempat duduk penumpang. Peguamnya memberitahu mahkamah bahawa dia telah menjalin hubungan dengan pemilik pakaian seragam itu dan mereka akan 'sesekali berpakaian' dalam gearnya. Majistret Wakefield mendengar seorang penunggang basikal hampir dirobohkan oleh sebuah kereta y