In [1]:
import os

for i in range(3):
    url = f'https://f000.backblazeb2.com/file/malay-dataset/summary/results-semi-{i}.json'
    os.system(f'wget {url}')

In [2]:
from glob import glob

files = glob('results-semi-*.json')
files

['results-semi-2.json', 'results-semi-0.json', 'results-semi-1.json']

In [3]:
import re

def cleaning(string):
    string = string.replace('\n', ' ')
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string

In [4]:
import json
import tensorflow as tf
import itertools

filename = 'summary.tsv'
with tf.io.gfile.GFile(filename, 'w') as outfile:
    for file in files:
        with open(file) as fopen:
            data = json.load(fopen)
        
        merged = list(itertools.chain(*data))
        
        print(file, len(merged))
        
        for i in range(len(merged)):
            l, r = cleaning(merged[i][0]), cleaning(merged[i][1])
            outfile.write('%s\t%s\n' % (l, r))

results-semi-2.json 35824
results-semi-0.json 35824
results-semi-1.json 35824


In [5]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin
import sentencepiece as spm
from glob import glob
import os

gin.parse_config_file('pretrained_models_base_operative_config.gin')
vocab = 'sp10m.cased.t5.model'
sp = spm.SentencePieceProcessor()
sp.Load(vocab)

True

In [6]:
def ringkasan_dataset(split, shuffle_files = False):
    del shuffle_files
    ds = tf.data.TextLineDataset(
        ['summary.tsv']
    )

    ds = ds.map(
        functools.partial(
            tf.io.decode_csv,
            record_defaults = ['', ''],
            field_delim = '\t',
            use_quote_delim = False,
        ),
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    ds = ds.map(lambda *ex: dict(zip(['question', 'answer'], ex)))
    return ds

def ringkasan_preprocessor(ds):
    def to_inputs_and_targets(ex):
        return {
            'inputs': tf.strings.join(['ringkasan: ', ex['question']]),
            'targets': ex['answer'],
        }

    return ds.map(
        to_inputs_and_targets,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )


t5.data.TaskRegistry.remove('ringkasan_dataset')
t5.data.TaskRegistry.add(
    'ringkasan_dataset',
    dataset_fn = ringkasan_dataset,
    splits = ['train'],
    text_preprocessor = [ringkasan_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

  "get_sentencepiece_model_path is deprecated. Please pass the mixture or "


In [7]:
from tqdm import tqdm

nq_task = t5.data.TaskRegistry.get("ringkasan_dataset")
ds = nq_task.get_dataset(split='qa.tsv', sequence_length={"inputs": 768, "targets": 768})

batch_size, index, part = 50000, 0, 0
fopen = open(f'ringkasan-{part}.parse', 'w')
for ex in tqdm(tfds.as_numpy(ds)):
    i = sp.DecodeIds(ex['inputs'].tolist())
    t = sp.DecodeIds(ex['targets'].tolist())
    text = f'{i} [[EENNDD]] {t}\n'
    fopen.write(text)
    
    if index == batch_size:
        fopen.close()
        part += 1
        index = 0
        fopen = open(f'ringkasan-{part}.parse', 'w')
    
    index += 1
    
fopen.close()

107472it [02:33, 700.92it/s]
