In [2]:
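# One-time setup: uncomment to download and unpack the ms-en training archive
# (the cells below read train/left.txt and train/right.txt).
# !wget https://f000.backblazeb2.com/file/malay-dataset/train-ms-en.tar.gz
# !tar -zxf train-ms-en.tar.gz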

In [4]:
# Read both text files fully into memory, one list entry per line.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale.
# split('\n') is kept on purpose: str.splitlines() also breaks on other
# separators (form feed, U+2028, ...), which could desynchronise the two files.
with open('train/left.txt', encoding='utf-8') as fopen:
    left = fopen.read().split('\n')

with open('train/right.txt', encoding='utf-8') as fopen:
    right = fopen.read().split('\n')

In [5]:
# The files are consumed pairwise by index further down, so fail right here
# if they do not have the same number of lines.
assert len(left) == len(right), (
    'left/right line counts differ: %d vs %d' % (len(left), len(right))
)
len(left), len(right)

(3712555, 3712555)

In [6]:
# Keep only the first 500,000 lines of each side for the rest of the notebook.
left, right = left[:500000], right[:500000]

In [7]:
from tqdm import tqdm
import re

def cleaning(string):
    """Flatten `string` onto one line with single-space separation.

    Tabs, newlines and carriage returns are treated like spaces; every run of
    them collapses to a single space, and leading/trailing whitespace is
    stripped. Carriage returns are included so that text carrying CRLF line
    endings does not leave a stray control character in the result.

    Args:
        string: the text to normalise.

    Returns:
        The normalised text, or '' if the input holds only whitespace.
    """
    return re.sub(r'[ \t\r\n]+', ' ', string).strip()

In [8]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin
import sentencepiece as spm
from glob import glob
import os

# Path of the SentencePiece model; also passed to the T5 task registration.
vocab = 'sp10m.cased.ms-en.model'

# Apply the gin bindings from the operative config file to the global gin state.
gin.parse_config_file('pretrained_models_base_operative_config.gin')

# Load the SentencePiece model; Load() is left as the final expression so the
# cell still displays its return value.
sp = spm.SentencePieceProcessor()
sp.Load(vocab)

True

In [9]:
# Write the cleaned pairs as a two-column TSV: <left>\t<right>, one pair per line.
# cleaning() removes tabs and newlines, so a pair can never spill over into
# extra columns or rows.
if len(left) != len(right):
    # Checked before the file is opened: zip() below would silently drop the
    # surplus lines, and a half-written ms-en.tsv is worse than none.
    raise ValueError(
        'left and right must have the same number of lines, got %d and %d'
        % (len(left), len(right))
    )

with tf.io.gfile.GFile('ms-en.tsv', "w") as outfile:
    for source, target in zip(left, right):
        outfile.write("%s\t%s\n" % (cleaning(source), cleaning(target)))

In [10]:
def ms_en_dataset(split, shuffle_files = False):
    """Build a dataset of question/answer string pairs read from 'ms-en.tsv'.

    Args:
        split: accepted for the caller's benefit but not used; the same file
            is read whatever value is passed.
        shuffle_files: accepted but discarded.

    Returns:
        A tf.data.Dataset whose elements are dicts with the keys 'question'
        (first TSV column) and 'answer' (second TSV column).
    """
    del shuffle_files

    def parse_row(line):
        # Two string columns, tab separated; quote characters are plain text.
        return tf.io.decode_csv(
            line,
            record_defaults = ['', ''],
            field_delim = '\t',
            use_quote_delim = False,
        )

    def as_example(*columns):
        return dict(zip(['question', 'answer'], columns))

    rows = tf.data.TextLineDataset(['ms-en.tsv'])
    parsed = rows.map(
        parse_row,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return parsed.map(as_example)

def ms_en_preprocessor(ds):
    """Map question/answer examples to the 'inputs'/'targets' text format.

    Args:
        ds: dataset whose elements are dicts with 'question' and 'answer'
            string entries.

    Returns:
        A dataset of dicts where 'inputs' is the question prefixed with the
        task prompt and 'targets' is the answer, unchanged.
    """
    def add_prompt(example):
        prompted = tf.strings.join(
            ['terjemah Melayu ke Inggeris: ', example['question']]
        )
        return {'inputs': prompted, 'targets': example['answer']}

    return ds.map(
        add_prompt,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )

In [12]:
# Register the translation task with T5. Any existing entry under the same
# name is removed first -- presumably so this cell can be re-run in a live
# kernel without colliding with an earlier registration.
task_name = 'ms_en_dataset'
t5.data.TaskRegistry.remove(task_name)
t5.data.TaskRegistry.add(
    task_name,
    dataset_fn = ms_en_dataset,
    splits = ['train'],
    text_preprocessor = [ms_en_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)


In [13]:
# Smoke test: pull the registered task back out of the registry and build its
# dataset so a tokenised example can be inspected.
nq_task = t5.data.TaskRegistry.get("ms_en_dataset")
# Request the split the task was registered with. The previous value was the
# TSV file name, which only worked because ms_en_dataset ignores `split`.
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 1024, "targets": 1024})
r = tfds.as_numpy(ds)

In [14]:
# Show one example. iter() is a no-op on a generator and makes this work as
# well when `r` is a plain iterable rather than an iterator.
next(iter(r))

{'inputs_plaintext': b'terjemah Melayu ke Inggeris: "Seksyen ini masing-masing menetapkan seseorang mesti mengucapkan dua kalimah syahadah dalam bahasa Arab secara jelas dan dengan kerelaan sendiri, manakala kanak-kanak yang belum mencapai umur 18 tahun boleh memeluk Islam jika mendapat kebenaran bertulis daripada ibu bapa atau penjaga.',
 'inputs': array([   13, 26087,  1550,    55,  2040,    31,    13,     6,    75,
         7918,  9132,    34,  2713,     7,  1915,  3635,  1090,  1286,
         5505,   192,   349,  4357,    13,    16,  8935, 20639,    36,
         1246,  1704,   156,   651,    22,    28,    55,  5701,  6454,
          280,    14,  3435,  1486,     7,   817,    17,   742,  1155,
         5741,   375,    53,   150, 11860,   558,   273,   310,  4866,
        13621,   109,   604,  1593,    87,  7843,     3,     1]),
 'targets_plaintext': b'"This section specifies that a person must pronounce two words of shahad in Arabic clearly and at his own will, while children under 

In [None]:
# Shell escape: recursively delete the local `train` directory that
# left.txt/right.txt were read from (presumably to reclaim disk space now that
# ms-en.tsv has been written). Re-running the loading cell afterwards requires
# the directory to be restored first.
!rm -rf train