In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5

In [2]:
# Path of the model file handed to every task registration in this notebook
# through the `sentencepiece_model_path` argument.
vocab = 'gs://mesolitica-general/t5-vocab/sp10m.cased.t5.model'

In [3]:
def dumping_dataset(split, shuffle_files=False):
    """Read the "dumping" TSV files as a dataset of title/text records.

    Every line of the six files is split on tabs into two string columns
    (quote characters get no special treatment) and emitted as
    ``{'title': ..., 'text': ...}``.

    Args:
        split: Not consulted; the same files are read for any value.
        shuffle_files: Not consulted; discarded on entry.

    Returns:
        A dataset of dicts with the keys ``'title'`` and ``'text'``.
    """
    del shuffle_files  # part of the signature only, never used

    tsv_paths = [
        'gs://mesolitica-general/t5-data/dumping-iium.tsv',
        'gs://mesolitica-general/t5-data/dumping-news.tsv',
        'gs://mesolitica-general/t5-data/dumping-parliament.tsv',
        'gs://mesolitica-general/t5-data/dumping-pdf.tsv',
        'gs://mesolitica-general/t5-data/dumping-watpadd.tsv',
        'gs://mesolitica-general/t5-data/dumping-wiki.tsv',
    ]
    column_names = ['title', 'text']

    def split_columns(line):
        # Two string columns per line; a missing value falls back to ''.
        return tf.io.decode_csv(
            line,
            record_defaults=['', ''],
            field_delim='\t',
            use_quote_delim=False,
        )

    def name_columns(*values):
        return dict(zip(column_names, values))

    rows = tf.data.TextLineDataset(tsv_paths).map(
        split_columns,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    return rows.map(name_columns)


# Remove any registration under this name first, then add the task again
# (presumably so the cell can be re-run in a live kernel).
t5.data.TaskRegistry.remove('dumping_dataset')
t5.data.TaskRegistry.add(
    'dumping_dataset',
    dataset_fn=dumping_dataset,
    splits=['train'],
    # 'targets' is taken from the 'text' column; 'inputs' has no source key.
    text_preprocessor=functools.partial(
        t5.data.preprocessors.rekey,
        key_map={'inputs': None, 'targets': 'text'},
    ),
    token_preprocessor=t5.data.preprocessors.unsupervised,
    sentencepiece_model_path=vocab,
    metric_fns=[],  # no metric functions are attached to this task
)

In [4]:
# Fetch the registered task and build its 'train' dataset.
nq_task = t5.data.TaskRegistry.get('dumping_dataset')
ds = nq_task.get_dataset(
    split='train',
    sequence_length={'inputs': 256, 'targets': 32},
)

# Print the first five examples as NumPy values for a quick visual check.
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

{'inputs_plaintext': b'', 'inputs': array([1]), 'targets_plaintext': b'Ya bangsa itu lagi.', 'targets': array([1958, 1057,   24,   95,    3,    1])}
{'inputs_plaintext': b'', 'inputs': array([1]), 'targets_plaintext': b'Kalau aku balik kerja lambat dek kerana timbunan kerja, dia sanggup tunggu aku dan balik kerja sekali.', 'targets': array([ 2002,  3359,  1069,   290,  9313, 10523,    69, 26169,   290,
          14,    18,    13,  3709,  6839,  3359,    16,  1069,   290,
         383,     3,     1])}
{'inputs_plaintext': b'', 'inputs': array([1]), 'targets_plaintext': b'Konflik bermula bila mak mula terasa bahang sebab aku da mula tak bantu dalam belanja rumahtangga.', 'targets': array([12510,   348,  1889,  8468,   407, 13543,   547,   562,   873,
        3359,  3476,   407,   379,  4242,    19,  5950, 18575,     3,
           1])}
{'inputs_plaintext': b'', 'inputs': array([1]), 'targets_plaintext': b'Bila dia ada, ada sahaja yang tak kena sebab saya rasakan dia tak suka saya dan sala

In [5]:
def question_dataset(split, shuffle_files=False):
    """Read the QA TSV files as a dataset of question/answer records.

    Each line is split on tabs into two string columns (quotes are not
    treated specially) and emitted as ``{'question': ..., 'answer': ...}``.

    Args:
        split: Not consulted; both files are read for any value.
        shuffle_files: Not consulted; discarded on entry.

    Returns:
        A dataset of dicts with the keys ``'question'`` and ``'answer'``.
    """
    del shuffle_files  # part of the signature only, never used

    parse_tsv_line = functools.partial(
        tf.io.decode_csv,
        record_defaults=['', ''],
        field_delim='\t',
        use_quote_delim=False,
    )

    def to_record(*fields):
        return dict(zip(['question', 'answer'], fields))

    # NOTE(review): the validation file is read together with the train file
    # whatever `split` is -- confirm this is intended.
    source_files = [
        'gs://mesolitica-general/t5-data/qa-train.tsv',
        'gs://mesolitica-general/t5-data/qa-validation.tsv',
    ]
    parsed = tf.data.TextLineDataset(source_files).map(
        parse_tsv_line,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    return parsed.map(to_record)


def question_preprocessor(ds):
    """Turn question/answer records into inputs/targets records.

    ``'inputs'`` is the question with ``'soalan: '`` prepended and
    ``'targets'`` is the answer, unchanged.

    Args:
        ds: Dataset of dicts holding ``'question'`` and ``'answer'``.

    Returns:
        The mapped dataset of ``{'inputs': ..., 'targets': ...}`` dicts.
    """
    def add_prefix(example):
        prompt = tf.strings.join(['soalan: ', example['question']])
        return {'inputs': prompt, 'targets': example['answer']}

    autotune = tf.data.experimental.AUTOTUNE
    return ds.map(add_prefix, num_parallel_calls=autotune)


# Remove any registration under this name first, then add the task again
# (presumably so the cell can be re-run in a live kernel).
t5.data.TaskRegistry.remove('question_dataset')
t5.data.TaskRegistry.add(
    'question_dataset',
    dataset_fn=question_dataset,
    splits=['train'],
    text_preprocessor=[question_preprocessor],
    sentencepiece_model_path=vocab,
    postprocess_fn=t5.data.postprocessors.lower_text,
    metric_fns=[t5.evaluation.metrics.accuracy],
)

In [6]:
# Fetch the registered task and build its 'train' dataset.
nq_task = t5.data.TaskRegistry.get('question_dataset')
ds = nq_task.get_dataset(
    split='train',
    sequence_length={'inputs': 256, 'targets': 32},
)

# Print the first five examples as NumPy values for a quick visual check.
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

{'inputs_plaintext': b'soalan: siapa yang menerjemahkan Alkitab King James ke Bahasa Inggeris?', 'inputs': array([ 2105,    50,  2746,    17,  5220,  2406,  2675,    54, 19079,
        3400,  3574,    30,   741,   719,   275,     1]), 'targets_plaintext': b'47 cendekiawan, yang semuanya adalah anggota Gereja England', 'targets': array([ 3658, 13521,    14,    17,  3434,    35,   381,  4874,  2292,
           1])}
{'inputs_plaintext': b'soalan: siapakah gabenor puerto rico sekarang?', 'inputs': array([ 2105,    50, 16118, 10511,  6697,   254,   584,  9667,  1618,
         769,   275,     1]), 'targets_plaintext': b'Alejandro Javier Garc\xc3\xada Padilla', 'targets': array([  209,  7029,    13, 16069, 31915,  4565,   823,     2,    13,
        9314,  6471,     1])}
{'inputs_plaintext': b'soalan: di mana pertempuran pertama revolusi texas berlaku?', 'inputs': array([2105,   50,   18,  119, 2815,  131, 5932, 2478, 1049,   13,   25,
        216,  275,    1]), 'targets_plaintext': b'Gonzales

In [17]:
def pair_dataset(split, shuffle_files=False):
    """Read the "-pair" TSV files as a dataset of single-key text records.

    Each line is split on tabs into two string columns (quotes are not
    treated specially), but only the first column is kept, under the key
    ``'text'``: ``zip`` stops after the one available key name.

    Args:
        split: Not consulted; the same files are read for any value.
        shuffle_files: Not consulted; discarded on entry.

    Returns:
        A dataset of ``{'text': ...}`` dicts.
    """
    del shuffle_files  # part of the signature only, never used

    tsv_paths = [
        'gs://mesolitica-general/t5-data/dumping-iium-pair.tsv',
        'gs://mesolitica-general/t5-data/dumping-news-pair.tsv',
        'gs://mesolitica-general/t5-data/dumping-parliament-pair.tsv',
        'gs://mesolitica-general/t5-data/dumping-pdf-pair.tsv',
        'gs://mesolitica-general/t5-data/dumping-watpadd-pair.tsv',
        'gs://mesolitica-general/t5-data/dumping-wiki-pair.tsv',
    ]

    def split_columns(line):
        return tf.io.decode_csv(
            line,
            record_defaults=['', ''],
            field_delim='\t',
            use_quote_delim=False,
        )

    def keep_text(*values):
        # NOTE(review): the second parsed column is dropped here -- confirm
        # that is the intent.
        return dict(zip(['text'], values))

    rows = tf.data.TextLineDataset(tsv_paths).map(
        split_columns,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    return rows.map(keep_text)


# Remove any registration under this name first, then add the task again
# (presumably so the cell can be re-run in a live kernel).
t5.data.TaskRegistry.remove('pair_dataset')
t5.data.TaskRegistry.add(
    'pair_dataset',
    dataset_fn=pair_dataset,
    splits=['train'],
    # Same module the notebook imports under the alias `prep`.
    text_preprocessor=[t5.data.preprocessors.next_sentence_prediction],
    sentencepiece_model_path=vocab,
    postprocess_fn=t5.data.postprocessors.lower_text,
    metric_fns=[t5.evaluation.metrics.accuracy],
)

In [18]:
# Fetch the registered task and build its 'train' dataset.
nq_task = t5.data.TaskRegistry.get('pair_dataset')
ds = nq_task.get_dataset(
    split='train',
    sequence_length={'inputs': 256, 'targets': 32},
)

# Print the first five examples as NumPy values for a quick visual check.
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

      lambda x: x[text_key], num_parallel_calls=tf.data.experimental.AUTOTUNE)

If this is a lambda function, the error may be avoided by creating the lambda in a standalone statement.
      lambda x: x[text_key], num_parallel_calls=tf.data.experimental.AUTOTUNE)

If this is a lambda function, the error may be avoided by creating the lambda in a standalone statement.
{'inputs_plaintext': b'nsp: Sangat menyedihkan bila saya dah siap masak berjenis2 masakan, nasi pulak masak lebih banyak dari biasa.. Tapi progress sekarang saya still seorang student..', 'inputs': array([ 2532,  7175,    50, 17205, 12863,  1889,   113,  4362,  1437,
        5008,   181,  8129,   143,  3909,    14,  3705,   218,   139,
        5008,    51,   127,    37,   634,     3,     3,  2987, 17583,
         769,   113,  5205,   105, 11559,     3,     3,     1]), 'targets_plaintext': b'not_next', 'targets': array([ 699, 3835,  144, 2990,   81,    1])}
{'inputs_plaintext': b'nsp: Aku present pakai Power Point semua buk

In [31]:
def news_dataset(split, shuffle_files=False):
    """Read the news-title TSV files as a dataset of two-column records.

    Each line is split on tabs into two string columns (quotes are not
    treated specially) and emitted as ``{'question': ..., 'answer': ...}``.

    Args:
        split: Not consulted; both files are read for any value.
        shuffle_files: Not consulted; discarded on entry.

    Returns:
        A dataset of dicts with the keys ``'question'`` and ``'answer'``.
    """
    del shuffle_files  # part of the signature only, never used

    parse_tsv_line = functools.partial(
        tf.io.decode_csv,
        record_defaults=['', ''],
        field_delim='\t',
        use_quote_delim=False,
    )

    def to_record(*fields):
        return dict(zip(['question', 'answer'], fields))

    source_files = [
        'gs://mesolitica-general/t5-data/news-title.tsv',
        'gs://mesolitica-general/t5-data/news-title2.tsv',
    ]
    parsed = tf.data.TextLineDataset(source_files).map(
        parse_tsv_line,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    return parsed.map(to_record)


def news_preprocessor(ds):
    """Turn question/answer records into inputs/targets records.

    ``'inputs'`` is the ``'question'`` value with ``'tajuk: '`` prepended
    and ``'targets'`` is the ``'answer'`` value, unchanged.

    Args:
        ds: Dataset of dicts holding ``'question'`` and ``'answer'``.

    Returns:
        The mapped dataset of ``{'inputs': ..., 'targets': ...}`` dicts.
    """
    def add_prefix(example):
        prompt = tf.strings.join(['tajuk: ', example['question']])
        return {'inputs': prompt, 'targets': example['answer']}

    autotune = tf.data.experimental.AUTOTUNE
    return ds.map(add_prefix, num_parallel_calls=autotune)


# Remove any registration under this name first, then add the task again
# (presumably so the cell can be re-run in a live kernel).
t5.data.TaskRegistry.remove('news_dataset')
t5.data.TaskRegistry.add(
    'news_dataset',
    dataset_fn=news_dataset,
    splits=['train'],
    text_preprocessor=[news_preprocessor],
    sentencepiece_model_path=vocab,
    postprocess_fn=t5.data.postprocessors.lower_text,
    metric_fns=[t5.evaluation.metrics.accuracy],
)

In [32]:
# Fetch the registered task and build its 'train' dataset; this task uses
# 1024-token limits on both sides.
nq_task = t5.data.TaskRegistry.get('news_dataset')
ds = nq_task.get_dataset(
    split='train',
    sequence_length={'inputs': 1024, 'targets': 1024},
)

# Print the first five examples as NumPy values for a quick visual check.
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

{'inputs_plaintext': b'tajuk: BATU PAHAT: Dewan Rakyat sehingga kini belum menerima notis atau surat daripada Ahli Parlimen yang mengumumkan untuk menjadi bebas atau lompat parti. Yang Dipertuanya, Datuk Mohamad Ariff Md Yusof, berkata beliau hanya tahu ada beberapa Ahli Parlimen yang mengisytiharkan sebagai Ahli Parlimen Bebas dan keluar parti tertentu hanya menerusi media. \xe2\x80\x9cSetakat ini, saya tiada senarai Ahli Parlimen yang isytihar bebas mahu pun melompat ke parti lain kerana belum menerima surat, notis atau dokumen bertulis daripada mereka seperti dipaparkan media. \xe2\x80\x9cParlimen atau saya hanya dapat mengesahkan perkara ini apabila kami mendapat dokumen bertulis daripada mereka (Ahli Parlimen keluar parti). \xe2\x80\x9cMungkin masih awal kerana sesi Dewan Rakyat tahun depan pun hanya akan bermula pada Mac depan. Sekiranya benar, Setiausaha Parlimen akan menentukan semula tempat duduk mereka (Ahli Parlimen keluar parti) di dalam dewan \xe2\x80\x9d katanya. Beliau b

In [24]:
def stemming_dataset(split, shuffle_files=False):
    """Read the stemming TSV file as a dataset of two-column records.

    Each line is split on tabs into two string columns (quotes are not
    treated specially) and emitted as ``{'question': ..., 'answer': ...}``.

    Args:
        split: Not consulted; the same file is read for any value.
        shuffle_files: Not consulted; discarded on entry.

    Returns:
        A dataset of dicts with the keys ``'question'`` and ``'answer'``.
    """
    del shuffle_files  # part of the signature only, never used

    parse_tsv_line = functools.partial(
        tf.io.decode_csv,
        record_defaults=['', ''],
        field_delim='\t',
        use_quote_delim=False,
    )

    def to_record(*fields):
        return dict(zip(['question', 'answer'], fields))

    source_files = ['gs://mesolitica-general/t5-data/stemming.tsv']
    parsed = tf.data.TextLineDataset(source_files).map(
        parse_tsv_line,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    return parsed.map(to_record)


def stemming_preprocessor(ds):
    """Turn question/answer records into inputs/targets records.

    ``'inputs'`` is the ``'question'`` value with ``'punca: '`` prepended
    and ``'targets'`` is the ``'answer'`` value, unchanged.

    Args:
        ds: Dataset of dicts holding ``'question'`` and ``'answer'``.

    Returns:
        The mapped dataset of ``{'inputs': ..., 'targets': ...}`` dicts.
    """
    def add_prefix(example):
        prompt = tf.strings.join(['punca: ', example['question']])
        return {'inputs': prompt, 'targets': example['answer']}

    autotune = tf.data.experimental.AUTOTUNE
    return ds.map(add_prefix, num_parallel_calls=autotune)


# Remove any registration under this name first, then add the task again
# (presumably so the cell can be re-run in a live kernel).
t5.data.TaskRegistry.remove('stemming_dataset')
t5.data.TaskRegistry.add(
    'stemming_dataset',
    dataset_fn=stemming_dataset,
    splits=['train'],
    text_preprocessor=[stemming_preprocessor],
    sentencepiece_model_path=vocab,
    postprocess_fn=t5.data.postprocessors.lower_text,
    metric_fns=[t5.evaluation.metrics.accuracy],
)

In [25]:
# Fetch the registered task and build its 'train' dataset.
nq_task = t5.data.TaskRegistry.get('stemming_dataset')
ds = nq_task.get_dataset(
    split='train',
    sequence_length={'inputs': 256, 'targets': 32},
)

# Print the first five examples as NumPy values for a quick visual check.
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

{'inputs_plaintext': b'punca: Sekolah Bangsa Lepar Sekolah Bangsa Lepar atau nama ringkas " SK Lepar " , rupa buah Sekolah Bangsa yang letak di Rpsb Batu angan Lepar Pan .', 'inputs': array([1761,   50,  133, 2337,  917, 2453,  133, 2337,  917, 2453,   46,
        195, 4140,   15,    6, 3758,  917, 2453,   15,    6,   27, 6523,
        206,  133, 2337,   17, 4201,   18, 1994,   25,  443, 1028,   15,
       3384,  917, 2453, 2037,   15,    3,    1]), 'targets_plaintext': b'Sekolah Kebangsaan Lepar Sekolah Kebangsaan Lepar atau nama ringkasnya "SK Lepar", merupakan sebuah Sekolah Kebangsaan yang terletak di Rpsb Batu 8 Lepar Pekan.', 'targets': array([ 133,  191,  917, 2453,  133,  191,  917, 2453,   46,  195, 1183,
         15,    6, 1760,  917, 2453,    6,   14,   34,   39,  133,  191,
         17,   49,   18, 1994,   25,  443, 1028,  410,  917,    1])}
{'inputs_plaintext': b'punca: " Jump Next " angan', 'inputs': array([ 1761,    50,    15,     6, 18374, 21637,    15,     6,    15,
  

In [33]:
def synonym_dataset(split, shuffle_files=False):
    """Read the synonyms TSV file as a dataset of two-column records.

    Each line is split on tabs into two string columns (quotes are not
    treated specially) and emitted as ``{'question': ..., 'answer': ...}``.

    Args:
        split: Not consulted; the same file is read for any value.
        shuffle_files: Not consulted; discarded on entry.

    Returns:
        A dataset of dicts with the keys ``'question'`` and ``'answer'``.
    """
    del shuffle_files  # part of the signature only, never used

    parse_tsv_line = functools.partial(
        tf.io.decode_csv,
        record_defaults=['', ''],
        field_delim='\t',
        use_quote_delim=False,
    )

    def to_record(*fields):
        return dict(zip(['question', 'answer'], fields))

    source_files = ['gs://mesolitica-general/t5-data/synonyms.tsv']
    parsed = tf.data.TextLineDataset(source_files).map(
        parse_tsv_line,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    return parsed.map(to_record)


def synonym_preprocessor(ds):
    """Turn question/answer records into inputs/targets records.

    ``'inputs'`` is the ``'question'`` value with ``'sinonim: '`` prepended
    and ``'targets'`` is the ``'answer'`` value, unchanged.

    Args:
        ds: Dataset of dicts holding ``'question'`` and ``'answer'``.

    Returns:
        The mapped dataset of ``{'inputs': ..., 'targets': ...}`` dicts.
    """
    def add_prefix(example):
        prompt = tf.strings.join(['sinonim: ', example['question']])
        return {'inputs': prompt, 'targets': example['answer']}

    autotune = tf.data.experimental.AUTOTUNE
    return ds.map(add_prefix, num_parallel_calls=autotune)


# Remove any registration under this name first, then add the task again
# (presumably so the cell can be re-run in a live kernel).
t5.data.TaskRegistry.remove('synonym_dataset')
t5.data.TaskRegistry.add(
    'synonym_dataset',
    dataset_fn=synonym_dataset,
    splits=['train'],
    text_preprocessor=[synonym_preprocessor],
    sentencepiece_model_path=vocab,
    postprocess_fn=t5.data.postprocessors.lower_text,
    metric_fns=[t5.evaluation.metrics.accuracy],
)

In [34]:
# Fetch the registered task and build its 'train' dataset.
nq_task = t5.data.TaskRegistry.get('synonym_dataset')
ds = nq_task.get_dataset(
    split='train',
    sequence_length={'inputs': 256, 'targets': 32},
)

# Print the first five examples as NumPy values for a quick visual check.
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

{'inputs_plaintext': b'sinonim: ketua gajah zat ehwal zon penubuhannya pedesaan pertumbuhan andai timbalan Ketua Menteri l bahagian ehwal agama Islam kemajuan tugas tugas perhubungan biarawati timbalan pemimpin uskup ll skim ekonomi negara pengetahuan penubuhan harta insan sains teknologi kemerosotan gelanggang kakak keluarganya keterangkuman gender sentimen selain Islam kerajaan isi perumahan pelan benteng desa semangat kediaman penyanyang ranah lebih kurang rupa perkebunan kedai asas tani yayasan keluar benteng perburuhan raya utiliti tebatan kelimpahan firma dalam pentadbiran internasional acara ehwal pengguna pembinaannya keusahawanan syarikat kawalan pusaka budaya kekacakan pemula dan Sukan unit ini diletakan di kaki persediaan kehormatan setiausaha komander keadaannya pulau pinang dan berkhidmat untuk merekodkan dan mereka segala berjalan yang memberitahu dengan keributan konvensyen kerajaan negara', 'inputs': array([12069,    50,  1103,  5304, 11426,  5866,  3578,  7770, 30855,


In [35]:
def quora_dataset(split, shuffle_files=False):
    """Read the Quora TSV shards as a dataset of two-column records.

    Each line is split on tabs into two string columns (quotes are not
    treated specially) and emitted as ``{'question': ..., 'answer': ...}``.

    Args:
        split: Not consulted; the same files are read for any value.
        shuffle_files: Not consulted; discarded on entry.

    Returns:
        A dataset of dicts with the keys ``'question'`` and ``'answer'``.
    """
    del shuffle_files  # part of the signature only, never used

    # NOTE(review): no shard for the 300k-400k range is listed -- confirm
    # whether that omission is intentional.
    tsv_paths = [
        'gs://mesolitica-general/t5-data/quora-0-100k.tsv',
        'gs://mesolitica-general/t5-data/quora-100k-200k.tsv',
        'gs://mesolitica-general/t5-data/quora-200k-300k.tsv',
        'gs://mesolitica-general/t5-data/quora-400k-500k.tsv',
    ]
    column_names = ['question', 'answer']

    def split_columns(line):
        return tf.io.decode_csv(
            line,
            record_defaults=['', ''],
            field_delim='\t',
            use_quote_delim=False,
        )

    def name_columns(*values):
        return dict(zip(column_names, values))

    rows = tf.data.TextLineDataset(tsv_paths).map(
        split_columns,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    return rows.map(name_columns)


def quora_preprocessor(ds):
    """Rename ``'question'``/``'answer'`` to ``'inputs'``/``'targets'``.

    The values are passed through untouched; no prefix is added.

    Args:
        ds: Dataset of dicts holding ``'question'`` and ``'answer'``.

    Returns:
        The mapped dataset of ``{'inputs': ..., 'targets': ...}`` dicts.
    """
    def rename_keys(example):
        return dict(inputs=example['question'], targets=example['answer'])

    autotune = tf.data.experimental.AUTOTUNE
    return ds.map(rename_keys, num_parallel_calls=autotune)


# Remove any registration under this name first, then add the task again
# (presumably so the cell can be re-run in a live kernel).
t5.data.TaskRegistry.remove('quora_dataset')
t5.data.TaskRegistry.add(
    'quora_dataset',
    dataset_fn=quora_dataset,
    splits=['train'],
    text_preprocessor=[quora_preprocessor],
    sentencepiece_model_path=vocab,
    postprocess_fn=t5.data.postprocessors.lower_text,
    metric_fns=[t5.evaluation.metrics.accuracy],
)

In [36]:
# Fetch the registered task and build its 'train' dataset.
nq_task = t5.data.TaskRegistry.get('quora_dataset')
ds = nq_task.get_dataset(
    split='train',
    sequence_length={'inputs': 256, 'targets': 32},
)

# Print the first five examples as NumPy values for a quick visual check.
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

{'inputs_plaintext': b'soalan1: Apa yang sepatutnya menonton rancangan TV sebelum anda mati? soalan2: Adakah ada yang mesti menonton rancangan TV?', 'inputs': array([2105,  112,   50, 1819,   17, 1593, 6675, 1269, 1414,  137,   63,
         13, 1218,  275, 2105,  143,   50, 3450,   64,   17, 1732, 6675,
       1269, 1414,  275,    1]), 'targets_plaintext': b'sama', 'targets': array([96,  1])}
{'inputs_plaintext': b'soalan1: Bagaimanakah kebanyakan orang mati? soalan2: Bagaimana orang mati?', 'inputs': array([ 2105,   112,    50, 24282,   989,    53,  1218,   275,  2105,
         143,    50,  5268,    53,  1218,   275,     1]), 'targets_plaintext': b'tak sama', 'targets': array([379,  96,   1])}
{'inputs_plaintext': b'soalan1: Apa maksudnya jika anjing muntah buih putih? soalan2: Mengapa puppy saya muntah busa putih?', 'inputs': array([ 2105,   112,    50,  1819, 18172,   268,  1884, 18391,  5687,
        2943,  1658,   275,  2105,   143,    50,  9675,  6697,  4739,
         157,   113,

In [41]:
def snli_dataset(split, shuffle_files=False):
    """Read the SNLI TSV shards as a dataset of two-column records.

    Each line is split on tabs into two string columns (quotes are not
    treated specially) and emitted as ``{'question': ..., 'answer': ...}``.

    Args:
        split: Not consulted; the same files are read for any value.
        shuffle_files: Not consulted; discarded on entry.

    Returns:
        A dataset of dicts with the keys ``'question'`` and ``'answer'``.
    """
    del shuffle_files  # part of the signature only, never used

    # NOTE(review): the last name is 'snli-pary7.tsv', unlike the
    # 'snli-partN.tsv' pattern of the other six -- verify the object name in
    # the bucket before changing it.
    tsv_paths = [
        'gs://mesolitica-general/t5-data/snli-part1.tsv',
        'gs://mesolitica-general/t5-data/snli-part2.tsv',
        'gs://mesolitica-general/t5-data/snli-part3.tsv',
        'gs://mesolitica-general/t5-data/snli-part4.tsv',
        'gs://mesolitica-general/t5-data/snli-part5.tsv',
        'gs://mesolitica-general/t5-data/snli-part6.tsv',
        'gs://mesolitica-general/t5-data/snli-pary7.tsv',
    ]
    column_names = ['question', 'answer']

    def split_columns(line):
        return tf.io.decode_csv(
            line,
            record_defaults=['', ''],
            field_delim='\t',
            use_quote_delim=False,
        )

    def name_columns(*values):
        return dict(zip(column_names, values))

    rows = tf.data.TextLineDataset(tsv_paths).map(
        split_columns,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    return rows.map(name_columns)


def snli_preprocessor(ds):
    """Rename ``'question'``/``'answer'`` to ``'inputs'``/``'targets'``.

    The values are passed through untouched; no prefix is added.

    Args:
        ds: Dataset of dicts holding ``'question'`` and ``'answer'``.

    Returns:
        The mapped dataset of ``{'inputs': ..., 'targets': ...}`` dicts.
    """
    def rename_keys(example):
        return dict(inputs=example['question'], targets=example['answer'])

    autotune = tf.data.experimental.AUTOTUNE
    return ds.map(rename_keys, num_parallel_calls=autotune)


# Remove any registration under this name first, then add the task again
# (presumably so the cell can be re-run in a live kernel).
t5.data.TaskRegistry.remove('snli_dataset')
t5.data.TaskRegistry.add(
    'snli_dataset',
    dataset_fn=snli_dataset,
    splits=['train'],
    text_preprocessor=[snli_preprocessor],
    sentencepiece_model_path=vocab,
    postprocess_fn=t5.data.postprocessors.lower_text,
    metric_fns=[t5.evaluation.metrics.accuracy],
)

In [42]:
# Fetch the registered task and build its 'train' dataset.
nq_task = t5.data.TaskRegistry.get('snli_dataset')
ds = nq_task.get_dataset(
    split='train',
    sequence_length={'inputs': 256, 'targets': 32},
)

# Print the first five examples as NumPy values for a quick visual check.
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

{'inputs_plaintext': b'ayat1: Orang berada di pentas panggung. ayat2: orang berada di atas pentas', 'inputs': array([1774,  112,   50,  751,  351,   18, 3555, 7934,    3, 1774,  143,
         50,   53,  351,   18,  107, 3555,    1]), 'targets_plaintext': b'berkait', 'targets': array([3754,    1])}
{'inputs_plaintext': b'ayat1: Seorang budak kecil telah masuk ke kabinet dan mendapat tepung dan crisco seluruh dirinya. ayat2: Anak lelaki itu sedang kacau.', 'inputs': array([ 1774,   112,    50,  1952,  7719,   345,    33,   612,    30,
        4945,    16,   231,  7640,    16,    15, 10092,  6603,   291,
        1476,     3,  1774,   143,    50,  2026,   215,    24,   352,
       19112,     3,     1]), 'targets_plaintext': b'berkait', 'targets': array([3754,    1])}
{'inputs_plaintext': b'ayat1: Seorang gadis kecil mengikuti dua lelaki dengan payung ke bawah jalan. ayat2: Gadis kecil sedang menunggang kuda.', 'inputs': array([ 1774,   112,    50,  1952,  3903,   345,  1841,   102,   215,
