In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5

In [2]:
import gin

# Parse the operative gin config stored on GCS before any task is built.
# NOTE(review): presumably this supplies the preprocessing/model bindings used
# by the t5 tasks registered below — confirm against the .gin file.
gin.parse_config_file('gs://mesolitica-tpu-general/t5-data/pretrained_models_base_operative_config.gin')

In [3]:
# GCS path handed to every task registration in this notebook as
# `sentencepiece_model_path`.
vocab = 'gs://mesolitica-tpu-general/t5-data/sp10m.cased.t5.model'

In [4]:
def dumping_dataset(split, shuffle_files = False):
    """Load the raw-text corpus TSV files as ``{'title', 'text'}`` records.

    Each line of the listed files is split on tabs into two string fields
    (double quotes are not treated as quoting characters), and the two
    fields are keyed ``title`` and ``text`` in that order.

    Args:
        split: Part of the dataset_fn signature; not referenced in the body,
            so the same files are read whatever value is passed.
        shuffle_files: Part of the dataset_fn signature; discarded.

    Returns:
        A ``tf.data.Dataset`` whose elements are dicts with ``title`` and
        ``text`` entries.
    """
    del shuffle_files

    corpus_files = [
        'gs://mesolitica-tpu-general/t5-data/cleaned-news.txt.tsv',
        'gs://mesolitica-tpu-general/t5-data/dumping-parliament.txt.tsv',
        'gs://mesolitica-tpu-general/t5-data/filtered-dumping-academia.txt.tsv',
        'gs://mesolitica-tpu-general/t5-data/filtered-dumping-cleaned-common-crawl.txt.tsv',
        'gs://mesolitica-tpu-general/t5-data/filtered-dumping-wiki.txt.tsv'
    ]
    column_names = ['title', 'text']
    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def name_columns(*fields):
        # Pair each decoded field with its column name.
        return dict(zip(column_names, fields))

    lines = tf.data.TextLineDataset(corpus_files)
    rows = lines.map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return rows.map(name_columns)


# Drop any existing 'dumping_dataset' entry, then register it again
# (presumably so this cell can be re-run in the same kernel without a
# duplicate-name error — confirm against t5's TaskRegistry).
t5.data.TaskRegistry.remove('dumping_dataset')
# Task over the raw-text corpus: `rekey` maps the 'text' field to 'targets'
# and maps 'inputs' from None (see t5.data.preprocessors.rekey for how a None
# source is filled); the `unsupervised` token preprocessor then runs on the
# tokenized examples. No metrics are attached.
t5.data.TaskRegistry.add(
    'dumping_dataset',
    dataset_fn = dumping_dataset,
    splits = ['train'],
    text_preprocessor = functools.partial(
        t5.data.preprocessors.rekey,
        key_map = {'inputs': None, 'targets': 'text'},
    ),
    token_preprocessor = t5.data.preprocessors.unsupervised,
    sentencepiece_model_path = vocab,
    metric_fns = [],
)

  "get_sentencepiece_model_path is deprecated. Please pass the mixture or "


In [5]:
# Smoke test: fetch the registered 'dumping_dataset' task, build its 'train'
# split with the requested sequence lengths, and print the first five
# examples converted to NumPy.
# NOTE(review): `nq_task` and `ds` are reassigned by every preview cell in
# this notebook, so their values depend on which cell ran last.
nq_task = t5.data.TaskRegistry.get('dumping_dataset')
ds = nq_task.get_dataset(
    split = 'train', sequence_length = {'inputs': 512, 'targets': 512}
)
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 
{'inputs': array([   17,    96,     3,  2724,    14, 14760, 24411,   298,   944,
       32099,    14,    15, 32098,  6194, 15328,  3654,   941,    56,
        3204,   131,   150,  2290,     3,    99,  3260,  2024,   103,
        1825,   598,  2372,   171,   354, 32097,  1108,    29,    85,
        2511,    28,   567, 10372,   558,   171, 19535,   463,    25,
        3992,  4747,  1066,   816,    16, 10186, 10170,  3209,  1066,
         816,     3,  1771,  5729,    17,    42,   367,   333,  1775,
          20,  1913,  2591,   263,    76,   143,   123,    47,   543,
         462, 10288,  8262,  2168,   116,  9404,  2291, 12153,  1891,
         891,    23,   592,   548,   578,   148,     3, 32096, 10186,
       10170,  3209,  1066,   816,    24,    18,   454, 11753,    54,
     

In [7]:
def question_dataset(split, shuffle_files = False):
    """Load the QA TSV file as ``{'question', 'answer'}`` records.

    Each line of ``qa.tsv`` is split on tabs into two string fields (double
    quotes are not treated as quoting characters); the first field is keyed
    ``question`` and the second ``answer``.

    Args:
        split: Part of the dataset_fn signature; not referenced in the body.
        shuffle_files: Part of the dataset_fn signature; discarded.

    Returns:
        A ``tf.data.Dataset`` whose elements are dicts with ``question`` and
        ``answer`` entries.
    """
    del shuffle_files

    qa_files = [
        'gs://mesolitica-tpu-general/t5-data/qa.tsv',
    ]
    column_names = ['question', 'answer']
    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def name_columns(*fields):
        # Pair each decoded field with its column name.
        return dict(zip(column_names, fields))

    lines = tf.data.TextLineDataset(qa_files)
    rows = lines.map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return rows.map(name_columns)


def question_preprocessor(ds):
    """Convert ``question``/``answer`` records into ``inputs``/``targets``.

    Args:
        ds: Dataset whose elements expose ``question`` and ``answer`` entries.

    Returns:
        The mapped dataset, where ``inputs`` is the question prefixed with
        ``'soalan: '`` and ``targets`` is the answer unchanged.
    """
    task_prefix = 'soalan: '

    def add_prefix(record):
        prompt = tf.strings.join([task_prefix, record['question']])
        return {'inputs': prompt, 'targets': record['answer']}

    return ds.map(add_prefix, num_parallel_calls = tf.data.experimental.AUTOTUNE)


# Drop any existing 'question_dataset' entry, then register it again
# (presumably so this cell can be re-run in the same kernel without a
# duplicate-name error — confirm against t5's TaskRegistry).
t5.data.TaskRegistry.remove('question_dataset')
# QA task: question_preprocessor builds the 'soalan: '-prefixed inputs;
# lower_text is attached as the postprocessor and accuracy as the only metric.
t5.data.TaskRegistry.add(
    'question_dataset',
    dataset_fn = question_dataset,
    splits = ['train'],
    text_preprocessor = [question_preprocessor],
    sentencepiece_model_path = vocab,
    postprocess_fn = t5.data.postprocessors.lower_text,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [8]:
# Smoke test: fetch the registered 'question_dataset' task, build its 'train'
# split with the requested sequence lengths, and print the first five
# examples converted to NumPy.
# NOTE(review): `nq_task` and `ds` are reassigned by every preview cell in
# this notebook, so their values depend on which cell ran last.
nq_task = t5.data.TaskRegistry.get('question_dataset')
ds = nq_task.get_dataset(
    split = 'train', sequence_length = {'inputs': 256, 'targets': 32}
)
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

{'inputs_plaintext': b'soalan: bilakah patrika amrit bazar bermula di bangalore?', 'inputs': array([ 2105,    50, 11845, 12130,  4460,    13,  2702,  4392, 10345,
         348,    18,    15,  2852,    13, 24868,   275,     1]), 'targets_plaintext': b'20 Februari 1868.', 'targets': array([  427,   846, 25491,     3,     1])}
{'inputs_plaintext': b'soalan: lukisan minyak telah menjadi medium pilihan pelukis sejak dulu?', 'inputs': array([ 2105,    50,  5727,   658,    33,    56,  6191,   479, 10136,
         246,  1953,   275,     1]), 'targets_plaintext': b'akhir abad ke-15', 'targets': array([495, 994,  30,   7, 871,   1])}
{'inputs_plaintext': b'soalan: apakah tahap pemerintahan ketiga di australia?', 'inputs': array([ 2105,    50,  1296,   759,  1167,   773,    18, 11650,  7052,
         440,    13,   275,     1]), 'targets_plaintext': b'Kerajaan tempatan', 'targets': array([ 87, 401,   1])}
{'inputs_plaintext': b'soalan: bilakah penglibatan kita dalam vietnam berakhir?', 'inputs': a

In [9]:
def pair_dataset(split, shuffle_files = False):
    """Load the sentence-pair TSV files as ``{'text'}`` records.

    Each line of the listed ``*-pair.tsv`` files is split on tabs into two
    string fields (double quotes are not treated as quoting characters).
    Only one column name is supplied, so ``zip`` keeps just the first field,
    keyed ``text``; the second decoded field is dropped.

    NOTE(review): two columns are decoded but only one is named — confirm
    that discarding the second column is intended.

    Args:
        split: Part of the dataset_fn signature; not referenced in the body.
        shuffle_files: Part of the dataset_fn signature; discarded.

    Returns:
        A ``tf.data.Dataset`` whose elements are dicts with a single
        ``text`` entry.
    """
    del shuffle_files

    pair_files = [
        'gs://mesolitica-tpu-general/t5-data/cleaned-news.txt-pair.tsv',
        'gs://mesolitica-tpu-general/t5-data/dumping-parliament.txt-pair.tsv',
        'gs://mesolitica-tpu-general/t5-data/filtered-dumping-academia.txt-pair.tsv',
        'gs://mesolitica-tpu-general/t5-data/filtered-dumping-cleaned-common-crawl.txt-pair.tsv',
        'gs://mesolitica-tpu-general/t5-data/filtered-dumping-wiki.txt-pair.tsv'
    ]
    column_names = ['text']
    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def name_columns(*fields):
        # zip stops at the shorter sequence, so only fields[0] is kept.
        return dict(zip(column_names, fields))

    lines = tf.data.TextLineDataset(pair_files)
    rows = lines.map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return rows.map(name_columns)


# Drop any existing 'pair_dataset' entry, then register it again
# (presumably so this cell can be re-run in the same kernel without a
# duplicate-name error — confirm against t5's TaskRegistry).
t5.data.TaskRegistry.remove('pair_dataset')
# Next-sentence-prediction task: t5's `next_sentence_prediction` text
# preprocessor is applied to the records from pair_dataset; lower_text is
# attached as the postprocessor and accuracy as the only metric.
t5.data.TaskRegistry.add(
    'pair_dataset',
    dataset_fn = pair_dataset,
    splits = ['train'],
    text_preprocessor = [prep.next_sentence_prediction],
    sentencepiece_model_path = vocab,
    postprocess_fn = t5.data.postprocessors.lower_text,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [10]:
# Smoke test: fetch the registered 'pair_dataset' task, build its 'train'
# split with the requested sequence lengths, and print the first five
# examples converted to NumPy.
# NOTE(review): `nq_task` and `ds` are reassigned by every preview cell in
# this notebook, so their values depend on which cell ran last.
nq_task = t5.data.TaskRegistry.get('pair_dataset')
ds = nq_task.get_dataset(
    split = 'train', sequence_length = {'inputs': 256, 'targets': 32}
)
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

      lambda x: x[text_key], num_parallel_calls=tf.data.experimental.AUTOTUNE)

If this is a lambda function, the error may be avoided by creating the lambda in a standalone statement.
      lambda x: x[text_key], num_parallel_calls=tf.data.experimental.AUTOTUNE)

If this is a lambda function, the error may be avoided by creating the lambda in a standalone statement.
{'inputs_plaintext': b'nsp: 51 bilion (US$16. 62 billion) pada 2017.', 'inputs': array([2532, 7175,   50, 3660,  256,   15,    4, 4987, 7205,  323,    3,
       5880, 4687,    5,   23,  568,    3,    1]), 'targets_plaintext': b'next', 'targets': array([7426,    1])}
{'inputs_plaintext': b'nsp: "Jadi bagi SPR, PKR yang bertanding. Sazali memenangi kejuaraan Mr Universe pada 2000, 2004, 2006, 2007, 2009, 2010, 2011, 2012, 2013 dan 2014 serta lapan kali gelaran Mr Asia bermula 1996, 1998, 1999, 2000, 2004, 2009, 2010, 2012 dan 2014.', 'inputs': array([ 2532,  7175,    50,    15,     6,  3733,    43,  3238,    14,
        1017

In [11]:
def news_dataset(split, shuffle_files = False):
    """Load the news-title TSV file as ``{'question', 'answer'}`` records.

    Each line of ``newstitle.tsv`` is split on tabs into two string fields
    (double quotes are not treated as quoting characters); the first field
    is keyed ``question`` and the second ``answer``.

    Args:
        split: Part of the dataset_fn signature; not referenced in the body.
        shuffle_files: Part of the dataset_fn signature; discarded.

    Returns:
        A ``tf.data.Dataset`` whose elements are dicts with ``question`` and
        ``answer`` entries.
    """
    del shuffle_files

    news_files = [
        'gs://mesolitica-tpu-general/t5-data/newstitle.tsv'
    ]
    column_names = ['question', 'answer']
    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def name_columns(*fields):
        # Pair each decoded field with its column name.
        return dict(zip(column_names, fields))

    lines = tf.data.TextLineDataset(news_files)
    rows = lines.map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return rows.map(name_columns)


def news_preprocessor(ds):
    """Convert ``question``/``answer`` records into ``inputs``/``targets``.

    Args:
        ds: Dataset whose elements expose ``question`` and ``answer`` entries.

    Returns:
        The mapped dataset, where ``inputs`` is the ``question`` field
        prefixed with ``'tajuk: '`` and ``targets`` is the ``answer`` field
        unchanged.
    """
    task_prefix = 'tajuk: '

    def add_prefix(record):
        prompt = tf.strings.join([task_prefix, record['question']])
        return {'inputs': prompt, 'targets': record['answer']}

    return ds.map(add_prefix, num_parallel_calls = tf.data.experimental.AUTOTUNE)


# Drop any existing 'news_dataset' entry, then register it again
# (presumably so this cell can be re-run in the same kernel without a
# duplicate-name error — confirm against t5's TaskRegistry).
t5.data.TaskRegistry.remove('news_dataset')
# News-title task: news_preprocessor builds the 'tajuk: '-prefixed inputs;
# lower_text is attached as the postprocessor and accuracy as the only metric.
t5.data.TaskRegistry.add(
    'news_dataset',
    dataset_fn = news_dataset,
    splits = ['train'],
    text_preprocessor = [news_preprocessor],
    sentencepiece_model_path = vocab,
    postprocess_fn = t5.data.postprocessors.lower_text,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [12]:
# Smoke test: fetch the registered 'news_dataset' task, build its 'train'
# split with the requested sequence lengths, and print the first five
# examples converted to NumPy.
# NOTE(review): `nq_task` and `ds` are reassigned by every preview cell in
# this notebook, so their values depend on which cell ran last.
nq_task = t5.data.TaskRegistry.get('news_dataset')
ds = nq_task.get_dataset(
    split = 'train', sequence_length = {'inputs': 1024, 'targets': 1024}
)
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

{'inputs_plaintext': b'tajuk: Presiden Joko Widodo meluncurkan Masterplan Ekonomi Syariah Indonesia (MEKSI) 2019-2024 di Jakarta, yang menjadi panduan Indonesia dalam pengembangan ekonomi Syariah. Menurut MEKSI, ekonomi Syariah global diperkirakan akan mencapai AS$3 triliun pada 2023. Mengibaratkan angka itu sebagai sebuah kue, Jokowi berharap Indonesia bisa berebut untuk turut merasakan nikmatnya. Namun, ada banyak tantangan yang harus dihadapi untuk mewujudkan harapan Presiden itu. Kepala Eksekutif Pengawas Pasar Modal dari Otoritas Jasa Keuangan (OJK), Hoesen, mengatakan MEKSI adalah sebuah pedoman untuk mencapai target. Seluruh pihak terkait di sektor industri keuangan syariah memiliki 3 tantangan yang harus diselesaikan untuk meraih target itu. \xe2\x80\x9cYang pertama adalah penguatan lembaga keuangan Syariah, antara lain melalui peningkatan modal usaha dan sumber daya manusia, penguatan informasi, variasi produk, pemanfaatan teknologi dalam proses bisnis, serta penerapan tata ke

In [13]:
def stemming_dataset(split, shuffle_files = False):
    """Load the stemming TSV file as ``{'question', 'answer'}`` records.

    Each line of ``stemming.tsv`` is split on tabs into two string fields
    (double quotes are not treated as quoting characters); the first field
    is keyed ``question`` and the second ``answer``.

    Args:
        split: Part of the dataset_fn signature; not referenced in the body.
        shuffle_files: Part of the dataset_fn signature; discarded.

    Returns:
        A ``tf.data.Dataset`` whose elements are dicts with ``question`` and
        ``answer`` entries.
    """
    del shuffle_files

    stemming_files = ['gs://mesolitica-tpu-general/t5-data/stemming.tsv']
    column_names = ['question', 'answer']
    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def name_columns(*fields):
        # Pair each decoded field with its column name.
        return dict(zip(column_names, fields))

    lines = tf.data.TextLineDataset(stemming_files)
    rows = lines.map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return rows.map(name_columns)


def stemming_preprocessor(ds):
    """Convert ``question``/``answer`` records into ``inputs``/``targets``.

    Args:
        ds: Dataset whose elements expose ``question`` and ``answer`` entries.

    Returns:
        The mapped dataset, where ``inputs`` is the ``question`` field
        prefixed with ``'punca: '`` and ``targets`` is the ``answer`` field
        unchanged.
    """
    task_prefix = 'punca: '

    def add_prefix(record):
        prompt = tf.strings.join([task_prefix, record['question']])
        return {'inputs': prompt, 'targets': record['answer']}

    return ds.map(add_prefix, num_parallel_calls = tf.data.experimental.AUTOTUNE)


# Drop any existing 'stemming_dataset' entry, then register it again
# (presumably so this cell can be re-run in the same kernel without a
# duplicate-name error — confirm against t5's TaskRegistry).
t5.data.TaskRegistry.remove('stemming_dataset')
# Stemming task: stemming_preprocessor builds the 'punca: '-prefixed inputs;
# lower_text is attached as the postprocessor and accuracy as the only metric.
t5.data.TaskRegistry.add(
    'stemming_dataset',
    dataset_fn = stemming_dataset,
    splits = ['train'],
    text_preprocessor = [stemming_preprocessor],
    sentencepiece_model_path = vocab,
    postprocess_fn = t5.data.postprocessors.lower_text,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [14]:
# Smoke test: fetch the registered 'stemming_dataset' task, build its 'train'
# split with the requested sequence lengths, and print the first five
# examples converted to NumPy.
# NOTE(review): `nq_task` and `ds` are reassigned by every preview cell in
# this notebook, so their values depend on which cell ran last.
nq_task = t5.data.TaskRegistry.get('stemming_dataset')
ds = nq_task.get_dataset(
    split = 'train', sequence_length = {'inputs': 256, 'targets': 32}
)
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

{'inputs_plaintext': b'punca: Berita - berita tentang buah ini siar di dalam surat khabar sekitar tahun angan .', 'inputs': array([1761,   50, 4470,   15,    7, 1317,  406,  206,   20,   15, 9657,
         18,   19, 1148, 4555,  350,   31,   15, 3384,   15,    3,    1]), 'targets_plaintext': b'Berita-berita tentang buah ini tersiar di dalam surat khabar sekitar tahun 1987 .', 'targets': array([ 4470,     7, 15285,   406,   206,    20, 28914,    18,    19,
        1148,  4555,   350,    31,  4192,    15,     3,     1])}
{'inputs_plaintext': b'punca: Di Grand Prix China , Hamilton menang di hadap Felipe Massa dan Kimi Raikkonen , dan punya lebih angan mata dalam Juara Dunia ke perlumbaan tutup tirai musim ini .', 'inputs': array([ 1761,    50,   104,  2897,  4672,   303,    27, 10959,  1298,
          18,    15, 11684, 28632, 11762,    16,  2052,    52, 30360,
          27,    16,  3453,    51,    15,  3384,   540,    19,  3144,
         653,    30,  3131,  7174, 18023,   556,    20,    

In [15]:
def synonym_dataset(split, shuffle_files = False):
    """Load the synonyms TSV file as ``{'question', 'answer'}`` records.

    Each line of ``synonyms.tsv`` is split on tabs into two string fields
    (double quotes are not treated as quoting characters); the first field
    is keyed ``question`` and the second ``answer``.

    Args:
        split: Part of the dataset_fn signature; not referenced in the body.
        shuffle_files: Part of the dataset_fn signature; discarded.

    Returns:
        A ``tf.data.Dataset`` whose elements are dicts with ``question`` and
        ``answer`` entries.
    """
    del shuffle_files

    synonym_files = ['gs://mesolitica-tpu-general/t5-data/synonyms.tsv']
    column_names = ['question', 'answer']
    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def name_columns(*fields):
        # Pair each decoded field with its column name.
        return dict(zip(column_names, fields))

    lines = tf.data.TextLineDataset(synonym_files)
    rows = lines.map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return rows.map(name_columns)


def synonym_preprocessor(ds):
    """Convert ``question``/``answer`` records into ``inputs``/``targets``.

    Args:
        ds: Dataset whose elements expose ``question`` and ``answer`` entries.

    Returns:
        The mapped dataset, where ``inputs`` is the ``question`` field
        prefixed with ``'sinonim: '`` and ``targets`` is the ``answer`` field
        unchanged.
    """
    task_prefix = 'sinonim: '

    def add_prefix(record):
        prompt = tf.strings.join([task_prefix, record['question']])
        return {'inputs': prompt, 'targets': record['answer']}

    return ds.map(add_prefix, num_parallel_calls = tf.data.experimental.AUTOTUNE)


# Drop any existing 'synonym_dataset' entry, then register it again
# (presumably so this cell can be re-run in the same kernel without a
# duplicate-name error — confirm against t5's TaskRegistry).
t5.data.TaskRegistry.remove('synonym_dataset')
# Synonym task: synonym_preprocessor builds the 'sinonim: '-prefixed inputs;
# lower_text is attached as the postprocessor and accuracy as the only metric.
t5.data.TaskRegistry.add(
    'synonym_dataset',
    dataset_fn = synonym_dataset,
    splits = ['train'],
    text_preprocessor = [synonym_preprocessor],
    sentencepiece_model_path = vocab,
    postprocess_fn = t5.data.postprocessors.lower_text,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [16]:
# Smoke test: fetch the registered 'synonym_dataset' task, build its 'train'
# split with the requested sequence lengths, and print the first five
# examples converted to NumPy.
# NOTE(review): `nq_task` and `ds` are reassigned by every preview cell in
# this notebook, so their values depend on which cell ran last.
nq_task = t5.data.TaskRegistry.get('synonym_dataset')
ds = nq_task.get_dataset(
    split = 'train', sequence_length = {'inputs': 256, 'targets': 256}
)
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

{'inputs_plaintext': b'sinonim: persoalan alam sekitar di indonesia hal penerbitan latar belakang di indonesia berhubungan dengan kesesakan penyewa yang panas dan industri yang pesat dan mengeluarkan frekuensi diberi kemasyhuran yang kelebihannya suram disebabkan oleh lapis kefakiran yang nyaring sebaliknya pada keterampilan kabinet yang lengang berkemampuan pengeluaran penerbitan dipunyai penebangan belukar secara subur besaran kebanyakan belum sahih di mengapit jemput jemput dan api api membawa asbut di titik barat indonesia malaysia dan singapura ugutan asin mata air lautan dan kesukaran alam semesta sekeliling yang serupa dengan urbanisasi pesat dan penubuhan cermat mempunyai noda kentut penumpuan larikan menyeberang pengelolaan kedunguan dan servis cecair dan perairan kumbahan yang dapatkan dibergantung kaki kaki', 'inputs': array([12069,    50,  2424,   841,   350,    18,    15, 27478,   539,
        4354,  2918,  1037,    18,    15, 27478, 10296,    22,  5422,
       11416,    1

In [19]:
def similarity_dataset(split, shuffle_files = False):
    """Load the similarity TSV files as ``{'question', 'answer'}`` records.

    Each line of the quora, snli and mnli TSV files is split on tabs into
    two string fields (double quotes are not treated as quoting characters);
    the first field is keyed ``question`` and the second ``answer``.

    Args:
        split: Part of the dataset_fn signature; not referenced in the body.
        shuffle_files: Part of the dataset_fn signature; discarded.

    Returns:
        A ``tf.data.Dataset`` whose elements are dicts with ``question`` and
        ``answer`` entries.
    """
    del shuffle_files

    similarity_files = [
        'gs://mesolitica-tpu-general/t5-data/quora.tsv',
        'gs://mesolitica-tpu-general/t5-data/snli.tsv',
        'gs://mesolitica-tpu-general/t5-data/mnli.tsv'
    ]
    column_names = ['question', 'answer']
    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def name_columns(*fields):
        # Pair each decoded field with its column name.
        return dict(zip(column_names, fields))

    lines = tf.data.TextLineDataset(similarity_files)
    rows = lines.map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return rows.map(name_columns)


def similarity_preprocessor(ds):
    """Rename ``question``/``answer`` to ``inputs``/``targets`` without edits.

    Unlike the other preprocessors in this notebook, no task prefix is
    joined onto the input here; both fields are passed through as-is.

    Args:
        ds: Dataset whose elements expose ``question`` and ``answer`` entries.

    Returns:
        The mapped dataset with ``inputs`` and ``targets`` entries.
    """
    renames = (('inputs', 'question'), ('targets', 'answer'))

    def rename_fields(record):
        return {new_key: record[old_key] for new_key, old_key in renames}

    return ds.map(rename_fields, num_parallel_calls = tf.data.experimental.AUTOTUNE)


# Drop any existing 'similarity_dataset' entry, then register it again
# (presumably so this cell can be re-run in the same kernel without a
# duplicate-name error — confirm against t5's TaskRegistry).
t5.data.TaskRegistry.remove('similarity_dataset')
# Similarity task: similarity_preprocessor only renames the fields to
# inputs/targets; lower_text is attached as the postprocessor and accuracy
# as the only metric.
t5.data.TaskRegistry.add(
    'similarity_dataset',
    dataset_fn = similarity_dataset,
    splits = ['train'],
    text_preprocessor = [similarity_preprocessor],
    sentencepiece_model_path = vocab,
    postprocess_fn = t5.data.postprocessors.lower_text,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [20]:
# Smoke test: fetch the registered 'similarity_dataset' task, build its
# 'train' split with the requested sequence lengths, and print the first five
# examples converted to NumPy.
# NOTE(review): `nq_task` and `ds` are reassigned by every preview cell in
# this notebook, so their values depend on which cell ran last.
nq_task = t5.data.TaskRegistry.get('similarity_dataset')
ds = nq_task.get_dataset(
    split = 'train', sequence_length = {'inputs': 256, 'targets': 32}
)
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

{'inputs_plaintext': b'soalan1: Apakah kelayakan untuk menjadi ejen FBI atau CIA? soalan2: Apa yang diperlukan untuk menjadi ejen FBI?', 'inputs': array([ 2105,   112,    50,  3115,  2016,    21,    56,  5752, 23088,
          46, 19277,   275,  2105,   143,    50,  1819,    17,  1604,
          21,    56,  5752, 23088,   275,     1]), 'targets_plaintext': b'tak sama', 'targets': array([379,  96,   1])}
{'inputs_plaintext': b'soalan1: Bagaimana saya dapat meningkatkan ketinggian saya selepas 21 juga? soalan2: Bolehkah ketinggian meningkat selepas 25?', 'inputs': array([2105,  112,   50, 5268,  113,   91,  340, 1271,  113,   98,  836,
         42,  275, 2105,  143,   50, 6751, 1644, 1271,  748,   98,  818,
        275,    1]), 'targets_plaintext': b'sama', 'targets': array([96,  1])}
{'inputs_plaintext': b'soalan1: Bagaimanakah saya tahu jika pasangan saya adalah jiwa saya? soalan2: Bagaimana saya boleh tahu bahawa dia adalah jiwa saya?', 'inputs': array([ 2105,   112,    50, 24282,   1