In [1]:
import os

# Point Google Cloud client libraries at the service-account key file.
# The path is relative, so the key has to sit in the kernel's working directory.
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': 'mesolitica-tpu.json'})

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5

In [3]:
import gin

# Load gin bindings from the operative config stored on GCS (going by the
# file name, presumably the config of the pretrained base model).
gin.parse_config_file(
    'gs://mesolitica-tpu-general/t5-data/pretrained_models_base_operative_config.gin'
)

In [4]:
# SentencePiece model passed as `sentencepiece_model_path` to the tasks
# registered in this notebook.
vocab = (
    'gs://mesolitica-tpu-general/t5-data-v2/sp10m.cased.ms-en.model'
)

In [6]:
def dumping_dataset(split, shuffle_files = False):
    """Read the raw-text dump TSVs as a dataset of {'title', 'text'} examples.

    The file list is four fixed TSVs followed by every shard matching the
    `00.jsonl-*.translated.txt.tsv` pattern. Each line is split on tabs into
    two string fields, with quote characters treated as ordinary text.

    Args:
        split: not read by this function; the same files are used regardless.
        shuffle_files: not read by this function; discarded immediately.

    Returns:
        A dataset of dicts with the string entries 'title' and 'text'.
    """
    del shuffle_files

    fixed_files = [
        'gs://mesolitica-tpu-general/t5-data-v2/dumping-news.txt.tsv',
        'gs://mesolitica-tpu-general/t5-data-v2/dumping-parliament.txt.tsv',
        'gs://mesolitica-tpu-general/t5-data-v2/filtered-dumping-academia.txt.tsv',
        'gs://mesolitica-tpu-general/t5-data-v2/filtered-dumping-wiki.txt.tsv'
    ]
    translated_shards = tf.io.gfile.glob(
        'gs://mesolitica-tpu-general/t5-data-v2/00.jsonl-*.translated.txt.tsv'
    )

    # One line in, one tuple of two string fields out.
    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def as_example(*fields):
        # The dataset hands the decoded tuple over as separate positional args.
        return dict(zip(['title', 'text'], fields))

    lines = tf.data.TextLineDataset(fixed_files + translated_shards)
    columns = lines.map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return columns.map(as_example)


# remove() before add() so this cell can be re-run in the same kernel
# (presumably add() rejects an already-registered name — confirm).
t5.data.TaskRegistry.remove('dumping_dataset')
t5.data.TaskRegistry.add(
    'dumping_dataset',
    splits = ['train'],
    dataset_fn = dumping_dataset,
    # Only the 'text' column is carried forward (as 'targets'); 'inputs' is
    # mapped to None and left to the token preprocessor below.
    text_preprocessor = functools.partial(
        prep.rekey,
        key_map = {'inputs': None, 'targets': 'text'},
    ),
    token_preprocessor = prep.unsupervised,
    sentencepiece_model_path = vocab,
    metric_fns = [],
)

In [7]:
# Smoke test: build the registered task's dataset and print a few examples.
nq_task = t5.data.TaskRegistry.get('dumping_dataset')
ds = nq_task.get_dataset(
    sequence_length = dict(inputs = 512, targets = 512),
    split = 'train',
)
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 
{'inputs': array([22249,  8959, 27761,    71,  2111,   845,   797,  1708, 32099,
       14097,   983,    91,    14, 18986, 18814,  2499,    14,   876,
         261,  1546,   478,  7519,  2287,     3,  4913,  5780, 32098,
        6185,  8184,    24, 28017,   848,   567, 11846,  1616,     7,
       17333,  6185, 21172,    71, 12643,    14,  7652,   290, 32097,
        1214,   530,    13,     6,   617, 11033,   372, 14307, 32096,
         735,  2817,  3234,  3602,  3176,  2065,    71,   529,   140,
         845,    17,    62,   438,  1602, 21109, 31655,   626,  5786,
         818,    14,    25,   845,   342,   530, 12616,   858, 10265,
           7,   756,    28,  3247, 13273,   103,   372, 14307,  8959,
         434,    72,  7019,    14,     6,    98,  1475,  8200,     3,
     

In [8]:
def question_dataset(split, shuffle_files = False):
    """Read qa.tsv as a dataset of {'question', 'answer'} examples.

    Each line is split on tabs into two string fields, with quote characters
    treated as ordinary text.

    Args:
        split: not read by this function; the same file is used regardless.
        shuffle_files: not read by this function; discarded immediately.

    Returns:
        A dataset of dicts with the string entries 'question' and 'answer'.
    """
    del shuffle_files

    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def as_example(*fields):
        # The dataset hands the decoded tuple over as separate positional args.
        return dict(zip(['question', 'answer'], fields))

    lines = tf.data.TextLineDataset(
        ['gs://mesolitica-tpu-general/t5-data-v2/qa.tsv']
    )
    columns = lines.map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return columns.map(as_example)


def question_preprocessor(ds):
    """Convert {'question', 'answer'} examples into {'inputs', 'targets'} examples.

    Args:
        ds: dataset of dicts holding 'question' and 'answer' entries, as
            returned by question_dataset.

    Returns:
        The mapped dataset: 'inputs' is the question with 'soalan: ' put in
        front of it, 'targets' is the answer unchanged.
    """
    task_prefix = 'soalan: '

    def add_prefix(example):
        prompt = tf.strings.join([task_prefix, example['question']])
        return {'inputs': prompt, 'targets': example['answer']}

    return ds.map(add_prefix, num_parallel_calls = tf.data.experimental.AUTOTUNE)


# remove() before add() so this cell can be re-run in the same kernel
# (presumably add() rejects an already-registered name — confirm).
t5.data.TaskRegistry.remove('question_dataset')
t5.data.TaskRegistry.add(
    'question_dataset',
    splits = ['train'],
    dataset_fn = question_dataset,
    text_preprocessor = [question_preprocessor],
    sentencepiece_model_path = vocab,
    # lower_text + accuracy: presumably a case-insensitive exact-match score.
    postprocess_fn = t5.data.postprocessors.lower_text,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [9]:
# Smoke test: build the registered task's dataset and print a few examples.
nq_task = t5.data.TaskRegistry.get('question_dataset')
ds = nq_task.get_dataset(
    sequence_length = dict(inputs = 256, targets = 32),
    split = 'train',
)
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

{'inputs_plaintext': b'soalan: siapa yang bermain chris pratt sebagai penjaga galaksi?', 'inputs': array([ 2762,    31,  1652,    17,   964,    13,   856,  3964,  1205,
       18288,    85,  7843, 23226,    77,     1]), 'targets_plaintext': b'Bintang-Tuan', 'targets': array([6217,    7,  306, 2745,    1])}
{'inputs_plaintext': b'soalan: apakah musim 2 perkara orang asing?', 'inputs': array([2762,   31, 1516,  451,  211,  355,   73, 1723,   77,    1]), 'targets_plaintext': b'percubaan watak untuk kembali normal dan akibat yang berlanjutan dari musim pertama', 'targets': array([3748, 2426,   25,  393, 3204,   22, 2157,   17, 8980,   42,  451,
        213,    1])}
{'inputs_plaintext': b'soalan: berapakah panjang garis pantai india?', 'inputs': array([ 2762,    31,    50, 30684,  1110,  2521,  2439,    23,  3427,
          77,     1]), 'targets_plaintext': b'7.516.6 km (4.671 mi)', 'targets': array([ 548,    3,  246, 1875,    3,  396, 4281,   13,    4,  343,    3,
       4034,  201,   13, 

In [11]:
def pair_dataset(split, shuffle_files = False):
    """Read every `*pair.tsv` file as a dataset of {'text'} examples.

    Each line is split on tabs into two string fields, with quote characters
    treated as ordinary text; only the first field ends up in the example.

    Args:
        split: not read by this function; the same files are used regardless.
        shuffle_files: not read by this function; discarded immediately.

    Returns:
        A dataset of dicts with the single string entry 'text'.
    """
    del shuffle_files

    pair_files = tf.io.gfile.glob('gs://mesolitica-tpu-general/t5-data-v2/*pair.tsv')

    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def as_example(*fields):
        # zip() stops at the shorter sequence, so only the first field is kept.
        # NOTE(review): the second decoded column is dropped here — confirm
        # that this is intended.
        return dict(zip(['text'], fields))

    columns = tf.data.TextLineDataset(pair_files).map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return columns.map(as_example)


# remove() before add() so this cell can be re-run in the same kernel
# (presumably add() rejects an already-registered name — confirm).
t5.data.TaskRegistry.remove('pair_dataset')
t5.data.TaskRegistry.add(
    'pair_dataset',
    splits = ['train'],
    dataset_fn = pair_dataset,
    # The inputs/targets are built by the library's next-sentence-prediction
    # preprocessor from the examples returned by pair_dataset.
    text_preprocessor = [prep.next_sentence_prediction],
    sentencepiece_model_path = vocab,
    postprocess_fn = t5.data.postprocessors.lower_text,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [12]:
# Smoke test: build the registered task's dataset and print a few examples.
nq_task = t5.data.TaskRegistry.get('pair_dataset')
ds = nq_task.get_dataset(
    sequence_length = dict(inputs = 256, targets = 32),
    split = 'train',
)
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)

      lambda x: x[text_key], num_parallel_calls=tf.data.experimental.AUTOTUNE)

If this is a lambda function, the error may be avoided by creating the lambda in a standalone statement.
      lambda x: x[text_key], num_parallel_calls=tf.data.experimental.AUTOTUNE)

If this is a lambda function, the error may be avoided by creating the lambda in a standalone statement.
{'inputs_plaintext': b'nsp: Salah satu pelanggan pertamanya ialah Google, yang telah menugaskan sekitar 300 unit perumahan tenaga kerja modular dari Factory_OS ". Kami mempunyai kos ini yang tidak siuman, "kata Holliday, merujuk kepada sewa di luar kawalan dan harga rumah di rantau ini.', 'inputs': array([   13,   152,  4615,    31,  2689,   100,  1128,  3730,   358,
         912,    14,    17,    62, 28421,   462,  3706,  1642,  3630,
        1340,   802, 20292,   382,    42, 23908,  2902,  9291,    13,
           6,     3,   709,   118,  1031,    34,    17,    30,    13,
          16,  4040,    47,    14,    13,     6,  

In [13]:
def news_dataset(split, shuffle_files = False):
    """Read newstitle.tsv as a dataset of {'question', 'answer'} examples.

    Each line is split on tabs into two string fields, with quote characters
    treated as ordinary text. The first field is stored as 'question' and the
    second as 'answer' (presumably article text and headline, going by the
    file name — confirm against the data).

    Args:
        split: not read by this function; the same file is used regardless.
        shuffle_files: not read by this function; discarded immediately.

    Returns:
        A dataset of dicts with the string entries 'question' and 'answer'.
    """
    del shuffle_files

    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def as_example(*fields):
        # The dataset hands the decoded tuple over as separate positional args.
        return dict(zip(['question', 'answer'], fields))

    lines = tf.data.TextLineDataset(
        ['gs://mesolitica-tpu-general/t5-data-v2/newstitle.tsv']
    )
    columns = lines.map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return columns.map(as_example)


def news_preprocessor(ds):
    """Convert {'question', 'answer'} examples into {'inputs', 'targets'} examples.

    Args:
        ds: dataset of dicts holding 'question' and 'answer' entries, as
            returned by news_dataset.

    Returns:
        The mapped dataset: 'inputs' is the 'question' text with 'tajuk: '
        put in front of it, 'targets' is the 'answer' text unchanged.
    """
    task_prefix = 'tajuk: '

    def add_prefix(example):
        prompt = tf.strings.join([task_prefix, example['question']])
        return {'inputs': prompt, 'targets': example['answer']}

    return ds.map(add_prefix, num_parallel_calls = tf.data.experimental.AUTOTUNE)


# remove() before add() so this cell can be re-run in the same kernel
# (presumably add() rejects an already-registered name — confirm).
t5.data.TaskRegistry.remove('news_dataset')
t5.data.TaskRegistry.add(
    'news_dataset',
    splits = ['train'],
    dataset_fn = news_dataset,
    text_preprocessor = [news_preprocessor],
    sentencepiece_model_path = vocab,
    # lower_text + accuracy: presumably a case-insensitive exact-match score.
    postprocess_fn = t5.data.postprocessors.lower_text,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [14]:
# Smoke test: build the registered task's dataset and print one example.
nq_task = t5.data.TaskRegistry.get('news_dataset')
ds = nq_task.get_dataset(
    sequence_length = dict(inputs = 1024, targets = 1024),
    split = 'train',
)
for ex in tfds.as_numpy(ds.take(1)):
    print(ex)

{'inputs_plaintext': b'tajuk: Telkomsel berkomitmen untuk terus bergerak maju memperkuat perannya dalam upaya bersama membantu masyarakat dan Pemerintah RI menghadapi pandemi COVID-19 di Indonesia. Tekad Telkomsel untuk berada di barisan terdepan selama penanganan COVID-19 sejalan dengan semangat Telkomsel yang tengah memasuki usia ke-25 tahun, terutama dalam menghadirkan inisiatif pada aktivitas corporate social responsibility (CSR) yang telah dilakukan sejak awal tahun. Kini, upaya tersebut semakin diperkuat melalui pemberian donasi berupa alat ventilator serta lebih dari 100.000 Alat Pelindung Diri (APD) yang secara berkala diserahkan langsung kepada sejumlah Rumah Sakit Darurat dan Rumah Sakit Rujukan penanganan COVID-19 yang ada di seluruh Indonesia. Direktur Utama Telkomsel Setyanto Hantoro mengatakan, \xe2\x80\x9cTelkomsel secara konsisten terus menunjukkan kepeduliannya kepada seluruh masyarakat Indonesia dalam menghadapi cobaan pandemi COVID-19 ini. Memasuki usia perusahaan ke

In [15]:
def summarization_dataset(split, shuffle_files = False):
    """Read summarization.tsv as a dataset of {'question', 'answer'} examples.

    Each line is split on tabs into two string fields, with quote characters
    treated as ordinary text. The first field is stored as 'question' and the
    second as 'answer' (presumably document and summary, going by the file
    name — confirm against the data).

    Args:
        split: not read by this function; the same file is used regardless.
        shuffle_files: not read by this function; discarded immediately.

    Returns:
        A dataset of dicts with the string entries 'question' and 'answer'.
    """
    del shuffle_files

    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def as_example(*fields):
        # The dataset hands the decoded tuple over as separate positional args.
        return dict(zip(['question', 'answer'], fields))

    lines = tf.data.TextLineDataset(
        ['gs://mesolitica-tpu-general/t5-data-v2/summarization.tsv']
    )
    columns = lines.map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return columns.map(as_example)


def summarization_preprocessor(ds):
    """Convert {'question', 'answer'} examples into {'inputs', 'targets'} examples.

    Args:
        ds: dataset of dicts holding 'question' and 'answer' entries, as
            returned by summarization_dataset.

    Returns:
        The mapped dataset: 'inputs' is the 'question' text with
        'ringkasan: ' put in front of it, 'targets' is the 'answer' text
        unchanged.
    """
    task_prefix = 'ringkasan: '

    def add_prefix(example):
        prompt = tf.strings.join([task_prefix, example['question']])
        return {'inputs': prompt, 'targets': example['answer']}

    return ds.map(add_prefix, num_parallel_calls = tf.data.experimental.AUTOTUNE)


# remove() before add() so this cell can be re-run in the same kernel
# (presumably add() rejects an already-registered name — confirm).
t5.data.TaskRegistry.remove('summarization_dataset')
t5.data.TaskRegistry.add(
    'summarization_dataset',
    splits = ['train'],
    dataset_fn = summarization_dataset,
    text_preprocessor = [summarization_preprocessor],
    sentencepiece_model_path = vocab,
    postprocess_fn = t5.data.postprocessors.lower_text,
    # NOTE(review): exact-match accuracy is a very strict score for free-form
    # summaries — confirm whether a ROUGE-style metric was intended.
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [16]:
# Smoke test: build the registered task's dataset and print one example.
nq_task = t5.data.TaskRegistry.get('summarization_dataset')
ds = nq_task.get_dataset(
    sequence_length = dict(inputs = 1024, targets = 1024),
    split = 'train',
)
for ex in tfds.as_numpy(ds.take(1)):
    print(ex)

{'inputs_plaintext': b"ringkasan: Oleh . Claire Ellicott. PENERBITAN: . 17:32 EST, 17 Mac 2013. |. DIKEMASKINI: . 06:12 EST, 18 Mac 2013. Dalam masa-masa sukar, sesetengah daripada kita akan pergi ke mana-mana panjang untuk mendapatkan makan tengah hari percuma. Dan itu termasuk Hazel si tupai merah yang dilihat di sini mengambil laluan udara untuk mengumpul beberapa hazelnut di taman negara yang dia panggil pulang. Melambung melalui udara, dia bertujuan untuk lengan renjer taman Victoria Sissons, dan, membuat pendaratan selamat, scampers di sepanjang lengan bajunya untuk pilfer poketnya. Tatal ke bawah untuk video. Sedia, mantap: Hazel si tupai merah bersiap untuk melompat ke lengan ranger Victoria Sisson. We have lift off: Red squirrels can jump up to 20ft through tree canopies when they spot food. Geronimo: Paws outstretched Hazel the red squirrel soars through the air in her quest to find tasty nuts to nibble on at the country park she calls home. Tupai merah boleh melompat sehingg

In [17]:
def similarity_dataset(split, shuffle_files = False):
    """Read snli.tsv followed by mnli.tsv as {'question', 'answer'} examples.

    Each line is split on tabs into two string fields, with quote characters
    treated as ordinary text.

    Args:
        split: not read by this function; the same files are used regardless.
        shuffle_files: not read by this function; discarded immediately.

    Returns:
        A dataset of dicts with the string entries 'question' and 'answer'.
    """
    del shuffle_files

    nli_files = [
        'gs://mesolitica-tpu-general/t5-data-v2/snli.tsv',
        'gs://mesolitica-tpu-general/t5-data-v2/mnli.tsv'
    ]

    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def as_example(*fields):
        # The dataset hands the decoded tuple over as separate positional args.
        return dict(zip(['question', 'answer'], fields))

    columns = tf.data.TextLineDataset(nli_files).map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return columns.map(as_example)


def similarity_preprocessor(ds):
    """Rename 'question' to 'inputs' and 'answer' to 'targets'.

    No text is added or changed: the values are copied as they are.

    Args:
        ds: dataset of dicts holding 'question' and 'answer' entries, as
            returned by similarity_dataset.

    Returns:
        The mapped dataset of {'inputs', 'targets'} dicts.
    """
    def rename_columns(example):
        return dict(inputs = example['question'], targets = example['answer'])

    return ds.map(rename_columns, num_parallel_calls = tf.data.experimental.AUTOTUNE)


# remove() before add() so this cell can be re-run in the same kernel
# (presumably add() rejects an already-registered name — confirm).
t5.data.TaskRegistry.remove('similarity_dataset')
t5.data.TaskRegistry.add(
    'similarity_dataset',
    splits = ['train'],
    dataset_fn = similarity_dataset,
    text_preprocessor = [similarity_preprocessor],
    sentencepiece_model_path = vocab,
    # lower_text + accuracy: presumably a case-insensitive exact-match score.
    postprocess_fn = t5.data.postprocessors.lower_text,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [18]:
# Smoke test: build the registered task's dataset and print one example.
nq_task = t5.data.TaskRegistry.get('similarity_dataset')
ds = nq_task.get_dataset(
    sequence_length = dict(inputs = 256, targets = 32),
    split = 'train',
)
for ex in tfds.as_numpy(ds.take(1)):
    print(ex)

{'inputs_plaintext': b'ayat1: Tiga wanita tersenyum dan membuat kek cawan. ayat2: Ketiga wanita tersebut senang dengan baking cupcakes.', 'inputs': array([13694,   201,    31,  5082,   302, 10051,    22,   190, 14912,
       12607,     3, 13694,   215,    31,  8165,   302,   212,  3539,
          28, 27633,  9987, 19710,    16,     3,     1]), 'targets_plaintext': b'berkait', 'targets': array([19191,     1])}


In [19]:
def en_ms_dataset(split, shuffle_files = False):
    """Read en-ms.tsv as a dataset of {'question', 'answer'} examples.

    Each line is split on tabs into two string fields, with quote characters
    treated as ordinary text. The first field is stored as 'question' and the
    second as 'answer' (presumably English source and Malay translation,
    going by the file name — confirm against the data).

    Args:
        split: not read by this function; the same file is used regardless.
        shuffle_files: not read by this function; discarded immediately.

    Returns:
        A dataset of dicts with the string entries 'question' and 'answer'.
    """
    del shuffle_files

    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def as_example(*fields):
        # The dataset hands the decoded tuple over as separate positional args.
        return dict(zip(['question', 'answer'], fields))

    lines = tf.data.TextLineDataset(
        ['gs://mesolitica-tpu-general/t5-data-v2/en-ms.tsv']
    )
    columns = lines.map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return columns.map(as_example)


def en_ms_preprocessor(ds):
    """Convert {'question', 'answer'} examples into {'inputs', 'targets'} examples.

    Args:
        ds: dataset of dicts holding 'question' and 'answer' entries, as
            returned by en_ms_dataset.

    Returns:
        The mapped dataset: 'inputs' is the 'question' text with
        'terjemah Inggeris ke Melayu: ' put in front of it, 'targets' is the
        'answer' text unchanged.
    """
    task_prefix = 'terjemah Inggeris ke Melayu: '

    def add_prefix(example):
        prompt = tf.strings.join([task_prefix, example['question']])
        return {'inputs': prompt, 'targets': example['answer']}

    return ds.map(add_prefix, num_parallel_calls = tf.data.experimental.AUTOTUNE)


# remove() before add() so this cell can be re-run in the same kernel
# (presumably add() rejects an already-registered name — confirm).
t5.data.TaskRegistry.remove('en_ms_dataset')
t5.data.TaskRegistry.add(
    'en_ms_dataset',
    splits = ['train'],
    dataset_fn = en_ms_dataset,
    text_preprocessor = [en_ms_preprocessor],
    sentencepiece_model_path = vocab,
    postprocess_fn = t5.data.postprocessors.lower_text,
    # NOTE(review): exact-match accuracy is a very strict score for
    # translations — confirm whether a BLEU-style metric was intended.
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [20]:
# Smoke test: build the registered task's dataset and print one example.
nq_task = t5.data.TaskRegistry.get('en_ms_dataset')
ds = nq_task.get_dataset(
    sequence_length = dict(inputs = 1024, targets = 1024),
    split = 'train',
)
for ex in tfds.as_numpy(ds.take(1)):
    print(ex)

{'inputs_plaintext': b'terjemah Inggeris ke Melayu: He asked to remain anonymous because he fears retribution from Bell. "The company is just sort of giving the CRTC the finger," he adds. Often, the new TV plans come with none of the deals typically offered, such as discounts for customers getting multiple services. So subscribers find themselves stuck paying full price for everything including necessities like PVR rental and installation costs. In Bell\'s case, customers getting its $24.95 Fibe Starter pack also have to shell out money for Bell internet service. Pick-and-pay channels and theme pack prices are extra charges on top of all the fees. For example, Rogers is only offering added small channel packages at this point, which can run as high as $18 each. Many customers who have crunched the numbers are not happy with the results. "Checked out the \'skinny\' being offered by Rogers. Just as I suspected, getting less and paying more," commented one CBC reader.', 'inputs': array([ 

In [21]:
def ms_en_dataset(split, shuffle_files = False):
    """Read ms-en.tsv as a dataset of {'question', 'answer'} examples.

    Each line is split on tabs into two string fields, with quote characters
    treated as ordinary text. The first field is stored as 'question' and the
    second as 'answer' (presumably Malay source and English translation,
    going by the file name — confirm against the data).

    Args:
        split: not read by this function; the same file is used regardless.
        shuffle_files: not read by this function; discarded immediately.

    Returns:
        A dataset of dicts with the string entries 'question' and 'answer'.
    """
    del shuffle_files

    split_on_tab = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def as_example(*fields):
        # The dataset hands the decoded tuple over as separate positional args.
        return dict(zip(['question', 'answer'], fields))

    lines = tf.data.TextLineDataset(
        ['gs://mesolitica-tpu-general/t5-data-v2/ms-en.tsv']
    )
    columns = lines.map(
        split_on_tab,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return columns.map(as_example)


def ms_en_preprocessor(ds):
    """Convert {'question', 'answer'} examples into {'inputs', 'targets'} examples.

    Args:
        ds: dataset of dicts holding 'question' and 'answer' entries, as
            returned by ms_en_dataset.

    Returns:
        The mapped dataset: 'inputs' is the 'question' text with
        'terjemah Melayu ke Inggeris: ' put in front of it, 'targets' is the
        'answer' text unchanged.
    """
    task_prefix = 'terjemah Melayu ke Inggeris: '

    def add_prefix(example):
        prompt = tf.strings.join([task_prefix, example['question']])
        return {'inputs': prompt, 'targets': example['answer']}

    return ds.map(add_prefix, num_parallel_calls = tf.data.experimental.AUTOTUNE)


# remove() before add() so this cell can be re-run in the same kernel
# (presumably add() rejects an already-registered name — confirm).
t5.data.TaskRegistry.remove('ms_en_dataset')
t5.data.TaskRegistry.add(
    'ms_en_dataset',
    splits = ['train'],
    dataset_fn = ms_en_dataset,
    text_preprocessor = [ms_en_preprocessor],
    sentencepiece_model_path = vocab,
    postprocess_fn = t5.data.postprocessors.lower_text,
    # NOTE(review): exact-match accuracy is a very strict score for
    # translations — confirm whether a BLEU-style metric was intended.
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [22]:
# Smoke test: build the registered task's dataset and print one example.
nq_task = t5.data.TaskRegistry.get('ms_en_dataset')
ds = nq_task.get_dataset(
    sequence_length = dict(inputs = 1024, targets = 1024),
    split = 'train',
)
for ex in tfds.as_numpy(ds.take(1)):
    print(ex)

{'inputs_plaintext': b'terjemah Melayu ke Inggeris: Lao', 'inputs': array([   13, 26087,  1550,    55,  2040,    31,   710,   162,     1]), 'targets_plaintext': b'Lao', 'targets': array([710, 162,   1])}
