In [1]:
import os

# Set the service-account key file path before any GCS access happens;
# presumably required for the gs:// reads further down (confirm).
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': 'mesolitica-storage.json'})

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin
import sentencepiece as spm
from glob import glob
import os

# Load the gin bindings from the operative config file shipped next to this notebook.
gin.parse_config_file('pretrained_models_base_operative_config.gin')
# Path of the SentencePiece model; also passed to the task registrations below
# as `sentencepiece_model_path`.
vocab = 'sp10m.cased.t5.model'
# Processor used by the export cells to decode token ids back into text.
sp = spm.SentencePieceProcessor()
# Last expression of the cell, so its return value is what the notebook displays.
sp.Load(vocab)

True

In [3]:
def stemming_dataset(split, shuffle_files = False):
    """Build the raw stemming dataset from the two-column TSV on GCS.

    Args:
        split: accepted for the `dataset_fn` signature; not used, the same
            file is read regardless of its value.
        shuffle_files: accepted for the `dataset_fn` signature and discarded.

    Returns:
        A dataset whose elements are dicts with the keys 'question' and
        'answer', taken from the first and second tab-separated column.
    """
    del shuffle_files

    column_names = ['question', 'answer']
    # Tab-delimited, two string columns, quote characters treated literally.
    parse_line = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    raw_lines = tf.data.TextLineDataset(
        ['gs://mesolitica-general/t5-data/stemming.tsv']
    )
    parsed_rows = raw_lines.map(
        parse_line,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return parsed_rows.map(lambda *fields: dict(zip(column_names, fields)))

def stemming_preprocessor(ds):
    """Turn question/answer examples into text-to-text inputs and targets.

    Args:
        ds: dataset of dicts carrying 'question' and 'answer' entries.

    Returns:
        The dataset mapped to dicts with 'inputs' (the question prefixed
        with 'punca: ') and 'targets' (the answer, unchanged).
    """
    task_prefix = 'punca: '

    def _as_text_pair(example):
        prompt = tf.strings.join([task_prefix, example['question']])
        return {'inputs': prompt, 'targets': example['answer']}

    autotune = tf.data.experimental.AUTOTUNE
    return ds.map(_as_text_pair, num_parallel_calls = autotune)

In [4]:
# Drop any earlier registration under this name before adding it again --
# presumably so the cell can be re-run in the same kernel without a
# duplicate-name error (confirm against t5's TaskRegistry.add).
t5.data.TaskRegistry.remove('stemming_dataset')
# Register the stemming task: rows come from `stemming_dataset`, are turned
# into inputs/targets by `stemming_preprocessor`, and are tokenized with the
# SentencePiece model at `vocab`.
# NOTE(review): the captured cell output shows a "get_sentencepiece_model_path
# is deprecated" warning -- verify whether `sentencepiece_model_path` is the
# argument that triggers it in the installed t5 version.
t5.data.TaskRegistry.add(
    'stemming_dataset',
    dataset_fn = stemming_dataset,
    splits = ['train'],
    text_preprocessor = [stemming_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

  "get_sentencepiece_model_path is deprecated. Please pass the mixture or "


In [5]:
from tqdm import tqdm

# Export the tokenized stemming task back to plain text, one example per line
# in the form '<inputs> [[EENNDD]] <targets>', split across numbered
# 'stemming-<part>.parse' files of at most `batch_size` lines each.
nq_task = t5.data.TaskRegistry.get("stemming_dataset")
# The task is registered with splits = ['train'] only. `stemming_dataset`
# never reads `split`, so the data is the same, but ask for the declared
# split instead of the stray 'qa.tsv' value.
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 768, "targets": 768})

batch_size, index, part = 100000, 0, 0
# Explicit UTF-8 so the decoded text is written the same way on every platform.
fopen = open(f'stemming-{part}.parse', 'w', encoding='utf-8')
try:
    for ex in tqdm(tfds.as_numpy(ds)):
        # Roll over *before* writing, once the current part is full. Checking
        # after the write (with `index` lagging by one) put batch_size + 1
        # lines into part 0, and could leave an empty trailing part file.
        if index == batch_size:
            fopen.close()
            part += 1
            index = 0
            fopen = open(f'stemming-{part}.parse', 'w', encoding='utf-8')

        # Decode the token ids back to text with the shared SentencePiece model.
        i = sp.DecodeIds(ex['inputs'].tolist())
        t = sp.DecodeIds(ex['targets'].tolist())
        text = f'{i} [[EENNDD]] {t}\n'
        fopen.write(text)
        index += 1
finally:
    # Always release the current part file, even if decoding or iteration fails.
    fopen.close()

200000it [02:08, 1558.24it/s]


In [9]:
def synonym_dataset(split, shuffle_files = False):
    """Build the raw synonym dataset from the two-column TSV on GCS.

    Args:
        split: accepted for the `dataset_fn` signature; not used, the same
            file is read regardless of its value.
        shuffle_files: accepted for the `dataset_fn` signature and discarded.

    Returns:
        A dataset whose elements are dicts with the keys 'question' and
        'answer', taken from the first and second tab-separated column.
    """
    del shuffle_files

    column_names = ['question', 'answer']
    # Tab-delimited, two string columns, quote characters treated literally.
    parse_line = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    raw_lines = tf.data.TextLineDataset(
        ['gs://mesolitica-general/t5-data/synonyms.tsv']
    )
    parsed_rows = raw_lines.map(
        parse_line,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return parsed_rows.map(lambda *fields: dict(zip(column_names, fields)))


def synonym_preprocessor(ds):
    """Turn question/answer examples into text-to-text inputs and targets.

    Args:
        ds: dataset of dicts carrying 'question' and 'answer' entries.

    Returns:
        The dataset mapped to dicts with 'inputs' (the question prefixed
        with 'sinonim: ') and 'targets' (the answer, unchanged).
    """
    task_prefix = 'sinonim: '

    def _as_text_pair(example):
        prompt = tf.strings.join([task_prefix, example['question']])
        return {'inputs': prompt, 'targets': example['answer']}

    autotune = tf.data.experimental.AUTOTUNE
    return ds.map(_as_text_pair, num_parallel_calls = autotune)

In [10]:
# Drop any earlier registration under this name before adding it again --
# presumably so the cell can be re-run in the same kernel without a
# duplicate-name error (confirm against t5's TaskRegistry.add).
t5.data.TaskRegistry.remove('synonym_dataset')
# Register the synonym task: rows come from `synonym_dataset`, are turned
# into inputs/targets by `synonym_preprocessor`, and are tokenized with the
# SentencePiece model at `vocab`.
t5.data.TaskRegistry.add(
    'synonym_dataset',
    dataset_fn = synonym_dataset,
    splits = ['train'],
    text_preprocessor = [synonym_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [11]:
from tqdm import tqdm

# Export the tokenized synonym task back to plain text, one example per line
# in the form '<inputs> [[EENNDD]] <targets>', split across numbered
# 'synonym-<part>.parse' files of at most `batch_size` lines each.
nq_task = t5.data.TaskRegistry.get("synonym_dataset")
# The task is registered with splits = ['train'] only. `synonym_dataset`
# never reads `split`, so the data is the same, but ask for the declared
# split instead of the stray 'qa.tsv' value.
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 768, "targets": 768})

batch_size, index, part = 100000, 0, 0
# Explicit UTF-8 so the decoded text is written the same way on every platform.
fopen = open(f'synonym-{part}.parse', 'w', encoding='utf-8')
try:
    for ex in tqdm(tfds.as_numpy(ds)):
        # Roll over *before* writing, once the current part is full. Checking
        # after the write (with `index` lagging by one) put batch_size + 1
        # lines into part 0, and could leave an empty trailing part file.
        if index == batch_size:
            fopen.close()
            part += 1
            index = 0
            fopen = open(f'synonym-{part}.parse', 'w', encoding='utf-8')

        # Decode the token ids back to text with the shared SentencePiece model.
        i = sp.DecodeIds(ex['inputs'].tolist())
        t = sp.DecodeIds(ex['targets'].tolist())
        text = f'{i} [[EENNDD]] {t}\n'
        fopen.write(text)
        index += 1
finally:
    # Always release the current part file, even if decoding or iteration fails.
    fopen.close()

150000it [02:02, 1225.62it/s]
