In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin

In [2]:
# tf.compat.v1.enable_eager_execution()

In [3]:
# SentencePiece model file; passed as `sentencepiece_model_path` when the task is registered.
vocab = 'sp10m.cased.t5.model'

# Load the gin bindings from the operative config file.
gin.parse_config_file('pretrained_models_base_operative_config.gin')

In [4]:
from glob import glob

# Every dumped TSV under t5-data/, skipping any path that contains 'pair'.
files = [
    path
    for path in glob('t5-data/dumping-*.tsv')
    if 'pair' not in path
]
files

['t5-data/dumping-watpadd.tsv',
 't5-data/dumping-iium.tsv',
 't5-data/dumping-news.tsv',
 't5-data/dumping-wiki.tsv',
 't5-data/dumping-parliament.tsv',
 't5-data/dumping-pdf.tsv']

In [5]:
def dumping_dataset(split, shuffle_files = False):
    """Build a dataset of ``{'title', 'text'}`` examples from a tab-separated file.

    Args:
        split: path of the TSV file to read; it is handed directly to
            ``tf.data.TextLineDataset``.
        shuffle_files: accepted but ignored (deleted immediately).

    Returns:
        A ``tf.data.Dataset`` whose elements are dicts with the keys
        ``'title'`` and ``'text'``, one per line of the file.
    """
    del shuffle_files

    column_names = ['title', 'text']

    def split_columns(line):
        # Two string columns, tab-delimited, with quote handling switched off.
        return tf.io.decode_csv(
            line,
            record_defaults = ['', ''],
            field_delim = '\t',
            use_quote_delim = False,
        )

    def to_example(*columns):
        # Pair each parsed column with its name.
        return dict(zip(column_names, columns))

    lines = tf.data.TextLineDataset([split])
    parsed = lines.map(
        split_columns,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return parsed.map(to_example)

In [6]:
# Remove the task registered under this name, then register it afresh.
t5.data.TaskRegistry.remove('dumping_dataset')
t5.data.TaskRegistry.add(
    'dumping_dataset',
    splits = ['train'],
    dataset_fn = dumping_dataset,
    # Rekey each example: 'targets' is taken from 'text', 'inputs' is mapped to None.
    text_preprocessor = functools.partial(
        prep.rekey,
        key_map = dict(inputs = None, targets = 'text'),
    ),
    token_preprocessor = prep.unsupervised,
    metric_fns = [],
    sentencepiece_model_path = vocab,
)

  "get_sentencepiece_model_path is deprecated. Please pass the mixture or "


In [16]:
# `os` and `json` are used below but were not imported anywhere else in the
# notebook, so a fresh-kernel run failed with NameError.
import json
import os

from tqdm import tqdm

# The task lookup does not depend on the file being processed, so do it once.
nq_task = t5.data.TaskRegistry.get("dumping_dataset")

for file in files:
    print(file)
    f = os.path.basename(file)
    # The file path is passed as the split name; dumping_dataset reads it directly.
    ds = nq_task.get_dataset(split=file, sequence_length={"inputs": 1024, "targets": 1024})

    # Convert the numpy arrays to plain lists so the pairs are JSON-serialisable.
    results = []
    for ex in tqdm(tfds.as_numpy(ds)):
        results.append((ex['inputs'].tolist(), ex['targets'].tolist()))

    # One output file per input, written to the current working directory.
    with open(f'{f}.parse', 'w') as fopen:
        json.dump(results, fopen)

t5-data/dumping-watpadd.tsv
INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 


24245it [04:05, 98.86it/s] 


t5-data/dumping-iium.tsv
INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 


19477it [03:12, 100.96it/s]


t5-data/dumping-news.tsv
INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 


48592it [05:42, 141.75it/s]


t5-data/dumping-wiki.tsv
INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 


45764it [05:34, 136.63it/s]


t5-data/dumping-parliament.tsv
INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 


12812it [02:28, 86.49it/s] 


t5-data/dumping-pdf.tsv
INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 


10101it [01:40, 100.24it/s]
