In [2]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin
import sentencepiece as spm
from glob import glob
import os

gin.parse_config_file('pretrained_models_base_operative_config.gin')
vocab = 'sp10m.cased.ms-en.model'
sp = spm.SentencePieceProcessor()
sp.Load(vocab)

True

In [4]:
files = [
    '/home/husein/pure-text/dumping-parliament.txt',
    '/home/husein/pure-text/filtered-dumping-wiki.txt',
    '/home/husein/pure-text/filtered-dumping-academia.txt',
    '/home/husein/pure-text/dumping-news.txt'
]
files.extend(glob('/home/husein/pure-text/the-pile/*.txt'))

In [5]:
os.path.split(files[0])[1]

'dumping-parliament.txt'

In [6]:
import re

def cleaning(string):
    string = string.replace('\n', ' ').replace('\t', ' ')
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string

In [7]:
for file in files:
    with open(file) as fopen:
        data = list(filter(None, fopen.read().split('\n')))
    print(file, len(data))
    s = os.path.split(file)[1]
    filename = f'{s}.tsv'
    with tf.io.gfile.GFile(filename, 'w') as outfile:
        for i in range(len(data)):
            c = cleaning(data[i])
            if len(c) > 20:
                outfile.write('%s\t%s\n' % ('', c))

/home/husein/pure-text/dumping-parliament.txt 277157
/home/husein/pure-text/filtered-dumping-wiki.txt 2037602
/home/husein/pure-text/filtered-dumping-academia.txt 4086649
/home/husein/pure-text/dumping-news.txt 3781905
/home/husein/pure-text/the-pile/00.jsonl-16.translated.txt 1885251
/home/husein/pure-text/the-pile/00.jsonl-17.translated.txt 1901017
/home/husein/pure-text/the-pile/00.jsonl-18.translated.txt 1846019
/home/husein/pure-text/the-pile/00.jsonl-0.translated.txt 1111918
/home/husein/pure-text/the-pile/00.jsonl-31.translated.txt 1903699
/home/husein/pure-text/the-pile/00.jsonl-11.translated.txt 1172171
/home/husein/pure-text/the-pile/00.jsonl-8.translated.txt 1642571
/home/husein/pure-text/the-pile/00.jsonl-28.translated.txt 1830271
/home/husein/pure-text/the-pile/00.jsonl-26.translated.txt 1810003
/home/husein/pure-text/the-pile/00.jsonl-6.translated.txt 1555877
/home/husein/pure-text/the-pile/00.jsonl-25.translated.txt 1901141
/home/husein/pure-text/the-pile/00.jsonl-32.tra

In [7]:
def dumping_dataset(split, shuffle_files = False):
    del shuffle_files
    ds = tf.data.TextLineDataset(
        ['dumping-parliament.txt.tsv', 'filtered-dumping-wiki.txt.tsv']
    )

    ds = ds.map(
        functools.partial(
            tf.io.decode_csv,
            record_defaults = ['', ''],
            field_delim = '\t',
            use_quote_delim = False,
        ),
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    ds = ds.map(lambda *ex: dict(zip(['title', 'text'], ex)))
    return ds

In [8]:
t5.data.TaskRegistry.remove('dumping_dataset')
t5.data.TaskRegistry.add(
    'dumping_dataset',
    dataset_fn = dumping_dataset,
    splits = ['train'],
    text_preprocessor = functools.partial(
        t5.data.preprocessors.rekey,
        key_map = {'inputs': None, 'targets': 'text'},
    ),
    token_preprocessor = t5.data.preprocessors.unsupervised,
    sentencepiece_model_path = vocab,
    metric_fns = [],
)

  "get_sentencepiece_model_path is deprecated. Please pass the mixture or "


In [9]:
nq_task = t5.data.TaskRegistry.get("dumping_dataset")
ds = nq_task.get_dataset(split='qa.tsv', sequence_length={"inputs": 1024, "targets": 1024})
r = tfds.as_numpy(ds)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 


In [10]:
next(r)

{'inputs': array([   46, 32099,   331,   274,    29, 32098,  1123,   252,   167,
           46,    17,  2627,    27,  9211,    29,   470,    47,    76,
          112,   123,    16,    29,    51,    47,    76,   711,   123,
           46, 11059,   331,   274,    29,  1014,   250,    31,    46,
          167,    15,     7, 32097,   165,   611,  7476,     7,  5559,
          639,    47,   124,  1008,    27,   635,    16,    53,   287,
           17,    85,   567,    21,   178,  2155,   736,    28,   165,
          109,   119,     7,   704,   171,    17,   779,  2336,  1930,
          689,    17,    29, 32096,  4756,   308,   636,   610,   308,
        32095,     3,  1439,    15,    12,  2742, 32094,  2742, 11504,
          228, 29367,   326,  1133,    15,    12,   875,   288,  1299,
          511,   875,   477,   228, 12114,   326,   571,   402,  4523,
          249,  6214,    17,  5228,  2516,    32,   184,  2971,  8315,
           17,    64,    18,    40,    16,  2374,  6214,    65,  43