In [1]:
import os

# Expose no CUDA devices to this process.
# NOTE(review): presumably meant to force CPU-only execution; it is set here, before the
# TensorFlow import in the following cell, so that it is already in place at import time — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin
import sentencepiece as spm
from glob import glob
import os

# Load gin bindings from the operative config file (path is relative to the working directory).
gin.parse_config_file('pretrained_models_base_operative_config.gin')
# SentencePiece model file; the same path is later handed to the T5 task registration.
vocab = 'sp10m.cased.ms-en.model'
sp = spm.SentencePieceProcessor()
# Last expression of the cell, so the value returned by Load is what the cell displays.
sp.Load(vocab)

True

In [6]:
# Raw text dumps that the conversion cell below turns into TSV files.
# NOTE(review): these are absolute paths on one machine; consider a configurable data directory.
files = [
    '/home/husein/pure-text/dumping-iium.txt',
    '/home/husein/pure-text/dumping-twitter.txt',
    '/home/husein/pure-text/dumping-instagram.txt'
]

In [7]:
# Quick check of the base name used to build the TSV file names.
os.path.basename(files[0])

'dumping-iium.txt'

In [8]:
import re

def cleaning(string):
    """Collapse whitespace runs in `string` and trim both ends.

    Every run made of spaces, tabs and newlines becomes a single space,
    after which leading and trailing whitespace is stripped.
    """
    # Treating tab/newline/space as one character class in a single pass is
    # equivalent to first turning tabs and newlines into spaces and then
    # squeezing runs of spaces.
    collapsed = re.sub(r'[ \t\n]+', ' ', string)
    return collapsed.strip()

In [9]:
for file in files:
    # Read the whole dump and keep only its non-empty lines.
    with open(file) as fopen:
        data = [line for line in fopen.read().split('\n') if line]
    print(file, len(data))

    # The output goes to "<dump base name>.tsv", relative to the working directory.
    s = os.path.split(file)[1]
    filename = f'{s}.tsv'
    with tf.io.gfile.GFile(filename, 'w') as outfile:
        for line in data:
            c = cleaning(line)
            if len(c) <= 20:
                # Drop lines that are 20 characters or fewer once cleaned.
                continue
            # Two tab-separated columns per row; the first column is left empty.
            outfile.write('%s\t%s\n' % ('', c))

/home/husein/pure-text/dumping-iium.txt 1121978
/home/husein/pure-text/dumping-twitter.txt 10692819
/home/husein/pure-text/dumping-instagram.txt 3226766


In [14]:
def dumping_dataset(split, shuffle_files = False):
    """Build a dataset of {'title', 'text'} examples from the generated TSV files.

    Args:
        split: accepted for the dataset-function signature but not used; the
            same two TSV files are read regardless of its value.
        shuffle_files: discarded immediately; no shuffling is done here.

    Returns:
        A tf.data dataset whose elements are dicts with the keys 'title' and
        'text', one per line of the TSV files.
    """
    del shuffle_files

    tsv_paths = ['dumping-iium.txt.tsv', 'dumping-twitter.txt.tsv']

    # Each line is split on tabs into two string columns; quote characters
    # are not treated specially.
    parse_row = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def to_example(*fields):
        # Pair the parsed columns with their feature names.
        return dict(zip(['title', 'text'], fields))

    lines = tf.data.TextLineDataset(tsv_paths)
    parsed = lines.map(
        parse_row,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return parsed.map(to_example)

In [15]:
# Text preprocessor: expose the 'text' feature as 'targets' and map 'inputs' to None.
rekey_text_to_targets = functools.partial(
    t5.data.preprocessors.rekey,
    key_map = {'inputs': None, 'targets': 'text'},
)

# Remove any existing registration under this name before adding it again.
# NOTE(review): presumably so that this cell can be re-run in the same kernel — confirm.
t5.data.TaskRegistry.remove('dumping_dataset')
t5.data.TaskRegistry.add(
    'dumping_dataset',
    dataset_fn = dumping_dataset,
    splits = ['train'],
    text_preprocessor = rekey_text_to_targets,
    token_preprocessor = t5.data.preprocessors.unsupervised,
    sentencepiece_model_path = vocab,
    metric_fns = [],
)

In [16]:
# Fetch the registered task and build its preprocessed dataset.
# The split is 'train', the only split the task declares. The previous value
# 'qa.tsv' named no declared split and only worked because dumping_dataset
# ignores its `split` argument, so the data produced is unchanged.
nq_task = t5.data.TaskRegistry.get("dumping_dataset")
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 1024, "targets": 1024})
# Wrap the dataset so that iterating it yields numpy values instead of tensors.
r = tfds.as_numpy(ds)

INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 


In [17]:
# Pull one example from the numpy iterator to inspect the preprocessed features.
next(r)

{'inputs': array([  203, 27499,     3,  9827,   266,   420, 32099,   108,   486,
        13597,     3, 32098,   103,   322,   280,     3,  1236,   354,
         2202,  2683,    25,  6572, 10897,    14,  1887,   103,    13,
        21206,   205,  9132,    13, 11965,     3,   335,   155,    17,
          200,     3,  1616,    17,   736,    14, 20086, 12231,    13,
        11965,     3,  2569,    13, 21206,   205,  9132,   200,  6765,
            3, 16602,   317,   802,  1428,     3, 20300,   802,  1330,
         1229,     3, 20300,    24, 32097,  1830,   271,  1817,   140,
         1644,     3,  5746,  1518, 17294,    14,    13,   117, 32096,
           14,    13, 17235,  4878, 32095, 10294,    24,    50, 32094,
           13,  2906, 10891,  7277,    14,   802,  3249,     3, 16602,
          317,    97,    24,  1231,   279, 32093,     3, 16602,   317,
          532,   987,  3871,  4755,     3, 20300, 10294, 12120,   117,
         2695, 10773,    41,   176,     3, 32092,   100,   155, 320