In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin
import sentencepiece as spm
from glob import glob
import os

# Load gin bindings from the operative config file
# (presumably the pretrained T5 "base" configuration, going by the file name — TODO confirm).
gin.parse_config_file('pretrained_models_base_operative_config.gin')
# Path to the SentencePiece model; reused below both for task registration
# and for decoding token ids back to text.
vocab = 'sp10m.cased.t5.model'
sp = spm.SentencePieceProcessor()
# Last expression of the cell, so its return value is displayed
# (True in the recorded run).
sp.Load(vocab)

True

In [2]:
# Collect the raw text dumps, leaving out the twitter and common-crawl dumps.
files = [
    path
    for path in glob('../pure-text/dumping*.txt')
    if not ('twitter' in path or 'common' in path)
]
files

['../pure-text/dumping-instagram.txt',
 '../pure-text/dumping-parliament.txt',
 '../pure-text/dumping-iium.txt',
 '../pure-text/dumping-wiki.txt',
 '../pure-text/dumping-news.txt',
 '../pure-text/dumping-watpadd.txt',
 '../pure-text/dumping-pdf.txt']

In [3]:
# Preview the bare file name (directory stripped) of the first dump.
os.path.basename(files[0])

'dumping-instagram.txt'

In [4]:
import re

def cleaning(string):
    """Flatten `string` onto a single line that is safe to store in one TSV field.

    Every run of spaces, tabs, carriage returns and newlines is collapsed to
    a single space, and leading/trailing whitespace is stripped.

    Tabs have to be removed as well as newlines: the cleaned text is written
    as a tab-separated column, so an embedded tab would add an extra field
    to the record.

    Args:
        string: raw text to normalise.

    Returns:
        The whitespace-normalised text (possibly empty).
    """
    return re.sub(r'[ \t\r\n]+', ' ', string).strip()

In [5]:
# Convert every raw dump into a two-column TSV (<empty title>\t<cleaned text>)
# written to the working directory as '<original file name>.tsv'.
for file in files:
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file, encoding = 'utf-8') as fopen:
        # One record per non-empty line of the dump.
        data = [line for line in fopen.read().split('\n') if line]
    print(file, len(data))
    filename = f'{os.path.basename(file)}.tsv'
    with tf.io.gfile.GFile(filename, 'w') as outfile:
        for line in data:
            # The first column (title) is intentionally left empty.
            outfile.write('%s\t%s\n' % ('', cleaning(line)))

../pure-text/dumping-instagram.txt 3226766
../pure-text/dumping-parliament.txt 890823
../pure-text/dumping-iium.txt 1121978
../pure-text/dumping-wiki.txt 1715551
../pure-text/dumping-news.txt 1791784
../pure-text/dumping-watpadd.txt 1445379
../pure-text/dumping-pdf.txt 596417


In [1]:
# Sanity check: list the TSV files produced by the conversion loop.
# Uses `glob` from the import cell, so that cell must have been run
# in the current kernel before this one.
glob('dumping*.txt.tsv')

NameError: name 'glob' is not defined

In [7]:
def dumping_dataset(split, shuffle_files = False):
    """Build a dataset over every `dumping*.tsv` file in the working directory.

    Each line is parsed as a two-column, tab-separated record with quoting
    disabled.

    Args:
        split: ignored; all matching files are read whatever split is requested.
        shuffle_files: ignored; files are always read in sorted name order.

    Returns:
        A `tf.data.Dataset` yielding one dict per TSV line with the string
        features 'title' (first column) and 'text' (second column).
    """
    del split, shuffle_files

    # glob() gives no ordering guarantee; sort so the example order is
    # reproducible across runs and machines.
    tsv_files = sorted(glob('dumping*.tsv'))
    ds = tf.data.TextLineDataset(tsv_files)

    parse_line = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )
    ds = ds.map(
        parse_line,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    # decode_csv yields one tensor per column; name them.
    ds = ds.map(lambda *ex: dict(zip(['title', 'text'], ex)))
    return ds

In [8]:
# Map the 'text' column onto 'targets' (and leave 'inputs' unset) before the
# unsupervised token preprocessor is applied.
rekey_text_as_targets = functools.partial(
    prep.rekey,
    key_map = {'inputs': None, 'targets': 'text'},
)

# Drop any previous registration first so the cell can be re-run.
t5.data.TaskRegistry.remove('dumping_dataset')
t5.data.TaskRegistry.add(
    'dumping_dataset',
    dataset_fn = dumping_dataset,
    splits = ['train'],
    text_preprocessor = rekey_text_as_targets,
    token_preprocessor = prep.unsupervised,
    sentencepiece_model_path = vocab,
    metric_fns = [],
)

  "get_sentencepiece_model_path is deprecated. Please pass the mixture or "


In [None]:
from tqdm import tqdm

# Run 15 passes over the preprocessed task and dump every example as one
# line of text, '<inputs> [[EENNDD]] <targets>', into shard files named
# 'dumping-{pass}-{part}.parse' holding at most `batch_size` examples each.

# The task lookup does not depend on the pass, so do it once.
nq_task = t5.data.TaskRegistry.get("dumping_dataset")
batch_size = 100000  # maximum number of examples per shard

for k in range(15):
    # 'train' is the only split the task registers (the dataset function
    # ignores the value, but newer T5 versions validate it).
    ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 768, "targets": 768})
    index, part = 0, 0

    fopen = open(f'dumping-{k}-{part}.parse', 'w', encoding='utf-8')
    try:
        for ex in tqdm(tfds.as_numpy(ds)):
            # Roll over *before* writing: every shard then holds exactly
            # `batch_size` examples (except possibly the last), and no
            # empty trailing shard is created.
            if index == batch_size:
                fopen.close()
                part += 1
                index = 0
                fopen = open(f'dumping-{k}-{part}.parse', 'w', encoding='utf-8')

            i = sp.DecodeIds(ex['inputs'].tolist())
            t = sp.DecodeIds(ex['targets'].tolist())
            fopen.write(f'{i} [[EENNDD]] {t}\n')
            index += 1
    finally:
        # Close the current shard even if iteration or decoding fails.
        fopen.close()

INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 


241049it [34:24, 116.74it/s]


INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 


241049it [33:56, 118.36it/s]


INFO:tensorflow:tokens_length=1137 inputs_length=1024 targets_length=229 noise_density=0.15 mean_noise_span_length=3.0 


35700it [05:23, 138.76it/s]

In [10]:
!tail -n 5 dumping-0.parse

menjawab , " yang mudah hilang ‘ ), mereka ber© dengan mereka karena tahta , memuliakan penguasa tapi meremehkan rakyatnya ". ( Tarikhuna Al Adzim ) . Bntr lgi # kartiniday̳mpong d skoHP . Hasil hujung minggu # tombakketam#carimakan#kerapu#udang#ketam#ketamsuri Ayam Rendang . Tangan dah mula gatal2 niSapa yang nak naikkan seri baju boleh hantar mulai - Terwujudnya bisyarah atas pembebasan Konstanstinopel bukanlah peristiwa kebetulan , melainkan sebuah penak■kan besar yang dengan dirintis Sejak kecil , Muhammad Al Fatih telah dididik oleh ulama pilihan , Syekh Aaq Syamsuddin yang mengajarkannya banyak disiplin ilmu hingga tumbuh menjadi remaja dengan kepribadian percikan unggul . Ia adalah nabati pernah meninggalkan shalat wajib lima waktu , shalat sunnah rawatib , bahkan shalat tahajjud . Tidak hanyakan kemampuan essence juga dibentuk mental pembebas melalui kameo para penakluk , kisah syahid danNFDP serta senantiasa diingatkan mengenai bisyarah Rasulullah dan terwujudnya janji