In [1]:
from tqdm import tqdm
import re

def cleaning(string):
    """Flatten a text snippet onto a single, space-normalised line.

    Newlines and tabs are turned into spaces, consecutive spaces are
    squeezed into one, and surrounding whitespace is stripped.
    """
    flattened = string.replace('\n', ' ').replace('\t', ' ')
    squeezed = re.sub(r'[ ]+', ' ', flattened)
    return squeezed.strip()

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin
import sentencepiece as spm
from glob import glob
import os
import json

# Parse the gin operative config shipped with the pretrained base model.
gin.parse_config_file('pretrained_models_base_operative_config.gin')
# SentencePiece model file; also passed as the task's vocabulary path further down.
vocab = 'sp10m.cased.ms-en.model'
sp = spm.SentencePieceProcessor()
# Last expression of the cell, so the value returned by Load() is displayed.
sp.Load(vocab)

True

In [3]:
# Folder holding the translated corpora. The default is the original location;
# set PARAPHRASE_DATA_DIR to run the notebook on another machine without
# editing this cell.
directory = os.environ.get('PARAPHRASE_DATA_DIR', '/home/husein/pure-text/')
# Translated PAWS files (train / dev / test); each is read line by line as JSON.
paws = ['paws-train.json.translated', 'paws-dev.json.translated',
        'paws-test.json.translated']

In [4]:
# Global accumulators: X[i] is a source sentence, Y[i] its paraphrase.
X, Y = [], []
for filename in paws:
    x, y = [], []
    with open(os.path.join(directory, filename)) as fin:
        for line in fin:
            record = json.loads(line)
            first, second = record['sentence1_ms'], record['sentence2_ms']
            # Each pair is emitted in both directions.
            # NOTE(review): every line is used; no label field is consulted here —
            # confirm that non-paraphrase pairs are not present in these files.
            x += [first, second]
            y += [second, first]

    print(len(x), len(y))
    X += x
    Y += y

43658 43658
7078 7078
7072 7072


In [5]:
# ParaSCI source / target files; entry i of one list is paired with entry i of
# the other, so the two lists must stay aligned.
parasci_src = ['train.src.truecase.translated',
               'train.src.1.truecase.translated',
               'val.src.truecase.translated',
               'val.src.1.truecase.translated']
# The third entry must be the *.tgt.* file: pairing val.src with itself would
# produce identity "paraphrases" (each sentence mapped to itself).
parasci_tgt = ['train.tgt.truecase.translated',
               'train.tgt.1.truecase.translated',
               'val.tgt.truecase.translated',
               'val.tgt.1.truecase.translated']

In [6]:
# Read each aligned (source, target) file pair and add its sentence pairs,
# in both directions, to the global X / Y lists.
for i, src_name in enumerate(parasci_src):
    file_src = os.path.join(directory, src_name)
    file_tgt = os.path.join(directory, parasci_tgt[i])
    with open(file_src) as fin:
        left = fin.read().split('\n')
    with open(file_tgt) as fin:
        right = fin.read().split('\n')

    x, y = [], []
    # zip() stops at the shorter file, so surplus lines on either side are ignored.
    for src_line, tgt_line in zip(left, right):
        if not src_line or not tgt_line:
            continue
        src_record = json.loads(src_line)
        tgt_record = json.loads(tgt_line)
        src_ms, tgt_ms = src_record['ms'], tgt_record['ms']
        x += [src_ms, tgt_ms]
        y += [tgt_ms, src_ms]

    print(len(x), len(y))
    X += x
    Y += y

57766 57766
275234 275234
5506 5506
7360 7360


In [7]:
len(X), len(Y)

(403674, 403674)

In [8]:
# Write one "<source>\t<paraphrase>" row per pair to paraphrase.tsv.
with tf.io.gfile.GFile('paraphrase.tsv', "w") as outfile:
    for i in tqdm(range(len(X))):
        l = cleaning(X[i])
        r = cleaning(Y[i])
        # Check the current pair (not the whole lists, which are always
        # non-empty here) and do it after cleaning, so whitespace-only
        # sentences are dropped instead of producing rows with an empty column.
        if len(l) and len(r):
            outfile.write("%s\t%s\n" % (l, r))

100%|██████████| 403674/403674 [00:08<00:00, 49100.94it/s]


In [11]:
def paraphrase_dataset(split, shuffle_files = False):
    """Build a dataset of {'question', 'answer'} examples from paraphrase.tsv.

    Args:
        split: accepted for interface compatibility; not used, the same file
            is read whatever value is passed.
        shuffle_files: accepted for interface compatibility and discarded.

    Returns:
        A tf.data dataset whose elements are dicts with the first TSV column
        under 'question' and the second under 'answer'.
    """
    del shuffle_files

    # Tab-separated, two string columns, quotes treated as ordinary characters.
    parse_row = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    def name_columns(*ex):
        return dict(zip(['question', 'answer'], ex))

    lines = tf.data.TextLineDataset(['paraphrase.tsv'])
    rows = lines.map(
        parse_row,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return rows.map(name_columns)

def paraphrase_preprocessor(ds):
    """Map {'question', 'answer'} examples to {'inputs', 'targets'} examples.

    'inputs' is the question prefixed with 'parafrasa: '; 'targets' is the
    answer unchanged.
    """
    def add_task_prefix(ex):
        prompt = tf.strings.join(['parafrasa: ', ex['question']])
        return {'inputs': prompt, 'targets': ex['answer']}

    return ds.map(
        add_task_prefix,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )

In [13]:
# Drop any earlier registration before adding the task again.
# NOTE(review): assumes remove() tolerates a name that is not registered yet
# (fresh kernel) — confirm against the installed t5 version.
t5.data.TaskRegistry.remove('paraphrase_dataset')
# Register the task: examples come from paraphrase_dataset, are rewritten by
# paraphrase_preprocessor, and are tokenised with the SentencePiece model in `vocab`.
# NOTE(review): accuracy is presumably an exact-match score — confirm it is the
# intended metric for generated paraphrases.
t5.data.TaskRegistry.add(
    'paraphrase_dataset',
    dataset_fn = paraphrase_dataset,
    splits = ['train'],
    text_preprocessor = [paraphrase_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)


In [14]:
# Fetch the registered task and materialise its dataset for inspection.
nq_task = t5.data.TaskRegistry.get("paraphrase_dataset")
# The split must be one the task declares ('train'), not the data file name.
# paraphrase_dataset ignores the value, so the examples are unchanged, but a
# split name outside the declared list may be rejected by the library.
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 1024, "targets": 1024})
r = tfds.as_numpy(ds)

In [15]:
# Show the first example. iter() makes this work whether tfds.as_numpy returned
# an iterator or a plain iterable; for an iterator, iter() returns it unchanged.
next(iter(r))

{'inputs_plaintext': b'parafrasa: Pada 30 Disember 1888, beliau berkahwin dengan Susie J. Clarke di Ayer, Massachusetts.',
 'inputs': array([  445,  4435,   722,    31,   206,   480,  1386,   375,  5782,
           14,   596,  3887,    28, 24529,    81,   638,     3, 18094,
           24, 13999,    14,  2470,     3,     1]),
 'targets_plaintext': b'Pada 30 Disember 1888, Susie berkahwin dengan Clarke Brown di Ayer, Massachusetts.',
 'targets': array([  206,   480,  1386,   375,  5782,    14, 24529,    81,  3887,
           28, 18094,  1659,    24, 13999,    14,  2470,     3,     1])}