In [3]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/text-similarity/mnli/multinli_1.0_dev_matched.json.translated
# !wget https://f000.backblazeb2.com/file/malay-dataset/text-similarity/mnli/multinli_1.0_dev_mismatched.json.translated
# !wget https://f000.backblazeb2.com/file/malay-dataset/text-similarity/mnli/multinli_1.0_train.json.translated

In [4]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/text-similarity/snli/snli_1.0_train.json.translate
# !wget https://f000.backblazeb2.com/file/malay-dataset/text-similarity/snli/snli_1.0_test.json.translate
# !wget https://f000.backblazeb2.com/file/malay-dataset/text-similarity/snli/snli_1.0_dev.json.translate

In [16]:
from glob import glob
import tensorflow as tf
import json
from pathlib import Path
import re

def cleaning(string):
    """Collapse all whitespace in ``string`` into single spaces.

    Every run of whitespace (spaces, newlines, tabs, carriage returns, ...)
    is replaced by one space and the result is stripped at both ends.
    Handling tabs and carriage returns matters because the cleaned text is
    written as one column of a tab-separated line: a tab or line break left
    inside the text would split the row into the wrong number of fields.

    Args:
        string: text to normalise.

    Returns:
        The text on a single line with single-space separation.
    """
    return re.sub(r'\s+', ' ', string).strip()

In [23]:
# List the translated MultiNLI files present in the working directory.
# NOTE(review): glob() returns paths in arbitrary order, so positional
# indexing into `files` (as the next cell does) is not stable across machines.
files = glob('multinli_1.0*')
files

['multinli_1.0_train.json.translated',
 'multinli_1.0_dev_mismatched.json.translated',
 'multinli_1.0_dev_matched.json.translated']

In [28]:
# Open one of the listed files to peek at a raw record.
# NOTE(review): the handle is not closed in any visible cell, and which file
# files[-2] refers to depends on the order glob() happened to return.
f = open(files[-2])

In [30]:
# Read the next line from the open file, parse it as one JSON record and
# display it; each run of this cell advances the file by one line.
l = json.loads(next(f))
l

{'gold_label': 'contradiction',
 'promptID': '133794',
 'pairID': '133794c',
 'genre': 'verbatim',
 'en': ["The answer has nothing to do with their cause, however, but with the simple fact that dictionaries are not exercises in bi-unique substitutability; in other words, if one of the senses of run is `operate' (as in She runs an engine factory ), that does not make it valid to assume that one can substitute operate for run in We run in the marathon every year .  Although recognizing this as a shortcoming of dictionaries and assigning it arbitrarily to what, for lack of a better term, we might call the  genius  of the language, might seem trivial to the casual observer, it is a valid matter for concern in the realm of lexicology.",
  'Dictionaries are indeed exercises in bi-unique substitutability.'],
 'ms': ["Jawapannya tidak ada kaitan dengan tujuan mereka, tetapi dengan fakta sederhana bahawa kamus bukan latihan dalam penggantian dua unik; dengan kata lain, jika salah satu deria lar

In [22]:
# English gold labels -> Malay target strings. A label with no entry here
# (for example 'neutral') is written unchanged.
# NOTE(review): confirm whether 'neutral' is meant to stay untranslated.
labels = {'contradiction': 'percanggahan', 'entailment': 'berkait'}

# Convert every translated SNLI file into one "<question>\t<label>" TSV.
filename = 'snli.tsv'
files = glob('snli_1.0*')
with tf.io.gfile.GFile(filename, 'w') as outfile:
    for file in files:
        print(file)
        # Each SNLI file is a single JSON document holding a sequence of records.
        with open(file) as fopen:
            data = json.load(fopen)
        print(file, len(data))

        for row in data:
            # Keep only rows whose translation holds exactly two sentences.
            if len(row['ms']) != 2:
                continue
            # '-' is the corpus marker for pairs without an annotator
            # majority; writing it out would add a bogus '-' target class.
            if row['gold_label'] == '-':
                continue

            label = labels.get(row['gold_label'], row['gold_label'])
            q = f"ayat1: {cleaning(row['ms'][0])} ayat2: {cleaning(row['ms'][1])}"
            outfile.write('%s\t%s\n' % (q, label))

snli_1.0_test.json.translate
snli_1.0_test.json.translate 10000
snli_1.0_train.json.translate
snli_1.0_train.json.translate 550152
snli_1.0_dev.json.translate
snli_1.0_dev.json.translate 10000


In [32]:
# Convert every translated MultiNLI file (one JSON record per line) into one
# "<question>\t<label>" TSV. Relies on `labels` and `cleaning` defined in
# earlier cells.
files = glob('multinli_1.0*')

filename = 'mnli.tsv'
with tf.io.gfile.GFile(filename, 'w') as outfile:
    for file in files:
        print(file)
        with open(file) as fopen:
            for line in fopen:
                # A blank line (e.g. trailing newline) is not a JSON record.
                if not line.strip():
                    continue
                record = json.loads(line)

                # Keep only rows whose translation holds exactly two sentences.
                if len(record['ms']) != 2:
                    continue
                # '-' is the corpus marker for pairs without an annotator
                # majority; writing it out would add a bogus '-' target class.
                if record['gold_label'] == '-':
                    continue

                label = labels.get(record['gold_label'], record['gold_label'])
                q = f"ayat1: {cleaning(record['ms'][0])} ayat2: {cleaning(record['ms'][1])}"
                outfile.write('%s\t%s\n' % (q, label))

multinli_1.0_train.json.translated
multinli_1.0_dev_mismatched.json.translated
multinli_1.0_dev_matched.json.translated


In [33]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin

# Parse the gin configuration file expected in the working directory.
gin.parse_config_file('pretrained_models_base_operative_config.gin')
# Path of the SentencePiece model file; passed as `sentencepiece_model_path`
# when the task is registered below.
vocab = 'sp10m.cased.ms-en.model'

In [34]:
def dataset(split, shuffle_files = False):
    """Build the raw question/answer dataset from the generated TSV files.

    ``split`` and ``shuffle_files`` are accepted for the dataset-function
    signature but are not used: every call reads the lines of 'mnli.tsv'
    and 'snli.tsv'.

    Returns:
        A dataset whose elements are dicts with string fields 'question'
        and 'answer', taken from the first and second tab-separated column.
    """
    del shuffle_files

    def split_columns(line):
        # Two string columns per line; quote characters are ordinary text.
        return tf.io.decode_csv(
            line,
            record_defaults = ['', ''],
            field_delim = '\t',
            use_quote_delim = False,
        )

    def as_example(question, answer):
        return {'question': question, 'answer': answer}

    lines = tf.data.TextLineDataset(['mnli.tsv', 'snli.tsv'])
    columns = lines.map(
        split_columns,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return columns.map(as_example)

def preprocessor(ds):
    """Rename each example's fields to the 'inputs' / 'targets' pair.

    Args:
        ds: dataset of dicts carrying 'question' and 'answer'.

    Returns:
        The mapped dataset, where 'inputs' holds the question and 'targets'
        holds the answer.
    """
    def _rename_fields(example):
        inputs, targets = example['question'], example['answer']
        return {'inputs': inputs, 'targets': targets}

    parallelism = tf.data.experimental.AUTOTUNE
    return ds.map(_rename_fields, num_parallel_calls = parallelism)

In [35]:
# The two TSV files produced by the conversion cells.
# NOTE(review): this list is not referenced by the registration below;
# `dataset` hard-codes the same two paths.
files = ['mnli.tsv', 'snli.tsv']

# Drop any earlier registration under this name, then register the task
# (presumably so the cell can be re-run without a duplicate-name error).
t5.data.TaskRegistry.remove('similarity_dataset')
# Wire together the dataset function, the text preprocessor, the
# SentencePiece vocabulary path and an accuracy metric; 'train' is the only
# declared split.
t5.data.TaskRegistry.add(
    'similarity_dataset',
    dataset_fn = dataset,
    splits = ['train'],
    text_preprocessor = [preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)

In [36]:
# Look up the registered task and build its dataset.
nq_task = t5.data.TaskRegistry.get("similarity_dataset")
# Name the registered split explicitly: 'train' is the only split declared
# for this task, and the call must not depend on a loop variable left over
# from an earlier cell.
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 1024, "targets": 1024})

In [None]:
# Wrap in iter() so the result can be advanced with next(): depending on the
# tensorflow_datasets version, as_numpy() returns either a generator or a
# plain iterable, and only the former is itself an iterator.
r = iter(tfds.as_numpy(ds))

In [None]:
# Pull one example from the NumPy view of the dataset and display it as a
# sanity check; each run of this cell advances `r` by one element.
next(r)