In [1]:
import os

# Expose no CUDA devices to this process. This is set before TensorFlow is
# imported in the next cell, presumably to keep the whole pipeline on CPU.
os.environ.update({'CUDA_VISIBLE_DEVICES': ''})

In [2]:
import functools
import os
from glob import glob

import gin
import sentencepiece as spm
import t5
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
from tqdm import tqdm

# SentencePiece model file; reused further down when the T5 task is registered.
vocab = 'sp10m.cased.ms-en.model'

# Apply the gin bindings stored in the operative config file.
gin.parse_config_file('pretrained_models_base_operative_config.gin')

# Load the tokenizer. Load() stays the last expression so the cell still
# displays its return value.
sp = spm.SentencePieceProcessor()
sp.Load(vocab)

True

In [8]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/normalization/twitter-normalized.json

import json

# Read the normalization pairs. The encoding is pinned so the result does not
# depend on the platform's default text encoding.
with open('twitter-normalized.json', encoding = 'utf-8') as fopen:
    data = json.load(fopen)

# Each entry is indexed as [noisy text, normalized text]. A pair is kept only
# when both sides still have content once surrounding whitespace is ignored:
# a whitespace-only side would otherwise pass a plain length check and end up
# as an empty field after cleaning, when the TSV is written.
X, Y = [], []
for pair in tqdm(data):
    noisy, normalized = pair[0], pair[1]
    if noisy.strip() and normalized.strip():
        X.append(noisy)
        Y.append(normalized)

100%|██████████| 689290/689290 [00:00<00:00, 1098628.64it/s]


In [9]:
import re

def cleaning(string):
    """Flatten a text onto a single line with single-space separators.

    Every newline and tab is turned into a space, runs of spaces are
    collapsed to one, and surrounding whitespace is stripped.

    Args:
        string: the text to normalize.

    Returns:
        The cleaned text, which contains no tab or newline characters.
    """
    flattened = string
    for separator in ('\n', '\t'):
        flattened = flattened.replace(separator, ' ')
    return re.sub(r'[ ]+', ' ', flattened).strip()

In [10]:
# Write one "<input>\t<target>" row per pair. Both sides go through cleaning()
# first, which removes the tabs and newlines that would break the row layout.
with tf.io.gfile.GFile('twitter-norm.tsv', "w") as outfile:
    for source_text, target_text in tqdm(zip(X, Y), total = len(X)):
        row = (cleaning(source_text), cleaning(target_text))
        outfile.write("%s\t%s\n" % row)

100%|██████████| 689290/689290 [00:18<00:00, 37042.44it/s]


In [12]:
def social_media_norm_dataset(split, shuffle_files = False):
    """Build the raw text dataset from `twitter-norm.tsv`.

    This function is registered as the T5 task's `dataset_fn`. Neither
    argument influences the result: the same file is read whatever `split`
    is, and `shuffle_files` is discarded.

    Args:
        split: requested split name (unused).
        shuffle_files: unused.

    Returns:
        A `tf.data.Dataset` of dicts with string entries 'question' (first
        TSV column) and 'answer' (second TSV column).
    """
    del shuffle_files

    # Two tab-separated string columns; quote characters are taken literally.
    parse_row = functools.partial(
        tf.io.decode_csv,
        record_defaults = ['', ''],
        field_delim = '\t',
        use_quote_delim = False,
    )

    lines = tf.data.TextLineDataset(['twitter-norm.tsv'])
    rows = lines.map(
        parse_row,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return rows.map(
        lambda question, answer: {'question': question, 'answer': answer}
    )

def social_media_norm_preprocessor(ds):
    """Convert question/answer examples into inputs/targets examples.

    Args:
        ds: dataset whose elements are dicts with 'question' and 'answer'
            string entries.

    Returns:
        A dataset of dicts with 'inputs' (the question prefixed with
        'sosial media norm: ') and 'targets' (the answer, unchanged).
    """
    def add_task_prefix(example):
        prefixed = tf.strings.join(['sosial media norm: ', example['question']])
        return {'inputs': prefixed, 'targets': example['answer']}

    return ds.map(
        add_task_prefix,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )

In [16]:
# Register the normalization task with T5. Any registration under the same
# name is removed first, presumably so the cell can be re-run in one kernel.
TASK_NAME = 'social_media_norm_dataset'

t5.data.TaskRegistry.remove(TASK_NAME)
t5.data.TaskRegistry.add(
    TASK_NAME,
    dataset_fn = social_media_norm_dataset,
    splits = ['train'],
    text_preprocessor = [social_media_norm_preprocessor],
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)


In [17]:
# Fetch the registered task and build its tokenized dataset.
nq_task = t5.data.TaskRegistry.get("social_media_norm_dataset")
# Ask for 'train', the only split declared at registration. The dataset
# function ignores the split argument, so the examples are unchanged; the
# previous value was a file name that does not match any declared split.
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 1024, "targets": 1024})
# Wrap the dataset so examples come back as numpy values.
r = tfds.as_numpy(ds)

In [20]:
# Pull one example to inspect the plaintext and token-id fields; this
# advances the iterator, so the example is consumed.
next(r)

{'inputs_plaintext': b'sosial media norm: @nawmianIdris Ada dua jalan kan dulu takde rb polis. Ioi dgn putrajaya sentral. Now da ada.',
 'inputs': array([  969,   295, 19048,    31,  1121,   476,   806,  1676,    47,
          133,  8354,    16,  2540,   192,   489,    13,   103,  5891,
         2427,   429,    13, 13060,   612,     3,    59,  9303,    13,
           79,   353,   152,    13, 19152,  8948,  2234,  6313,     3,
         1514,    13,   530,    97,     3,     1]),
 'targets_plaintext': b'@nawmianIdris Ada dua jalan kan dahulu tiada rb polis . Ioi dengan putrajaya sentral . Now da ada .',
 'targets': array([ 1121,   476,   806,  1676,    47,   133,  8354,    16,  2540,
          192,   489,    13,   103,  3544,  3006,    13, 13060,   612,
           13,     3,    59,  9303,    28,    13, 19152,  8948,  2234,
         6313,    13,     3,  1514,    13,   530,    97,    13,     3,
            1])}