In [1]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/train-en-ms.tar.gz
# !tar -zxf train-en-ms.tar.gz

In [2]:
# Load the two sides of the parallel corpus; entry i of `left` is paired with
# entry i of `right` when the TSV is written further down.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
# split('\n') is kept on purpose (rather than splitlines()): splitlines() also
# breaks on other Unicode line separators, which could shift one side relative
# to the other.
with open('train-en/left.txt', encoding='utf-8') as fopen:
    left = fopen.read().split('\n')

with open('train-en/right.txt', encoding='utf-8') as fopen:
    right = fopen.read().split('\n')

In [3]:
# The two lists are paired by index further down, so a length mismatch means
# the corpus is misaligned; fail here instead of writing wrong pairs.
assert len(left) == len(right), 'left/right line counts differ: %d vs %d' % (len(left), len(right))
# Last expression of the cell: display both line counts.
len(left), len(right)

(3807616, 3807616)

In [4]:
# Keep only the first 800000 entries of each side (same cut on both lists so
# they stay index-aligned).
left, right = left[:800000], right[:800000]

In [5]:
from tqdm import tqdm
import re

def cleaning(string):
    """Normalise whitespace in a single line of text.

    Newlines and tabs are turned into spaces, runs of spaces are collapsed
    to one space, and leading/trailing whitespace is stripped.

    Args:
        string: the text to normalise.

    Returns:
        The normalised text.
    """
    # Newline and tab must not survive: they would be read back as row and
    # column separators once the text is written into a tab-separated file.
    for separator in ('\n', '\t'):
        string = string.replace(separator, ' ')
    collapsed = re.sub(r'[ ]+', ' ', string)
    return collapsed.strip()

In [6]:
import tensorflow as tf
import tensorflow_datasets as tfds
from t5.data import preprocessors as prep
import functools
import t5
import gin
import sentencepiece as spm
from glob import glob
import os

# Parse the gin configuration file (path is relative to the notebook's
# working directory).
gin.parse_config_file('pretrained_models_base_operative_config.gin')
# Path of the SentencePiece model; it is also passed to the task registration
# below as `sentencepiece_model_path`.
vocab = 'sp10m.cased.ms-en.model'
sp = spm.SentencePieceProcessor()
# Last expression of the cell, so the notebook displays Load's return value
# (True in the recorded run).
sp.Load(vocab)

True

In [7]:
# Write one "<left>\t<right>" row per sentence pair to en-ms.tsv. Both sides go
# through cleaning() first, which removes tabs and newlines from the text so
# each pair occupies exactly one two-column row.
with tf.io.gfile.GFile('en-ms.tsv', "w") as outfile:
    for idx, source_text in enumerate(left):
        # right is looked up by index, so a shorter right list still raises
        # IndexError rather than being silently truncated.
        outfile.write("%s\t%s\n" % (cleaning(source_text), cleaning(right[idx])))

In [8]:
def en_ms_dataset(split, shuffle_files = False):
    """Build a dataset of {'question', 'answer'} examples from en-ms.tsv.

    Each line of the file is parsed as two tab-separated string columns
    (quote characters are treated as ordinary text); the first column is
    stored under 'question' and the second under 'answer'.

    Args:
        split: accepted for the dataset-function signature; not used, the
            same file is read whatever its value.
        shuffle_files: accepted for the dataset-function signature; not used.

    Returns:
        A dataset whose elements are dicts with keys 'question' and 'answer'.
    """
    del shuffle_files

    def parse_row(line):
        # Two string columns, both defaulting to the empty string.
        return tf.io.decode_csv(
            line,
            record_defaults = ['', ''],
            field_delim = '\t',
            use_quote_delim = False,
        )

    def to_example(*columns):
        return dict(zip(['question', 'answer'], columns))

    lines = tf.data.TextLineDataset(['en-ms.tsv'])
    rows = lines.map(
        parse_row,
        num_parallel_calls = tf.data.experimental.AUTOTUNE,
    )
    return rows.map(to_example)

def en_ms_preprocessor(ds):
    """Map {'question', 'answer'} examples to {'inputs', 'targets'} examples.

    'inputs' is the task prefix followed by the example's 'question' text;
    'targets' is the example's 'answer' text, unchanged.

    Args:
        ds: dataset whose elements are dicts with 'question' and 'answer'.

    Returns:
        The mapped dataset.
    """
    def to_inputs_and_targets(ex):
        prefixed = tf.strings.join(['terjemah Inggeris ke Melayu: ', ex['question']])
        return {'inputs': prefixed, 'targets': ex['answer']}

    autotune = tf.data.experimental.AUTOTUNE
    return ds.map(to_inputs_and_targets, num_parallel_calls = autotune)

In [9]:
# Remove any existing registration under this name, then register the task.
# NOTE(review): the remove() call presumably exists so the cell can be re-run
# in the same kernel without a duplicate-name error - confirm against the
# installed t5 version.
t5.data.TaskRegistry.remove('en_ms_dataset')
t5.data.TaskRegistry.add(
    'en_ms_dataset',
    # Function that builds the raw {'question', 'answer'} dataset from en-ms.tsv.
    dataset_fn = en_ms_dataset,
    # 'train' is the only split declared for this task.
    splits = ['train'],
    # Converts the raw examples into {'inputs', 'targets'} text pairs.
    text_preprocessor = [en_ms_preprocessor],
    # Same SentencePiece model path that was loaded into `sp` above.
    sentencepiece_model_path = vocab,
    metric_fns = [t5.evaluation.metrics.accuracy],
)


In [10]:
# Look up the registered task and build its dataset with both inputs and
# targets limited to 1024 tokens.
nq_task = t5.data.TaskRegistry.get("en_ms_dataset")
# `split` names one of the splits declared at registration (only 'train'),
# not the data file. The file name used to work only because en_ms_dataset
# ignores its `split` argument.
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 1024, "targets": 1024})
# Expose the dataset's elements as NumPy values for inspection.
r = tfds.as_numpy(ds)

In [11]:
# Display the first example. next() requires an iterator; wrapping in iter()
# keeps this working whether tfds.as_numpy returned a generator or a plain
# iterable (this differs between TFDS versions).
next(iter(r))

{'inputs_plaintext': b'terjemah Inggeris ke Melayu: Lurie says the current real estate dip may last longer than previous lulls, but she says the numbers were worse in 2009. Homes priced under $500,000 are still selling relatively well. Lurie says it\'s the luxury homes and apartments that are suffering. New listings in the $600,000-plus category are on the rise, but sales are down from last year. CREB says that price range represented about 18 per cent of sales compared with 20 per cent last year. "With more options in the higher-end of the market, sellers will need to consider their competition as well as their goals regarding a sell date," said CREB president Corinne Lyall in a release. "This will influence the pricing strategy they agree upon with their real estate professional". Justin Bobier, with Crystal Creek Homes, says his company\'s luxury home sales are down 50 per cent this year. He says a lot of buyers are playing the waiting game. "Reality is we\'re not going to see a pri

In [12]:
# Delete the extracted train-en/ directory. The cell that reads
# train-en/left.txt and train-en/right.txt cannot be re-run after this
# without extracting the archive again.
!rm -rf train-en