In [1]:
# One-off download of the two parallel corpus files that a later cell reads as
# 'ms-en-left.train' and 'ms-en-right.train'. Uncomment to fetch them.
# !wget https://huggingface.co/datasets/mesolitica/ms-en/resolve/main/ms-en-left.train
# !wget https://huggingface.co/datasets/mesolitica/ms-en/resolve/main/ms-en-right.train

In [1]:
import os

# Process-wide settings, applied before TensorFlow is imported further down:
#   * an empty CUDA_VISIBLE_DEVICES exposes no GPU to this process (CPU only);
#   * TF_FORCE_GPU_ALLOW_GROWTH asks TensorFlow to grow GPU memory on demand.
os.environ.update(
    {
        'CUDA_VISIBLE_DEVICES': '',
        'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
    }
)

In [2]:
from glob import glob
import json
import tensorflow as tf
import malaya

 The versions of TensorFlow you are currently using is 2.6.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons
TensorFlow Addons has compiled its custom ops against TensorFlow 2.4.0, and there are no compatibility guarantees between the two versions. 
This means that you might get segfaults when loading the custom op, or other kind of low-level errors.
 If you do, do not file an issue on Github. This is a known limitation.

It might help you to fallback to pure Python ops with TF_ADDONS_PY_OPS . To do that, see https://github.com/tensorflow/addons#gpucpu-custom-ops 

You can also change the TensorFlow version installed on your system. You would need a TensorFlow 

In [3]:
# Load Malaya's fastText language-detection model.
# NOTE(review): `fast_text` is not referenced by any later cell visible here —
# confirm it is still needed.
fast_text = malaya.language_detection.fasttext()



In [4]:
# Augmented corpus files in the working directory, leaving out every path
# whose name contains 'test'.
augmented = [
    path
    for path in glob('augmented-ms-en-*.json')
    if 'test' not in path
]
augmented

['augmented-ms-en-v2.json',
 'augmented-ms-en-3.json',
 'augmented-ms-en-2.json',
 'augmented-ms-en-v3.json',
 'augmented-ms-en-1.json']

In [5]:
from tqdm import tqdm

# Accumulators for the parallel corpus: lefts[i] is the Malay ('ms') side and
# rights[i] the English ('en') side of pair i. They are reset here, so
# re-running this cell starts again from empty lists.
lefts, rights = [], []

for file in augmented:
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's locale encoding.
    with open(file, encoding='utf-8') as fopen:
        data = json.load(fopen)

    ms_texts, en_texts = data['ms'], data['en']

    # Entries are paired by position. zip() below stops at the shorter list,
    # so keep failing loudly (as index-based access did) when a non-empty 'ms'
    # entry has no 'en' counterpart.
    if any(len(text) for text in ms_texts[len(en_texts):]):
        raise IndexError(f'{file}: "ms" has entries without a matching "en" entry')

    paired = zip(ms_texts, en_texts)
    for ms_text, en_text in tqdm(paired, total=min(len(ms_texts), len(en_texts))):
        # Keep a pair only when both sides are non-empty.
        if len(ms_text) and len(en_text):
            lefts.append(ms_text)
            rights.append(en_text)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 84178/84178 [00:00<00:00, 2642270.58it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 335872/335872 [00:00<00:00, 2557029.31it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 138131/138131 [00:00<00:00, 2533467.17it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 419750/419750 [00:00<00:00, 2766708.68it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 273070/273070 [00:00<00:00, 2531599.37it/s]


In [6]:
# Read the line-aligned corpus: line i of the left file pairs with line i of
# the right file. Decode as UTF-8 explicitly (assumed encoding of the corpus —
# TODO confirm) rather than with the platform's locale default.
# split('\n') is used instead of splitlines() so that only real newlines
# separate records, keeping both files aligned.
with open('ms-en-left.train', encoding='utf-8') as fopen:
    left = fopen.read().split('\n')

with open('ms-en-right.train', encoding='utf-8') as fopen:
    right = fopen.read().split('\n')

# zip() below stops at the shorter file. Keep failing loudly (as index-based
# access did) when the left file has text on a line the right file lacks.
if any(left[len(right):]):
    raise IndexError('ms-en-left.train has lines without a counterpart in ms-en-right.train')

# NOTE: this cell appends to the `lefts` / `rights` lists created by the
# preceding cell; running it twice adds the corpus twice.
for left_line, right_line in tqdm(zip(left, right), total=min(len(left), len(right))):
    # Skip blank lines (including the empty string after a trailing newline).
    if left_line and right_line:
        lefts.append(left_line)
        rights.append(right_line)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 3252225/3252225 [00:00<00:00, 3641086.37it/s]


In [15]:
import re

def cleaning(string):
    """Flatten `string` so it is safe to store as one tab-separated field.

    Every run of spaces, tabs and newlines is collapsed into a single space,
    then leading and trailing whitespace is stripped.

    Args:
        string: the text to normalise.

    Returns:
        The normalised text, free of tab and newline characters between words.
    """
    # A single pass over the three characters is equivalent to first turning
    # tabs/newlines into spaces and then squeezing repeated spaces.
    collapsed = re.sub(r'[ \t\n]+', ' ', string)
    return collapsed.strip()

In [11]:
# Start from a clean slate: delete the shard output directory and everything
# in it, if it exists.
!rm -rf t5-noisy-ms-en

In [12]:
# Create the directory that the TSV shards are written into.
!mkdir t5-noisy-ms-en

In [13]:
from sklearn.utils import shuffle

# Shuffle both lists with the same permutation so that lefts[i] and rights[i]
# remain a matching pair. A fixed seed makes the resulting shard contents
# reproducible across kernel restarts.
SHUFFLE_SEED = 42
lefts, rights = shuffle(lefts, rights, random_state=SHUFFLE_SEED)

In [16]:
# Write the corpus as tab-separated shards of at most `batch_size` pairs each.
batch_size = 500000
for i in range(0, len(lefts), batch_size):
    b_left = lefts[i: i + batch_size]
    b_right = rights[i: i + batch_size]
    # Each shard is named after the index of its first pair: 0.tsv, 500000.tsv, ...
    with tf.io.gfile.GFile(f't5-noisy-ms-en/{i}.tsv', 'w') as outfile:
        for raw_left, raw_right in zip(b_left, b_right):
            # cleaning() removes tabs/newlines so each pair is exactly one
            # two-column row.
            clean_left = cleaning(raw_left)
            clean_right = cleaning(raw_right)
            # A whitespace-only string is non-empty before cleaning but empty
            # afterwards; skip such pairs rather than emit a training row with
            # an empty source or target.
            if not clean_left or not clean_right:
                continue
            outfile.write("%s\t%s\n" % (clean_left, clean_right))

In [17]:
# List the .tsv shards present in the output directory (glob returns them in
# arbitrary order).
glob('t5-noisy-ms-en/*.tsv')

['t5-noisy-ms-en/1500000.tsv',
 't5-noisy-ms-en/2500000.tsv',
 't5-noisy-ms-en/0.tsv',
 't5-noisy-ms-en/4000000.tsv',
 't5-noisy-ms-en/3000000.tsv',
 't5-noisy-ms-en/4500000.tsv',
 't5-noisy-ms-en/2000000.tsv',
 't5-noisy-ms-en/500000.tsv',
 't5-noisy-ms-en/1000000.tsv',
 't5-noisy-ms-en/3500000.tsv']

In [18]:
import tensorflow as tf
import tensorflow_datasets as tfds
import t5
import functools
from t5 import models

In [19]:
def translation_dataset(split, shuffle_files=False):
    """Build a dataset of translation pairs from the TSV shards.

    Every line of every 't5-noisy-ms-en/*.tsv' file is parsed as two
    tab-separated string columns, with quote handling disabled.

    Args:
        split: accepted for the dataset-function interface; not used.
        shuffle_files: accepted for the dataset-function interface; not used.

    Returns:
        A `tf.data.Dataset` whose elements are dicts with the string tensors
        'question' (first column) and 'answer' (second column).
    """
    del shuffle_files

    tsv_paths = glob('t5-noisy-ms-en/*.tsv')
    parse_row = functools.partial(
        tf.io.decode_csv,
        record_defaults=['', ''],
        field_delim='\t',
        use_quote_delim=False,
    )

    def as_example(question, answer):
        # Name the two parsed columns.
        return {'question': question, 'answer': answer}

    lines = tf.data.TextLineDataset(tsv_paths)
    columns = lines.map(
        parse_row,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    return columns.map(as_example)


def translation_preprocessor(ds):
    """Turn question/answer examples into T5-style inputs/targets.

    Args:
        ds: dataset whose elements are dicts with 'question' and 'answer'
            string tensors.

    Returns:
        A dataset of dicts where 'inputs' is the question prefixed with the
        task prompt and 'targets' is the answer, unchanged.
    """
    task_prefix = 'terjemah Melayu ke Inggeris: '

    def add_task_prefix(example):
        prefixed = tf.strings.join([task_prefix, example['question']])
        return {'inputs': prefixed, 'targets': example['answer']}

    autotune = tf.data.experimental.AUTOTUNE
    return ds.map(add_task_prefix, num_parallel_calls=autotune)

In [20]:
import seqio

# Path of the SentencePiece model file; exposed under both names.
vocab = 'sp10m.cased.ms-en.model'
DEFAULT_SPM_PATH = vocab
# Number of extra ids passed to the vocabulary.
DEFAULT_EXTRA_IDS = 100


def get_default_vocabulary():
    """Return a new SentencePiece vocabulary built from the default settings.

    Returns:
        A `seqio.SentencePieceVocabulary` created from `DEFAULT_SPM_PATH`
        with `DEFAULT_EXTRA_IDS` extra ids.
    """
    return seqio.SentencePieceVocabulary(
        DEFAULT_SPM_PATH,
        extra_ids=DEFAULT_EXTRA_IDS,
    )

In [21]:
# Drop any earlier registration under this name before adding it again —
# presumably so the cell can be re-executed without a name clash (TODO confirm).
t5.data.TaskRegistry.remove('translation_dataset')

# The task reads the TSV shards, adds the translation prompt, and tokenises
# with the default SentencePiece vocabulary.
task_options = dict(
    dataset_fn=translation_dataset,
    splits=['train'],
    text_preprocessor=[translation_preprocessor],
    postprocess_fn=t5.data.postprocessors.lower_text,
    metric_fns=[t5.evaluation.metrics.accuracy],
    output_features=seqio.Feature(get_default_vocabulary()),
)
t5.data.TaskRegistry.add('translation_dataset', **task_options)

# A mixture that contains only the task above.
t5.data.MixtureRegistry.remove('translation_bahasa')
# Kept as the cell's last expression so the returned Mixture is displayed.
t5.data.MixtureRegistry.add(
    'translation_bahasa',
    ['translation_dataset'],
    default_rate=1.0,
)

<seqio.dataset_providers.Mixture at 0x7fc230897250>

In [22]:
# Fetch the registered task and build its preprocessed, tokenised dataset.
nq_task = t5.data.TaskRegistry.get("translation_dataset")
# 'train' is the only split registered for this task. translation_dataset()
# ignores the split name, but passing the registered one keeps the call valid
# if the split is ever checked.
ds = nq_task.get_dataset(split='train', sequence_length={"inputs": 512, "targets": 512})
# Wrap the dataset so that iterating it yields NumPy values.
r = tfds.as_numpy(ds)

2022-07-06 16:22:22.222941: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-07-06 16:22:22.222966: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: huseincomel-desktop
2022-07-06 16:22:22.222970: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: huseincomel-desktop
2022-07-06 16:22:22.223035: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: Not found: was unable to find libcuda.so DSO loaded into this program
2022-07-06 16:22:22.223053: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.129.6
2022-07-06 16:22:22.223216: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F 

In [23]:
# Obtain an iterator through the public iteration protocol instead of a
# private attribute. iter() on an existing iterator returns that same iterator,
# so re-running this cell is harmless.
r = iter(r)

In [24]:
# Pull one preprocessed example to inspect the text and token ids the task
# produces.
next(r)

2022-07-06 16:22:22.954282: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


{'inputs_pretokenized': b'terjemah Melayu ke Inggeris: This because ketika tree itu was kecil and sudah ngeluarin buah, saya hanya menggunakan baja organik he," katanya.',
 'inputs': array([   13, 26087,  1550,    55,  2040,    31,   263,   229,   123,
         7421,    37,    39,   439,    20,   391,    13, 14056, 12995,
          153,  1508,    14,    67,   169,   311, 18383, 13555,    57,
           14,     6,   194,     3,     1], dtype=int32),
 'targets_pretokenized': b'This is because when the tree was young and the fruit was out, I used only organic fertilizers, "he said.',
 'targets': array([  263,    26,   229,   146,    15,  7421,    39,  1169,    20,
           15,  7307,    39,   131,    14,    59,   419,   220, 14017,
        24112,    16,    14,    13,     6,   609,    49,     3,     1],
       dtype=int32)}