In [1]:
import os

# Configure GPU visibility and TensorFlow memory behaviour through environment
# variables; both are written in one call.
os.environ.update(
    {
        'CUDA_VISIBLE_DEVICES': '0',
        'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
    }
)

In [2]:
import tensorflow_datasets as tfds

# Load the three GLUE/MRPC splits together with the dataset metadata.
(ds_train, ds_test, ds_validation), ds_info = tfds.load(
    "glue/mrpc",
    split=["train", "test", "validation"],
    with_info=True,
)


def _positive_pairs(dataset):
    """Collect the sentence pairs of `dataset` whose label equals 1.

    Args:
        dataset: iterable of examples exposing "label", "sentence1" and
            "sentence2" entries, where the sentence entries provide
            `.numpy()` returning bytes.

    Returns:
        A list of `(sentence1, sentence2)` tuples of decoded strings, in the
        order the examples were yielded.
    """
    return [
        (
            example["sentence1"].numpy().decode(),
            example["sentence2"].numpy().decode(),
        )
        for example in dataset
        if example["label"] == 1
    ]


# One list of label-1 pairs per split (previously three copy-pasted loops).
mrpc_train = _positive_pairs(ds_train)
mrpc_val = _positive_pairs(ds_validation)
# NOTE(review): in the recorded run this list comes out empty (length 0),
# presumably because the GLUE test split carries no gold labels - confirm
# against the dataset card before relying on `mrpc_test`.
mrpc_test = _positive_pairs(ds_test)

2022-10-23 15:14:44.242393: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-23 15:14:44.246138: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-23 15:14:44.246749: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-10-23 15:14:44.247325: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [3]:
# Number of collected pairs per split, in (train, test, val) order.
tuple(map(len, (mrpc_train, mrpc_test, mrpc_val)))

(2474, 0, 279)

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Tokenizer and seq2seq model are both loaded from the same checkpoint id.
checkpoint = 'mesolitica/finetune-translation-t5-small-standard-bahasa-cased'
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

In [6]:
# Move the model onto the GPU; binding the result to `_` keeps the cell from
# echoing the model repr.
_ = model.to('cuda')

In [8]:
# Sentences of the first training pair that are at most 100
# whitespace-separated words long.
batch = list(filter(lambda sentence: len(sentence.split()) <= 100, mrpc_train[0]))
batch

['Spider-Man snatched $ 114.7 million in its debut last year and went on to capture $ 403.7 million .',
 'Spider-Man , rated PG-13 , snatched $ 114.7 million in its first weekend and went on to take in $ 403.7 million .']

In [10]:
# Prepend the task prompt to every sentence of the batch before tokenizing.
prompt = 'terjemah Inggeris ke Melayu: '
t = [prompt + sentence for sentence in batch]
inputs = tokenizer(t, return_tensors="pt", padding = True)
# Move each encoded tensor onto the GPU, where the model lives.
for name, tensor in inputs.items():
    inputs[name] = tensor.cuda()

# Generate output token ids (max_length=500) and decode them back to text,
# dropping special tokens.
translated_tokens = model.generate(**inputs, max_length=500)
decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

In [15]:
from tqdm import tqdm

# Translate every training pair and collect one record per pair:
#   'en' -> the original sentence pair, 'ms' -> the decoded model outputs.
translated_mrpc_train = []
for pair in tqdm(mrpc_train):
    # Skip the pair as a unit when any sentence is longer than 100 words.
    # Filtering sentences one by one (as before) could leave 'ms' with fewer
    # entries than 'en', and an entirely filtered pair would hand an empty
    # batch to the tokenizer and to generate().
    if any(len(sentence.split()) > 100 for sentence in pair):
        continue

    # Prepend the task prompt expected by the translation checkpoint.
    t = ['terjemah Inggeris ke Melayu: ' + sentence for sentence in pair]
    # Encode both sentences as one padded batch on the model's device.
    inputs = tokenizer(t, return_tensors="pt", padding=True).to(model.device)

    translated_tokens = model.generate(**inputs, max_length=500)
    decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

    translated_mrpc_train.append(
        {
            'en': pair,
            'ms': decoded,
        }
    )

100%|███████████████████████████████████████| 2474/2474 [05:16<00:00,  7.83it/s]


In [21]:
# Spot-check the first training record: the 'en' sentence pair next to its 'ms' model outputs.
translated_mrpc_train[0]

{'en': ('Spider-Man snatched $ 114.7 million in its debut last year and went on to capture $ 403.7 million .',
  'Spider-Man , rated PG-13 , snatched $ 114.7 million in its first weekend and went on to take in $ 403.7 million .'),
 'ms': ['Spider-Man meraih $114.7 juta dalam debut tahun lalu dan terus meraih $403.7 juta.',
  'Spider-Man, dinilai PG-13, meraih $114,7 juta pada hujung minggu pertamanya dan terus mengambil $403,7 juta.']}

In [19]:
# Translate every validation pair and collect one record per pair:
#   'en' -> the original sentence pair, 'ms' -> the decoded model outputs.
translated_mrpc_val = []
for pair in tqdm(mrpc_val):
    # Skip the pair as a unit when any sentence is longer than 100 words.
    # Filtering sentences one by one (as before) could leave 'ms' with fewer
    # entries than 'en', and an entirely filtered pair would hand an empty
    # batch to the tokenizer and to generate().
    if any(len(sentence.split()) > 100 for sentence in pair):
        continue

    # Prepend the task prompt expected by the translation checkpoint.
    t = ['terjemah Inggeris ke Melayu: ' + sentence for sentence in pair]
    # Encode both sentences as one padded batch on the model's device.
    inputs = tokenizer(t, return_tensors="pt", padding=True).to(model.device)

    translated_tokens = model.generate(**inputs, max_length=500)
    decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

    translated_mrpc_val.append(
        {
            'en': pair,
            'ms': decoded,
        }
    )

100%|█████████████████████████████████████████| 279/279 [00:35<00:00,  7.88it/s]


In [20]:
# Spot-check the first validation record: the 'en' sentence pair next to its 'ms' model outputs.
translated_mrpc_val[0]

{'en': ("The show 's closure affected third-quarter earnings per share by a penny .",
  'The company said this impacted earnings by a penny a share .'),
 'ms': ['Penutupan rancangan itu menjejaskan pendapatan suku ketiga sesaham dengan satu sen.',
  'Syarikat itu berkata ini memberi kesan kepada pendapatan oleh satu sen sesaham.']}

In [22]:
import json

# Serialize the translated training records and write them to disk as JSON.
with open('mrpc-train.json', 'w') as out_file:
    out_file.write(json.dumps(translated_mrpc_train))

In [23]:
# Serialize the translated validation records and write them to disk as JSON.
with open('mrpc-val.json', 'w') as out_file:
    out_file.write(json.dumps(translated_mrpc_val))