In [1]:
import os

# GPU-related environment variables, set in the first cell ahead of the
# transformers import that follows.
os.environ.update(
    {
        # Expose only device index 0 to this process.
        'CUDA_VISIBLE_DEVICES': '0',
        # NOTE(review): this looks like a TensorFlow memory-growth flag, while the
        # rest of the notebook requests PyTorch tensors — confirm it is still needed.
        'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
    }
)

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The tokenizer and the model weights are loaded from the same pretrained
# checkpoint identifier, so it is spelled out once.
checkpoint = 'mesolitica/finetune-translation-t5-small-standard-bahasa-cased'
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

In [3]:
import json

In [4]:
# Load the test split. The encoding is pinned to UTF-8 because open() otherwise
# falls back to the platform's locale encoding, which can mis-decode non-ASCII text.
with open('true-case-parasci-arxiv-test.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

# Show the first record (a list of English sentences in the recorded output).
data[0]

['Zhang and Parker proposed a new Bio-Inspired predictive orientation decomposition representation, which was inspired by the biological research in human anatomy.',
 'Zhang and Parker implemented a Bio-Inspired predictive orientation decomposition using Mid-Level features to construct representations of people from 3D skeleton Trajectories, which is inspired by biological research in human anatomy.']

In [5]:
# Move the model to the GPU. Binding the return value to `_` keeps the cell
# from echoing the returned object as its output.
_ = model.cuda()

In [6]:
# Keep only the sentences of the first record that have at most 256
# whitespace-separated words.
batch = []
for sentence in data[0]:
    if len(sentence.split()) <= 256:
        batch.append(sentence)
batch

['Zhang and Parker proposed a new Bio-Inspired predictive orientation decomposition representation, which was inspired by the biological research in human anatomy.',
 'Zhang and Parker implemented a Bio-Inspired predictive orientation decomposition using Mid-Level features to construct representations of people from 3D skeleton Trajectories, which is inspired by biological research in human anatomy.']

In [7]:
# Translate the single filtered batch from the previous cell as a quick check.
prefix = 'terjemah Inggeris ke Melayu: '
t = [prefix + sentence for sentence in batch]
inputs = tokenizer(t, return_tensors="pt", padding=True)
# Move every encoded tensor onto the GPU, where the model was placed earlier.
for name, tensor in list(inputs.items()):
    inputs[name] = tensor.cuda()

translated_tokens = model.generate(**inputs, max_length=500)
decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
decoded

['Zhang dan Parker mencadangkan perwakilan penguraian orientasi ramalan yang baru, yang diilhamkan oleh penyelidikan biologi dalam anatomi manusia.',
 'Zhang dan Parker melaksanakan penguraian orientasi ramalan Bio-Inspirasi menggunakan ciri-ciri Mid-Level untuk membina perwakilan orang dari Trajectories rangka 3D, yang diilhamkan oleh penyelidikan biologi dalam anatomi manusia.']

In [8]:
from tqdm import tqdm

# Translate every record of `data`, one record per generate() call.
# Each output entry is {'en': [...], 'ms': [...]} with the two lists aligned
# element by element, and entry i corresponds to data[i].
translated_train_examples = []
for record in tqdm(data):
    # Drop sentences longer than 256 whitespace-separated words.
    batch = [b for b in record if len(b.split()) <= 256]
    if not batch:
        # Every sentence was filtered out. Keep an empty placeholder so the
        # output stays index-aligned with `data`, and avoid handing an empty
        # list to the tokenizer / generate().
        translated_train_examples.append({'en': [], 'ms': []})
        continue

    t = ['terjemah Inggeris ke Melayu: ' + b for b in batch]
    # Place the encoded batch on whichever device the model currently lives on.
    inputs = tokenizer(t, return_tensors="pt", padding=True).to(model.device)

    translated_tokens = model.generate(**inputs, max_length=500)
    decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

    translated_train_examples.append(
        {
            # Store the filtered sentences rather than the raw record: 'ms' only
            # contains translations of `batch`, so this keeps en[k] <-> ms[k].
            'en': batch,
            'ms': decoded,
        }
    )

100%|███████████████████████████████████████| 2549/2549 [03:27<00:00, 12.29it/s]


In [12]:
# Inspect the first translated record: a dict with 'en' (source sentences)
# and 'ms' (decoded model outputs).
translated_train_examples[0]

{'en': ['Zhang and Parker proposed a new Bio-Inspired predictive orientation decomposition representation, which was inspired by the biological research in human anatomy.',
  'Zhang and Parker implemented a Bio-Inspired predictive orientation decomposition using Mid-Level features to construct representations of people from 3D skeleton Trajectories, which is inspired by biological research in human anatomy.'],
 'ms': ['Zhang dan Parker mencadangkan perwakilan penguraian orientasi ramalan yang baru, yang diilhamkan oleh penyelidikan biologi dalam anatomi manusia.',
  'Zhang dan Parker melaksanakan penguraian orientasi ramalan Bio-Inspirasi menggunakan ciri-ciri Mid-Level untuk membina perwakilan orang dari Trajectories rangka 3D, yang diilhamkan oleh penyelidikan biologi dalam anatomi manusia.']}

In [10]:
import json

# Persist the translated records for the test split as a single JSON document.
with open('parasci-arxiv-test.json', mode='w') as out_file:
    json.dump(translated_train_examples, out_file)

In [18]:
# Load the validation split, replacing the previously loaded `data`.
# The encoding is pinned to UTF-8 because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text.
with open('true-case-parasci-arxiv-val.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

# Show the first record (a list of English sentences in the recorded output).
data[0]

['Despite the higher layers in deep neural network can involve the spatial context information around the objects due to the large receptive field, Zhou et al have shown that the practical receptive field is actually much smaller than the theoretical one.',
 'Although the wider receptive filed allows us to gather more context, Zhou et al showed that the actual size of the receptive fields in a CNN is much smaller than the theoretical size, especially in higher level layers.']

In [19]:
# Translate every record of `data`, one record per generate() call.
# Each output entry is {'en': [...], 'ms': [...]} with the two lists aligned
# element by element, and entry i corresponds to data[i].
translated_train_examples = []
for record in tqdm(data):
    # Drop sentences longer than 256 whitespace-separated words.
    batch = [b for b in record if len(b.split()) <= 256]
    if not batch:
        # Every sentence was filtered out. Keep an empty placeholder so the
        # output stays index-aligned with `data`, and avoid handing an empty
        # list to the tokenizer / generate().
        translated_train_examples.append({'en': [], 'ms': []})
        continue

    t = ['terjemah Inggeris ke Melayu: ' + b for b in batch]
    # Place the encoded batch on whichever device the model currently lives on.
    inputs = tokenizer(t, return_tensors="pt", padding=True).to(model.device)

    translated_tokens = model.generate(**inputs, max_length=500)
    decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

    translated_train_examples.append(
        {
            # Store the filtered sentences rather than the raw record: 'ms' only
            # contains translations of `batch`, so this keeps en[k] <-> ms[k].
            'en': batch,
            'ms': decoded,
        }
    )

100%|███████████████████████████████████████| 3680/3680 [04:51<00:00, 12.62it/s]


In [20]:
# Persist the translated records for the validation split as a single JSON document.
with open('parasci-arxiv-val.json', mode='w') as out_file:
    json.dump(translated_train_examples, out_file)

In [15]:
# Load the training split, replacing the previously loaded `data`.
# The encoding is pinned to UTF-8 because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode non-ASCII text.
with open('true-case-parasci-arxiv-train.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

# Show the first record (a list of English sentences in the recorded output).
data[0]

['We find the optimal alignment of the original manifold and the Oose manifold via Procrustes analysis and apply the resulting Translational, Rotational, and Scaling components on the Oose manifold.',
 'We find the optimal alignment of the clean and noisy Embeddings via Procrustes analysis and apply the resulting Translational, Rotational, and Scaling components on the Oose points.']

In [16]:
# Translate every record of `data`, one record per generate() call.
# Each output entry is {'en': [...], 'ms': [...]} with the two lists aligned
# element by element, and entry i corresponds to data[i].
translated_train_examples = []
# The recorded run of this cell (300k+ records, several hours) tripped the
# notebook's IOPub message rate limit, so progress-bar refreshes are throttled
# to at most one per second to cut down on output traffic.
for record in tqdm(data, mininterval=1.0):
    # Drop sentences longer than 256 whitespace-separated words.
    batch = [b for b in record if len(b.split()) <= 256]
    if not batch:
        # Every sentence was filtered out. Keep an empty placeholder so the
        # output stays index-aligned with `data`, and avoid handing an empty
        # list to the tokenizer / generate().
        translated_train_examples.append({'en': [], 'ms': []})
        continue

    t = ['terjemah Inggeris ke Melayu: ' + b for b in batch]
    # Place the encoded batch on whichever device the model currently lives on.
    inputs = tokenizer(t, return_tensors="pt", padding=True).to(model.device)

    translated_tokens = model.generate(**inputs, max_length=500)
    decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

    translated_train_examples.append(
        {
            # Store the filtered sentences rather than the raw record: 'ms' only
            # contains translations of `batch`, so this keeps en[k] <-> ms[k].
            'en': batch,
            'ms': decoded,
        }
    )

 71%|██████████████████████         | 220862/309834 [5:01:48<1:46:23, 13.94it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [17]:
# Persist the translated records for the training split as a single JSON document.
with open('parasci-arxiv-train.json', mode='w') as out_file:
    json.dump(translated_train_examples, out_file)