In [1]:
import os

# Process-wide environment flags, set before any GPU library is imported:
# expose only GPU 0 to CUDA, and set TensorFlow's memory-growth flag
# (the latter only matters to TensorFlow; the value is kept unchanged).
os.environ.update(
    {
        'CUDA_VISIBLE_DEVICES': '0',
        'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
    }
)

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The tokenizer and the model weights are both loaded from the same checkpoint id.
MODEL_NAME = 'mesolitica/finetune-translation-t5-small-standard-bahasa-cased'

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

In [3]:
import json

In [4]:
# Load the test-split JSON into `data` and preview its first record
# (the preview shows a record as a list of English sentences).
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default text encoding.
with open('true-case-parasci-test.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

data[0]

['We run Mecab on Hadoop 11, an open source software that implemented the Map-Reduce framework, for word Segmenting and Pos tagging the data.',
 'We run Mecab 4 with Ipa dictionary 5 on Hadoop 6, an open source software that implemented the Map-Reduce framework, for parallel word Segmenting, Part-Of-Speech tagging, and Kana Pronunciation Annotating.']

In [5]:
# Move the model onto the GPU; binding the result to `_` keeps the cell from echoing the module repr.
_ = model.to('cuda')

In [6]:
# Sample batch: sentences of the first record with at most 100 whitespace-separated words.
batch = list(filter(lambda sentence: len(sentence.split()) <= 100, data[0]))
batch

['We run Mecab on Hadoop 11, an open source software that implemented the Map-Reduce framework, for word Segmenting and Pos tagging the data.',
 'We run Mecab 4 with Ipa dictionary 5 on Hadoop 6, an open source software that implemented the Map-Reduce framework, for parallel word Segmenting, Part-Of-Speech tagging, and Kana Pronunciation Annotating.']

In [7]:
# Smoke test: translate the sample batch and display the decoded output.
# Every sentence gets the task prefix prepended before tokenization.
t = ['terjemah Inggeris ke Melayu: ' + b for b in batch]
# Move the whole encoded batch to the device the model lives on, instead of
# hard-coding CUDA tensor by tensor.
inputs = tokenizer(t, return_tensors="pt", padding = True).to(model.device)

translated_tokens = model.generate(**inputs, max_length=500)
decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
decoded

['Kami menjalankan Mecab pada Hadoop 11, perisian sumber terbuka yang melaksanakan rangka kerja Map-Reduce, untuk Segmen perkataan dan Pos penandaan data.',
 'Kami menjalankan Mecab 4 dengan kamus Ipa 5 pada Hadoop 6, perisian sumber terbuka yang melaksanakan rangka kerja Map-Reduce, untuk Segmenting perkataan selari, penandaan Part-Of-Speech, dan Kana Pronunciation Annotating.']

In [8]:
from tqdm import tqdm

# Translate every record of `data` and collect one {'en': [...], 'ms': [...]}
# dict per record, where 'ms'[j] is the translation of 'en'[j].
translated_train_examples = []
for record in tqdm(data):
    # Sentences longer than 256 whitespace-separated words are dropped.
    batch = [b for b in record if len(b.split()) <= 256]
    if batch:
        t = ['terjemah Inggeris ke Melayu: ' + b for b in batch]
        # Follow the model's device rather than hard-coding CUDA.
        inputs = tokenizer(t, return_tensors="pt", padding = True).to(model.device)

        translated_tokens = model.generate(**inputs, max_length=500)
        decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    else:
        # Every sentence was filtered out: there is nothing to tokenize or generate.
        decoded = []

    translated_train_examples.append(
        {
            # Store the filtered sentences (not the raw record) so the two
            # lists stay index-aligned even when a sentence was dropped.
            'en': batch,
            'ms': decoded,
        }
    )

100%|███████████████████████████████████████| 2345/2345 [02:55<00:00, 13.38it/s]


In [9]:
# Spot-check the first collected record ('en' source sentences, 'ms' translations).
translated_train_examples[0]

{'en': ['We run Mecab on Hadoop 11, an open source software that implemented the Map-Reduce framework, for word Segmenting and Pos tagging the data.',
  'We run Mecab 4 with Ipa dictionary 5 on Hadoop 6, an open source software that implemented the Map-Reduce framework, for parallel word Segmenting, Part-Of-Speech tagging, and Kana Pronunciation Annotating.'],
 'ms': ['Kami menjalankan Mecab pada Hadoop 11, perisian sumber terbuka yang melaksanakan rangka kerja Map-Reduce, untuk Segmen perkataan dan Pos penandaan data.',
  'Kami menjalankan Mecab 4 dengan kamus Ipa 5 pada Hadoop 6, perisian sumber terbuka yang melaksanakan rangka kerja Map-Reduce, untuk Segmenting perkataan selari, penandaan Part-Of-Speech, dan Kana Pronunciation Annotating.']}

In [10]:
import json

# Persist the collected translation records as a single JSON document.
with open('parasci-test.json', mode='w') as sink:
    json.dump(obj=translated_train_examples, fp=sink)

In [12]:
# Load the validation-split JSON into `data` (replacing the previously loaded
# split) and preview its first record.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default text encoding.
with open('true-case-parasci-val.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

data[0]

['Belz and Kow proposed another Smt based Nlg system which made use of the Phrase-Based Smt model.',
 'The Phraseextraction Heuristics of were used to build the Phrase-Based Smt systems.']

In [13]:
# Translate every record of `data` and collect one {'en': [...], 'ms': [...]}
# dict per record, where 'ms'[j] is the translation of 'en'[j].
translated_train_examples = []
for record in tqdm(data):
    # Sentences longer than 256 whitespace-separated words are dropped.
    batch = [b for b in record if len(b.split()) <= 256]
    if batch:
        t = ['terjemah Inggeris ke Melayu: ' + b for b in batch]
        # Follow the model's device rather than hard-coding CUDA.
        inputs = tokenizer(t, return_tensors="pt", padding = True).to(model.device)

        translated_tokens = model.generate(**inputs, max_length=500)
        decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    else:
        # Every sentence was filtered out: there is nothing to tokenize or generate.
        decoded = []

    translated_train_examples.append(
        {
            # Store the filtered sentences (not the raw record) so the two
            # lists stay index-aligned even when a sentence was dropped.
            'en': batch,
            'ms': decoded,
        }
    )

100%|███████████████████████████████████████| 2753/2753 [03:22<00:00, 13.56it/s]


In [14]:
# Persist the collected translation records as a single JSON document.
with open('parasci-val.json', mode='w') as sink:
    json.dump(obj=translated_train_examples, fp=sink)

In [15]:
# Load the training-split JSON into `data` (replacing the previously loaded
# split) and preview its first record.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's default text encoding.
with open('true-case-parasci-train.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

data[0]

['The Tweets were Tokenized and Part-Ofspeech tagged with the Cmu Ark Twitter Nlp tool and Stanford Corenlp.',
 'For all methods, the Tweets were Tokenized with the Cmu Twitter Nlp tool.']

In [16]:
# Translate every record of `data` and collect one {'en': [...], 'ms': [...]}
# dict per record, where 'ms'[j] is the translation of 'en'[j].
translated_train_examples = []
for record in tqdm(data):
    # Sentences longer than 256 whitespace-separated words are dropped.
    batch = [b for b in record if len(b.split()) <= 256]
    if batch:
        t = ['terjemah Inggeris ke Melayu: ' + b for b in batch]
        # Follow the model's device rather than hard-coding CUDA.
        inputs = tokenizer(t, return_tensors="pt", padding = True).to(model.device)

        translated_tokens = model.generate(**inputs, max_length=500)
        decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    else:
        # Every sentence was filtered out: there is nothing to tokenize or generate.
        decoded = []

    translated_train_examples.append(
        {
            # Store the filtered sentences (not the raw record) so the two
            # lists stay index-aligned even when a sentence was dropped.
            'en': batch,
            'ms': decoded,
        }
    )

100%|█████████████████████████████████████| 28883/28883 [35:23<00:00, 13.60it/s]


In [17]:
# Persist the collected translation records as a single JSON document.
with open('parasci-train.json', mode='w') as sink:
    json.dump(obj=translated_train_examples, fp=sink)