In [1]:
import os

# Configure GPU-related environment variables before any CUDA-using library
# is initialised in this kernel.
os.environ.update({
    # Expose only the first GPU to this process.
    'CUDA_VISIBLE_DEVICES': '0',
    # TensorFlow memory-growth switch. NOTE(review): the model below is loaded
    # through the PyTorch T5 class, so this is presumably a no-op here - confirm.
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

In [2]:
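# One-time download of the PAWS-Wiki "labeled final" archive; uncomment to fetch it.
# The cells below read the extracted TSV files from the `final/` directory.
# !wget https://storage.googleapis.com/paws/english/paws_wiki_labeled_final.tar.gz
# !tar -xzf paws_wiki_labeled_final.tar.gz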

In [3]:
import csv

def load_positive_pairs(path):
    """Read a PAWS-style TSV file and return its positively-labelled pairs.

    The first row is treated as a header and skipped. For every remaining
    row whose fourth column equals ``"1"``, the second and third columns are
    collected as a ``(sentence1, sentence2)`` tuple.

    Args:
        path: Path to a tab-separated file with at least four columns.

    Returns:
        A list of ``(str, str)`` tuples, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    pairs = []
    # Decode explicitly as UTF-8 instead of the platform default, and pass
    # newline="" as the csv module documentation recommends for file objects.
    with open(path, "r", encoding="utf-8", newline="") as tsv_file:
        # NOTE(review): csv's default quote handling is left on, as before; a
        # field starting with a double quote is parsed as a quoted field.
        # Confirm whether quoting=csv.QUOTE_NONE suits these files better.
        reader = csv.reader(tsv_file, delimiter="\t")
        next(reader, None)  # skip the header; tolerate an empty file
        for row in reader:
            # Blank or truncated rows have no label column; ignore them
            # rather than failing with IndexError.
            if len(row) > 3 and row[3] == "1":
                pairs.append((row[1], row[2]))
    return pairs


train_examples = load_positive_pairs("final/train.tsv")
test_examples = load_positive_pairs("final/test.tsv")
dev_examples = load_positive_pairs("final/dev.tsv")

In [5]:
# Number of retained pairs in the (train, test, dev) splits.
tuple(len(split) for split in (train_examples, test_examples, dev_examples))

(21829, 3536, 3539)

In [6]:
# Sanity check: show the first (sentence1, sentence2) pair that was loaded.
train_examples[0]

('The NBA season of 1975 -- 76 was the 30th season of the National Basketball Association .',
 'The 1975 -- 76 season of the National Basketball Association was the 30th season of the NBA .')

In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Checkpoint identifier shared by the tokenizer and the model, named once so
# the two cannot drift apart.
MODEL_NAME = 'mesolitica/finetune-translation-t5-small-standard-bahasa-cased'

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

In [8]:
# Move the model to the GPU; the result is bound to `_` so the cell does not
# echo the module's repr.
_ = model.cuda()

In [9]:
# Demo batch: the sentences of the first training pair, keeping only those
# with at most 100 whitespace-separated words.
batch = list(filter(lambda sentence: len(sentence.split()) <= 100, train_examples[0]))
batch

['The NBA season of 1975 -- 76 was the 30th season of the National Basketball Association .',
 'The 1975 -- 76 season of the National Basketball Association was the 30th season of the NBA .']

In [11]:
# Prepend the prompt prefix (Malay for "translate English to Malay") to every
# sentence of the demo batch.
t = [f'terjemah Inggeris ke Melayu: {sentence}' for sentence in batch]
inputs = tokenizer(t, return_tensors="pt", padding=True)

# Move each encoded tensor onto the GPU, in place.
for key, tensor in list(inputs.items()):
    inputs[key] = tensor.cuda()

# Generate up to 500 tokens per sentence and decode them back to text.
translated_tokens = model.generate(**inputs, max_length=500)
decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
decoded

['Musim NBA 1975-76 adalah musim ke-30 Persatuan Bola Keranjang Kebangsaan.',
 'Musim 1975-76 Persatuan Bola Keranjang Kebangsaan adalah musim ke-30 NBA.']

In [15]:
from tqdm import tqdm

# Translate every training pair to Malay, one pair per forward pass.
# Each record keeps the English pair under 'en' and the two translations,
# in the same order, under 'ms'.
translated_train_examples = []
for pair in tqdm(train_examples):
    batch = [sentence for sentence in pair if len(sentence.split()) <= 100]
    # If the 100-word limit removed any sentence, skip the whole pair:
    # otherwise 'ms' would hold fewer entries than 'en' (misaligned record),
    # or the tokenizer would be handed an empty batch.
    if len(batch) != len(pair):
        continue

    # Prompt prefix is Malay for "translate English to Malay".
    prompts = ['terjemah Inggeris ke Melayu: ' + sentence for sentence in batch]
    # Put the encoded batch on whatever device the model currently lives on.
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)

    translated_tokens = model.generate(**inputs, max_length=500)
    decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

    translated_train_examples.append({'en': pair, 'ms': decoded})

100%|█████████████████████████████████████| 21829/21829 [48:02<00:00,  7.57it/s]


In [16]:
# Spot-check the first translated record: English pair under 'en', Malay under 'ms'.
translated_train_examples[0]

{'en': ('The NBA season of 1975 -- 76 was the 30th season of the National Basketball Association .',
  'The 1975 -- 76 season of the National Basketball Association was the 30th season of the NBA .'),
 'ms': ['Musim NBA 1975-76 adalah musim ke-30 Persatuan Bola Keranjang Kebangsaan.',
  'Musim 1975-76 Persatuan Bola Keranjang Kebangsaan adalah musim ke-30 NBA.']}

In [17]:
# Translate every test pair to Malay, one pair per forward pass.
# Each record keeps the English pair under 'en' and the two translations,
# in the same order, under 'ms'.
translated_test_examples = []
for pair in tqdm(test_examples):
    batch = [sentence for sentence in pair if len(sentence.split()) <= 100]
    # If the 100-word limit removed any sentence, skip the whole pair:
    # otherwise 'ms' would hold fewer entries than 'en' (misaligned record),
    # or the tokenizer would be handed an empty batch.
    if len(batch) != len(pair):
        continue

    # Prompt prefix is Malay for "translate English to Malay".
    prompts = ['terjemah Inggeris ke Melayu: ' + sentence for sentence in batch]
    # Put the encoded batch on whatever device the model currently lives on.
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)

    translated_tokens = model.generate(**inputs, max_length=500)
    decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

    translated_test_examples.append({'en': pair, 'ms': decoded})

100%|███████████████████████████████████████| 3536/3536 [07:20<00:00,  8.03it/s]


In [18]:
# Translate every dev pair to Malay, one pair per forward pass.
# Each record keeps the English pair under 'en' and the two translations,
# in the same order, under 'ms'.
translated_dev_examples = []
for pair in tqdm(dev_examples):
    batch = [sentence for sentence in pair if len(sentence.split()) <= 100]
    # If the 100-word limit removed any sentence, skip the whole pair:
    # otherwise 'ms' would hold fewer entries than 'en' (misaligned record),
    # or the tokenizer would be handed an empty batch.
    if len(batch) != len(pair):
        continue

    # Prompt prefix is Malay for "translate English to Malay".
    prompts = ['terjemah Inggeris ke Melayu: ' + sentence for sentence in batch]
    # Put the encoded batch on whatever device the model currently lives on.
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)

    translated_tokens = model.generate(**inputs, max_length=500)
    decoded = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

    translated_dev_examples.append({'en': pair, 'ms': decoded})

100%|███████████████████████████████████████| 3539/3539 [07:22<00:00,  7.99it/s]


In [20]:
# Re-inspect the first translated training record before writing the files below.
translated_train_examples[0]

{'en': ('The NBA season of 1975 -- 76 was the 30th season of the National Basketball Association .',
  'The 1975 -- 76 season of the National Basketball Association was the 30th season of the NBA .'),
 'ms': ['Musim NBA 1975-76 adalah musim ke-30 Persatuan Bola Keranjang Kebangsaan.',
  'Musim 1975-76 Persatuan Bola Keranjang Kebangsaan adalah musim ke-30 NBA.']}

In [21]:
import json

# Write the translated training records to disk as a single JSON array.
with open('paws-train.json', 'w') as out_file:
    json.dump(translated_train_examples, out_file)

In [22]:
# Write the translated test records to disk as a single JSON array.
with open('paws-test.json', 'w') as out_file:
    json.dump(translated_test_examples, out_file)

In [23]:
# Write the translated dev records to disk as a single JSON array.
with open('paws-dev.json', 'w') as out_file:
    json.dump(translated_dev_examples, out_file)