In [1]:
!pip -q install evaluate sacrebleu sentencepiece

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/100.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os, time, json
import pandas as pd
import numpy as np
import torch
import evaluate
from datasets import Dataset

from transformers import (
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

In [33]:
SEED = 42
MAX_CHARS = 500
MAX_SRC_LEN = 80
MAX_TGT_LEN = 80

In [34]:
GEN_MAX_LEN = 56
NUM_BEAMS = 1  # greedy for speed + consistency

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device =", device)

device = cuda


In [6]:
BASE_DIR = "/content/drive/MyDrive/dataset_splits_opus100_10k"
PAIR_DIRS = {
    "en_ko": f"{BASE_DIR}/en_ko",
    "en_id": f"{BASE_DIR}/en_id",
    "en_vi": f"{BASE_DIR}/en_vi",
}

In [7]:
def load_pair_split(pair_folder, split):
    path = f"{pair_folder}/{split}.csv"
    df = pd.read_csv(path).dropna()

    # whitespace cleaning
    df["source"] = df["source"].astype(str).str.strip()
    df["target"] = df["target"].astype(str).str.strip()

    # remove empty samples
    df = df[(df["source"] != "") & (df["target"] != "")]

    # max characters per sentence
    df = df[(df["source"].str.len() <= MAX_CHARS) & (df["target"].str.len() <= MAX_CHARS)]
    return df

In [8]:
def make_bidir(df, src_code, tgt_code):
    forward = pd.DataFrame({
        "src_text": df["source"],
        "tgt_text": df["target"],
        "src_lang": src_code,
        "tgt_lang": tgt_code
    })
    backward = pd.DataFrame({
        "src_text": df["target"],
        "tgt_text": df["source"],
        "src_lang": tgt_code,
        "tgt_lang": src_code
    })
    return pd.concat([forward, backward], ignore_index=True)

In [9]:
def build_all_splits():
    train_parts, val_parts, test_parts = [], [], []

    # en-ko
    train_ko = load_pair_split(PAIR_DIRS["en_ko"], "train")
    val_ko   = load_pair_split(PAIR_DIRS["en_ko"], "val")
    test_ko  = load_pair_split(PAIR_DIRS["en_ko"], "test")
    train_parts.append(make_bidir(train_ko, "en_XX", "ko_KR"))
    val_parts.append(make_bidir(val_ko, "en_XX", "ko_KR"))
    test_parts.append(make_bidir(test_ko, "en_XX", "ko_KR"))

    # en-id
    train_id = load_pair_split(PAIR_DIRS["en_id"], "train")
    val_id   = load_pair_split(PAIR_DIRS["en_id"], "val")
    test_id  = load_pair_split(PAIR_DIRS["en_id"], "test")
    train_parts.append(make_bidir(train_id, "en_XX", "id_ID"))
    val_parts.append(make_bidir(val_id, "en_XX", "id_ID"))
    test_parts.append(make_bidir(test_id, "en_XX", "id_ID"))

    # en-vi
    train_vi = load_pair_split(PAIR_DIRS["en_vi"], "train")
    val_vi   = load_pair_split(PAIR_DIRS["en_vi"], "val")
    test_vi  = load_pair_split(PAIR_DIRS["en_vi"], "test")
    train_parts.append(make_bidir(train_vi, "en_XX", "vi_VN"))
    val_parts.append(make_bidir(val_vi, "en_XX", "vi_VN"))
    test_parts.append(make_bidir(test_vi, "en_XX", "vi_VN"))

    train_all = pd.concat(train_parts, ignore_index=True)
    val_all   = pd.concat(val_parts, ignore_index=True)
    test_all  = pd.concat(test_parts, ignore_index=True)

    return train_all, val_all, test_all

In [10]:
train_all, val_all, test_all = build_all_splits()

print("Train:", train_all.shape, "Val:", val_all.shape, "Test:", test_all.shape)
print("Train tgt_lang counts:\n", train_all["tgt_lang"].value_counts())

hf_train = Dataset.from_pandas(train_all.reset_index(drop=True))
hf_val   = Dataset.from_pandas(val_all.reset_index(drop=True))
hf_test  = Dataset.from_pandas(test_all.reset_index(drop=True))

Train: (48000, 4) Val: (6000, 4) Test: (6000, 4)
Train tgt_lang counts:
 tgt_lang
en_XX    24000
ko_KR     8000
id_ID     8000
vi_VN     8000
Name: count, dtype: int64


In [11]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tok = MBart50TokenizerFast.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [12]:
def preprocess(ex):
    tok.src_lang = ex["src_lang"]
    model_in = tok(ex["src_text"], max_length=MAX_SRC_LEN, truncation=True)
    labels = tok(text_target=ex["tgt_text"], max_length=MAX_TGT_LEN, truncation=True)
    model_in["labels"] = labels["input_ids"]
    return model_in

In [13]:
tokenized_train = hf_train.map(preprocess, remove_columns=hf_train.column_names)
tokenized_val   = hf_val.map(preprocess, remove_columns=hf_val.column_names)
tokenized_test  = hf_test.map(preprocess, remove_columns=hf_test.column_names)

Map:   0%|          | 0/48000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [14]:
TRAIN_N = 20000
VAL_N   = 2000
TEST_N  = 2000

In [15]:
tokenized_train_small = tokenized_train.select(range(min(TRAIN_N, len(tokenized_train))))
tokenized_val_small   = tokenized_val.select(range(min(VAL_N, len(tokenized_val))))
tokenized_test_small  = tokenized_test.select(range(min(TEST_N, len(tokenized_test))))

In [16]:
hf_test_small = hf_test.select(range(min(TEST_N, len(hf_test))))

In [17]:
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)
model.gradient_checkpointing_enable()
model.config.use_cache = False

data_collator = DataCollatorForSeq2Seq(tokenizer=tok, model=model)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/516 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [18]:
bleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [19]:
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tok.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tok.pad_token_id)
    decoded_labels = tok.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [l.strip() for l in decoded_labels]
    refs = [[l] for l in decoded_labels]

    return {
        "bleu": bleu.compute(predictions=decoded_preds, references=refs)["score"],
        "chrf": chrf.compute(predictions=decoded_preds, references=refs)["score"],
    }

In [20]:
def eval_direction_scores(model_obj, dataset_obj, src_code, tgt_code,
                          max_n=500, batch_size=8,
                          num_beams=NUM_BEAMS, gen_max_len=GEN_MAX_LEN):
    subset = dataset_obj.filter(lambda x: x["src_lang"] == src_code and x["tgt_lang"] == tgt_code)
    subset = subset.select(range(min(max_n, len(subset))))

    if len(subset) == 0:
        return {"pair": f"{src_code}->{tgt_code}", "n": 0, "bleu": None, "chrf": None, "sents_per_sec": None}

    tok.src_lang = src_code
    forced_id = tok.lang_code_to_id[tgt_code]

    preds, refs = [], []
    model_obj.eval()

    start = time.time()
    for i in range(0, len(subset), batch_size):
        batch = subset[i:i+batch_size]
        inputs = tok(
            batch["src_text"],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_SRC_LEN
        ).to(model_obj.device)

        with torch.no_grad():
            gen = model_obj.generate(
                **inputs,
                forced_bos_token_id=forced_id,
                num_beams=num_beams,
                max_length=gen_max_len
            )

        preds.extend(tok.batch_decode(gen, skip_special_tokens=True))
        refs.extend(batch["tgt_text"])
    elapsed = max(1e-9, time.time() - start)

    preds = [p.strip() for p in preds]
    refs  = [[r.strip()] for r in refs]

    return {
        "pair": f"{src_code}->{tgt_code}",
        "n": len(subset),
        "bleu": bleu.compute(predictions=preds, references=refs)["score"],
        "chrf": chrf.compute(predictions=preds, references=refs)["score"],
        "sents_per_sec": len(subset) / elapsed
    }

In [21]:
pairs = [
    ("en_XX","ko_KR"), ("ko_KR","en_XX"),
    ("en_XX","id_ID"), ("id_ID","en_XX"),
    ("en_XX","vi_VN"), ("vi_VN","en_XX"),
]

In [28]:
BASELINE_OUT = "/content/drive/MyDrive/mbart50_baseline_per_direction.csv"

baseline_rows = [eval_direction_scores(model, hf_test, s, t, max_n=1000) for s, t in pairs]
baseline_df = pd.DataFrame(baseline_rows).sort_values("pair").reset_index(drop=True)
print("=== BASELINE per direction (TEST) ===")
display(baseline_df)
baseline_df.to_csv(BASELINE_OUT, index=False)
print("Saved:", BASELINE_OUT)

Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

=== BASELINE per direction (TEST) ===


Unnamed: 0,pair,n,bleu,chrf,sents_per_sec
0,en_XX->id_ID,1000,19.45452,44.253481,12.235795
1,en_XX->ko_KR,1000,1.574985,10.95345,7.407268
2,en_XX->vi_VN,1000,18.663508,35.906243,10.234926
3,id_ID->en_XX,1000,26.392685,43.621882,10.308231
4,ko_KR->en_XX,1000,10.998561,28.333688,10.781839
5,vi_VN->en_XX,1000,22.19144,40.231251,11.643287


Saved: /content/drive/MyDrive/mbart50_baseline_per_direction.csv


In [37]:
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/mbart50_output",

    eval_strategy="steps",
    eval_steps=2000,

    save_strategy="steps",
    save_steps=2000,
    save_total_limit=2,

    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=1,

    predict_with_generate=False,
    generation_max_length=GEN_MAX_LEN,
    generation_num_beams=NUM_BEAMS,

    fp16=torch.cuda.is_available(),
    dataloader_num_workers=0,
    logging_steps=50,
    report_to="none",
)

In [38]:
import gc, torch
gc.collect()
torch.cuda.empty_cache()

In [39]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_small,
    eval_dataset=tokenized_val_small,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tok
)

train_start = time.time()
trainer.train()
train_time_sec = time.time() - train_start

Step,Training Loss,Validation Loss


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

In [40]:
# Save best/final model
save_dir = "/content/drive/MyDrive/mbart50_finetuned"
trainer.save_model(save_dir)
tok.save_pretrained(save_dir)
print("Saved to:", save_dir)

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Saved to: /content/drive/MyDrive/mbart50_finetuned


In [41]:
# Model size (MB)
model_path = os.path.join(save_dir, "model.safetensors")
if os.path.exists(model_path):
    model_size_mb = os.path.getsize(model_path) / (1024**2)
else:
    # fallback: directory size
    total = 0
    for root, _, files in os.walk(save_dir):
        for f in files:
            total += os.path.getsize(os.path.join(root, f))
    model_size_mb = total / (1024**2)

print(f"Training time (sec): {train_time_sec:.2f}")
print(f"Model size (MB): {model_size_mb:.2f}")

Training time (sec): 4871.16
Model size (MB): 2331.33


In [42]:
AFTER_OUT = "/content/drive/MyDrive/mbart50_after_per_direction.csv"
COMPARE_OUT = "/content/drive/MyDrive/mbart50_before_after_per_direction.csv"

In [43]:
ft_model = trainer.model.to(device)

In [44]:
after_rows = [eval_direction_scores(ft_model, hf_test, s, t, max_n=1000) for s, t in pairs]
after_df = pd.DataFrame(after_rows).sort_values("pair").reset_index(drop=True)
print("=== AFTER finetune per direction (TEST) ===")
display(after_df)
after_df.to_csv(AFTER_OUT, index=False)
print("Saved:", AFTER_OUT)

Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

=== AFTER finetune per direction (TEST) ===


Unnamed: 0,pair,n,bleu,chrf,sents_per_sec
0,en_XX->id_ID,1000,22.903741,47.398664,14.493907
1,en_XX->ko_KR,1000,2.843403,12.18443,10.292964
2,en_XX->vi_VN,1000,12.597778,29.362078,6.683987
3,id_ID->en_XX,1000,26.929461,44.543228,12.454038
4,ko_KR->en_XX,1000,11.688947,29.765066,11.921854
5,vi_VN->en_XX,1000,22.526677,40.415974,13.263083


Saved: /content/drive/MyDrive/mbart50_after_per_direction.csv


In [45]:
final_df = baseline_df.merge(after_df, on="pair", suffixes=("_before", "_after"))
final_df["bleu_gain"] = final_df["bleu_after"] - final_df["bleu_before"]
final_df["chrf_gain"] = final_df["chrf_after"] - final_df["chrf_before"]

In [46]:
# Add run stats columns (same for all rows)
final_df["training_time_sec"] = train_time_sec
final_df["model_size_mb"] = model_size_mb

print("=== BEFORE vs AFTER (per direction) ===")
display(final_df)
final_df.to_csv(COMPARE_OUT, index=False)
print("Saved:", COMPARE_OUT)

=== BEFORE vs AFTER (per direction) ===


Unnamed: 0,pair,n_before,bleu_before,chrf_before,sents_per_sec_before,n_after,bleu_after,chrf_after,sents_per_sec_after,bleu_gain,chrf_gain,training_time_sec,model_size_mb
0,en_XX->id_ID,1000,19.45452,44.253481,12.235795,1000,22.903741,47.398664,14.493907,3.449222,3.145184,4871.158945,2331.331909
1,en_XX->ko_KR,1000,1.574985,10.95345,7.407268,1000,2.843403,12.18443,10.292964,1.268417,1.23098,4871.158945,2331.331909
2,en_XX->vi_VN,1000,18.663508,35.906243,10.234926,1000,12.597778,29.362078,6.683987,-6.06573,-6.544165,4871.158945,2331.331909
3,id_ID->en_XX,1000,26.392685,43.621882,10.308231,1000,26.929461,44.543228,12.454038,0.536775,0.921345,4871.158945,2331.331909
4,ko_KR->en_XX,1000,10.998561,28.333688,10.781839,1000,11.688947,29.765066,11.921854,0.690387,1.431378,4871.158945,2331.331909
5,vi_VN->en_XX,1000,22.19144,40.231251,11.643287,1000,22.526677,40.415974,13.263083,0.335237,0.184723,4871.158945,2331.331909


Saved: /content/drive/MyDrive/mbart50_before_after_per_direction.csv


In [49]:
def show_examples(model_obj, dataset_obj, src_code, tgt_code, k=3):
    # filter direction
    subset = dataset_obj.filter(lambda x: x["src_lang"] == src_code and x["tgt_lang"] == tgt_code)

    # remove non-string / empty samples (robust)
    def ok(x):
        s = x.get("src_text", None)
        t = x.get("tgt_text", None)
        if s is None or t is None:
            return False
        if not isinstance(s, str) or not isinstance(t, str):
            return False
        s = s.strip()
        t = t.strip()
        return (s != "") and (t != "")

    subset = subset.filter(ok)
    subset = subset.select(range(min(len(subset), k)))

    if len(subset) == 0:
        print(f"No valid samples for {src_code}->{tgt_code}")
        return

    tok.src_lang = src_code
    forced_id = tok.lang_code_to_id[tgt_code]

    # ✅ ensure list[str]
    src_texts = [str(x).strip() for x in subset["src_text"]]

    inputs = tok(
        src_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_SRC_LEN
    ).to(model_obj.device)

    with torch.no_grad():
        gen = model_obj.generate(
            **inputs,
            forced_bos_token_id=forced_id,
            num_beams=NUM_BEAMS,
            max_length=GEN_MAX_LEN
        )

    preds = tok.batch_decode(gen, skip_special_tokens=True)

    print(f"\n=== Examples {src_code}->{tgt_code} ===")
    for i in range(len(subset)):
        print("SRC :", subset["src_text"][i])
        print("REF :", subset["tgt_text"][i])
        print("OUT :", preds[i])
        print("---")

In [51]:
# Baseline examples
for s, t in pairs:
    show_examples(model, hf_test, s, t, k=2)

# After fine-tune examples
for s, t in pairs:
    show_examples(ft_model, hf_test, s, t, k=2)

Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]


=== Examples en_XX->ko_KR ===
SRC : Are you seeing anyone?
REF : 만나는 사람 있어요?
OUT : 누구라도 봐요?
---
SRC : Karev, you're with me and the donor. Usually we take them at the same time, but donor's life support is fading.
REF : 카레브, 나와 함께 기증자를 맡아 보통은 동시에 진행하지만
OUT : 카레브, 저와 기증자와 함께 하시죠.
---


Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]


=== Examples ko_KR->en_XX ===
SRC : 만나는 사람 있어요?
REF : Are you seeing anyone?
OUT : Anybody you want to meet?
---
SRC : 카레브, 나와 함께 기증자를 맡아 보통은 동시에 진행하지만
REF : Karev, you're with me and the donor. Usually we take them at the same time, but donor's life support is fading.
OUT : Carve, you're with me, you're with the donors, usually at the same time, but
---


Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]


=== Examples en_XX->id_ID ===
SRC : You mean I shall end.
REF : Maksudmu aku akan berakhir.
OUT : Maksudmu aku akan mengakhiri.
---
SRC : I'm a police officer
REF : Aku opsir.
OUT : Saya seorang polisi.
---


Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]


=== Examples id_ID->en_XX ===
SRC : Maksudmu aku akan berakhir.
REF : You mean I shall end.
OUT : You mean I'm going to end.
---
SRC : Aku opsir.
REF : I'm a police officer
OUT : I hate it.
---


Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]


=== Examples en_XX->vi_VN ===
SRC : Yeah, I remember.
REF : Ừ, tao nhớ rồi.
OUT : Yeah, aku nhớ.
---
SRC : - Got your breath back?
REF : - Lấy lại sức chưa?
OUT : - . - . - . - . - . - . - . - . - . - . - . - . - . - . - . - . - . - 
---


Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]


=== Examples vi_VN->en_XX ===
SRC : Ừ, tao nhớ rồi.
REF : Yeah, I remember.
OUT : Yeah, I remember.
---
SRC : - Lấy lại sức chưa?
REF : - Got your breath back?
OUT : - You're back?
---


Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]


=== Examples en_XX->ko_KR ===
SRC : Are you seeing anyone?
REF : 만나는 사람 있어요?
OUT : 누구라도 봐요?
---
SRC : Karev, you're with me and the donor. Usually we take them at the same time, but donor's life support is fading.
REF : 카레브, 나와 함께 기증자를 맡아 보통은 동시에 진행하지만
OUT : 카레브, 저와 기증자와 함께 하시죠.
---


Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]


=== Examples ko_KR->en_XX ===
SRC : 만나는 사람 있어요?
REF : Are you seeing anyone?
OUT : Anybody you want to meet?
---
SRC : 카레브, 나와 함께 기증자를 맡아 보통은 동시에 진행하지만
REF : Karev, you're with me and the donor. Usually we take them at the same time, but donor's life support is fading.
OUT : Carve, you're with me, you're with the donors, usually at the same time, but
---


Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]


=== Examples en_XX->id_ID ===
SRC : You mean I shall end.
REF : Maksudmu aku akan berakhir.
OUT : Maksudmu aku akan mengakhiri.
---
SRC : I'm a police officer
REF : Aku opsir.
OUT : Saya seorang polisi.
---


Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]


=== Examples id_ID->en_XX ===
SRC : Maksudmu aku akan berakhir.
REF : You mean I shall end.
OUT : You mean I'm going to end.
---
SRC : Aku opsir.
REF : I'm a police officer
OUT : I hate it.
---


Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]


=== Examples en_XX->vi_VN ===
SRC : Yeah, I remember.
REF : Ừ, tao nhớ rồi.
OUT : Yeah, aku nhớ.
---
SRC : - Got your breath back?
REF : - Lấy lại sức chưa?
OUT : - . - . - . - . - . - . - . - . - . - . - . - . - . - . - . - . - . - 
---


Filter:   0%|          | 0/6000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]


=== Examples vi_VN->en_XX ===
SRC : Ừ, tao nhớ rồi.
REF : Yeah, I remember.
OUT : Yeah, I remember.
---
SRC : - Lấy lại sức chưa?
REF : - Got your breath back?
OUT : - You're back?
---
