In [None]:
! pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece]

In [None]:
# Hugging Face Hub id of the pretrained checkpoint; reused below to load both
# the tokenizer and the seq2seq model.
model_checkpoint = "Helsinki-NLP/opus-mt-th-en"

In [None]:
# Mount Google Drive at /content/drive so the CSV files can be read by path.
# This import is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from datasets import load_dataset, load_metric, DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [None]:
def pre_process_from_csv(path):
    """Load a CSV file and reshape it into the translation-pair layout.

    Every CSV row becomes one dict (column name -> value) stored in the
    ``translation`` column; the ``subdataset`` column is filled with the
    constant tag ``'LST_Corpus'``.

    Args:
        path: Location of the CSV file, as accepted by ``pd.read_csv``.

    Returns:
        A ``pd.DataFrame`` with the columns ``translation`` and ``subdataset``,
        one row per CSV row.
    """
    records = pd.read_csv(path).to_dict('records')
    return pd.DataFrame({
        "translation": records,
        "subdataset": ['LST_Corpus'] * len(records),
    })


In [None]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to `model_checkpoint`; it is used for
# preprocessing, metric decoding and inference below.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# SacreBLEU metric object; compute_metrics calls it during evaluation.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in favour
# of the separate `evaluate` package — confirm the installed version still provides it.
metric = load_metric("sacrebleu")

In [None]:
# Build the training and validation frames from the CSV files on the mounted Drive.
# NOTE(review): these are absolute Drive paths — adjust them if the files live elsewhere.
dict_lst = pre_process_from_csv('/content/drive/MyDrive/SuperAI/Week5_MT/df_train.csv')
dict_valid = pre_process_from_csv('/content/drive/MyDrive/SuperAI/Week5_MT/df_valid.csv')

In [None]:
# Scan the training pairs for double-quote characters and for rows that cannot
# be inspected at all.
#   fraud_th_idx: indices whose Thai text contains '"'
#   fraud_en_idx: indices whose English text contains '"'
#   fraud_list:   indices whose pair is malformed (missing key or non-string value)
fraud_th_idx = []
fraud_en_idx = []
fraud_list = []
# Only the 'translation' column is needed, so iterate it directly instead of
# materialising a full row Series for every record.
for idx, pair in dict_lst['translation'].items():
    try:
        if '"' in pair['th']:
            fraud_th_idx.append(idx)
        if '"' in pair['en']:
            fraud_en_idx.append(idx)
    except (KeyError, TypeError):
        # KeyError: a language key is missing; TypeError: the value (or the pair
        # itself) is not a string/dict, e.g. NaN from an empty CSV cell.
        # Anything else (including KeyboardInterrupt) is allowed to propagate.
        fraud_list.append(idx)

In [None]:
# Indices of the rows that the scan above could not inspect.
fraud_list

[]

In [None]:
# Remove the rows listed in `fraud_list`. Rebinding the name (instead of mutating
# in place) and ignoring labels that are already gone keeps this cell safe to re-run.
dict_lst = dict_lst.drop(index=fraud_list, errors="ignore")

In [None]:
# Display the training frame after the drop above.
dict_lst

Unnamed: 0,translation,subdataset
0,{'th': 'โอ้ คุณ รู้ เมื่อ ฉัน ตื่น ฉัน จะ ออ...,LST_Corpus
1,{'th': 'ในที่สุด ฉัน ก็มี โอกาส ได้ พบ เชอร์ เ...,LST_Corpus
2,{'th': 'การ กิน สินบน โดย ผู้ พิพากษา นั้น อาจ...,LST_Corpus
3,{'th': 'ดูเหมือนว่า ฉัน ลืม นำ ไข่ปลา คาร์ เ ว...,LST_Corpus
4,{'th': 'แบบ อักษร ที่ จะใช้ สำหรับ หัว และ ท้า...,LST_Corpus
...,...,...
103420,{'th': 'ฉัน เสียใจ กล่อง อาหาร กลาง วัน พิเศษ ...,LST_Corpus
103421,{'th': 'เธอ ไม่ ได้ รับ การ ยอมรับ ให้ อยู่ ใน...,LST_Corpus
103422,{'th': 'ถนนหนทาง ใน กรุงเทพมหานคร แน่น แ ด้วย ...,LST_Corpus
103423,{'th': 'ข้อ บกพร่อง ของ วิธี นี้ ก็ คือ การ ขา...,LST_Corpus


In [None]:
# Convert both frames to Hugging Face datasets. `preserve_index=False` stops the
# pandas index from leaking into the dataset as an extra `__index_level_0__`
# column, so the train and validation splits expose the same features.
tds = Dataset.from_pandas(dict_lst, preserve_index=False)
vds = Dataset.from_pandas(dict_valid, preserve_index=False)

# Bundle the splits under the names used by the rest of the notebook.
datasets = DatasetDict({
    'train': tds,
    'validation': vds,
})

In [None]:
# Show the available splits with their columns and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['translation', 'subdataset', '__index_level_0__'],
        num_rows: 103425
    })
    validation: Dataset({
        features: ['translation', 'subdataset'],
        num_rows: 1018
    })
})

In [None]:
# Maximum number of tokens kept for source and target sequences; longer
# sequences are truncated during tokenization.
max_input_len  = 128
max_target_len = 128

# Text prepended to every source sentence (empty for this checkpoint) and the
# keys of the source / target text inside each 'translation' dict.
prefix= ''
source_lang = 'th'
target_lang = 'en'

def preprocess_function(examples, src_key=source_lang, tgt_key=target_lang):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Batch dict with a 'translation' entry holding a list of
            dicts, each containing the source and target text.
        src_key: Key of the source text in each pair. Bound to `source_lang`
            when the function is defined, so rebinding that global later
            (as the training-arguments cell does) cannot break a re-run.
        tgt_key: Key of the target text in each pair; bound the same way.

    Returns:
        The tokenizer output for the source texts with an added 'labels'
        entry containing the token ids of the target texts.
    """
    inputs = [prefix + ex[src_key] for ex in examples['translation']]
    targets = [ex[tgt_key] for ex in examples['translation']]
    # Truncate to at most `max_input_len` tokens. No padding happens here;
    # batches are padded later by the data collator.
    model_inputs = tokenizer(inputs, max_length=max_input_len, truncation=True)

    # Tokenize the targets in the tokenizer's target-language mode.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_len, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [None]:
# Tokenize every split in batches; the tokenized columns are added next to the
# existing ones.
tokenized_datasets = datasets.map(preprocess_function, batched=True)

  0%|          | 0/104 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Inspect the tokenized splits and their columns.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['translation', 'subdataset', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 103425
    })
    validation: Dataset({
        features: ['translation', 'subdataset', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1018
    })
})

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Load the pretrained seq2seq weights of `model_checkpoint` as the starting
# point for fine-tuning.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Per-device batch size, used for both training and evaluation.
batch_size = 64
# Last path component of the checkpoint id, e.g. "opus-mt-th-en".
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    # Output directory. The language tags are upper-cased only for this name;
    # `source_lang` / `target_lang` themselves are left untouched because
    # preprocess_function uses them as lower-case dictionary keys.
    f"{model_name}-finetuned-{source_lang.upper()}-to-{target_lang.upper()}",
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,  # Keep only the 3 most recent checkpoints; older ones are deleted.
    num_train_epochs=10,
    predict_with_generate=True,  # generate text during evaluation so compute_metrics can score it
)

In [None]:
# Collator that assembles the tokenized examples into batches for the seq2seq
# model; passed to the trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Model Training

In [None]:
def postprocess_text(preds, labels):
    """Normalise decoded text into the shape the BLEU metric expects.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A tuple ``(preds, labels)`` where ``preds`` is a list of stripped
        strings and ``labels`` is a list of one-element lists, each holding
        the stripped reference for the prediction at the same position.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


In [None]:
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation pass.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            be a tuple, in which case its first element is used.

    Returns:
        Dict with ``"bleu"`` (the metric's ``score``) and ``"gen_len"`` (mean
        number of non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and cannot be decoded. It is always present
    # in the labels and can also show up in generated predictions when batches
    # are padded/gathered, so replace it with the pad id in both arrays.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Strip whitespace and wrap each reference in a list for the metric.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    # Length is measured on the cleaned predictions so filler positions do not count.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]

    result = {"bleu": bleu["score"], "gen_len": np.mean(prediction_lens)}
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Wire the model, training arguments, tokenized splits, collator, tokenizer and
# metric callback into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
%time
trainer.train()

# Inference

In [None]:
import os
# Print every file under the trainer's output directory. The path is read from
# `args.output_dir` so it cannot drift from the directory the trainer writes to.
for dirname, _, filenames in os.walk(args.output_dir):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Translate every example of the evaluation split one by one, collecting the
# references and the decoded predictions and printing each triple.
fake_preds_list = list()
fake_labels_list = list()

# Only 'train' and 'validation' are registered in `datasets` above; use a
# 'test' split when one exists and fall back to 'validation' otherwise.
eval_split = 'test' if 'test' in datasets else 'validation'

# Switch off dropout so generation is deterministic.
model.eval()

for example in datasets[eval_split]:
    pair = example['translation']

    # Remove all whitespace from the Thai source.
    # NOTE(review): preprocess_function feeds the source text unchanged during
    # training, so this differs from the training input — confirm it is intended.
    src_text = ''.join(pair['th'].split())
    ground_truth_en = pair['en']

    # Run on whatever device the model lives on instead of assuming CUDA, and
    # truncate over-long inputs to the tokenizer's maximum length.
    encoded = tokenizer(src_text, return_tensors="pt", padding=True, truncation=True).to(model.device)
    translated = model.generate(**encoded)

    # Decode once and reuse the result for both the list and the printed string.
    decoded = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    pred_val = ''.join(decoded)

    # NOTE(review): labels are stored as plain strings and predictions as lists of
    # strings; check this against the input shape of whichever scorer consumes them.
    fake_labels_list.append(ground_truth_en)
    fake_preds_list.append(decoded)

    print(f'Truth TH: \t{src_text}')
    print(f'Truth EN: \t{ground_truth_en}')
    print(f'Pred EN: \t{pred_val}')
    print('\n')
    

In [None]:
# Installs sacrebleu (also listed in the install cell at the top of the notebook).
!pip install sacrebleu

In [None]:
# Sanity check of the metric call: predictions are plain strings, references
# are one list of strings per prediction.
fake_preds = ["hello ther", "genera kenobi"]
fake_labels = [["hello there"], ["general kenobi"]]
metric.compute(predictions=fake_preds, references=fake_labels)

In [None]:
# Installs sacrebleu, which cal_bleu below imports.
!pip install sacrebleu
def cal_bleu(predict, ground_truth):
    """
    Compute the corpus-level BLEU score of predictions against references.

    predict <List> : list of prediction strings ["str1", "str2", "str3", ...]
    ground_truth <List> : list of ground-truth strings ["gt1", "gt2", "gt3", ...],
        aligned one-to-one with `predict`

    Returns the score object produced by sacrebleu's `BLEU.corpus_score`.
    """
    from sacrebleu.metrics import BLEU
    bleu = BLEU()
    # sacrebleu takes the hypotheses first, then a list of reference streams.
    # With one reference per sentence that list is [ground_truth]. BLEU is not
    # symmetric, so swapping the two arguments changes the score.
    return bleu.corpus_score(predict, [ground_truth])

# Small worked example: three reference sentences and three system outputs.
refs = ['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.',]
# Named `hyps` rather than `sys` so the standard-library module name is not shadowed.
hyps = ['The dog bit the man.', "It wasn't surprising.", 'The man had just bitten him.']

cal_bleu(hyps, refs)