<a href="https://colab.research.google.com/github/nonotoy/poysuwop/blob/main/Poysuwop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Poysuwop / Translation library

## 0: Library imports

In [None]:
# Library
import glob
import json
import re
import collections
import os
import sys
import warnings
from datetime import datetime

import pandas as pd
import torch
from torch.cuda.amp import GradScaler
from torch.utils.data import Dataset, DataLoader
from transformers import (
    pipeline,
    MBartForConditionalGeneration,
    MBart50Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    RobertaTokenizerFast,
    GenerationConfig,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments
)
from transformers import AdamW, get_linear_schedule_with_warmup
from tokenizers import ByteLevelBPETokenizer

from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import sacrebleu
import MeCab
import gc

from googletrans import Translator
from tenacity import retry, stop_after_attempt, wait_exponential

## 1: Finetuning

In [None]:
class TranslationDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one tokenized sentence pair per row.

    The base class is spelled out as ``torch.utils.data.Dataset`` on purpose:
    this file imports a ``Dataset`` name from both ``torch.utils.data`` and
    ``datasets``, and the later import wins, so the bare name would not refer
    to the torch class that ``DataLoader`` is used with.

    Args:
        df: DataFrame holding one sentence pair per row.
        src_lang: Name of the column containing the source sentences.
        tgt_lang: Name of the column containing the target sentences.
        tokenizer: Callable accepting ``(text, max_length=..., truncation=...,
            padding=..., return_tensors=...)`` and returning a mapping of
            tensors that includes ``"input_ids"``.
        max_length: Length every sequence is truncated/padded to.
    """

    def __init__(self, df, src_lang, tgt_lang, tokenizer, max_length=128):
        self.df = df
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def _encode(self, text):
        """Tokenize one sentence to fixed-length tensors."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the encoded source fields plus ``"labels"`` for row ``idx``.

        Every returned tensor has its leading axis of size 1 removed, so the
        default ``DataLoader`` collation stacks items into 2-D
        ``(batch, max_length)`` tensors rather than 3-D ones.
        """
        row = self.df.iloc[idx]

        # Non-string cells (numbers, NaN, ...) are stringified before encoding.
        source = self._encode(str(row[self.src_lang]))
        target = self._encode(str(row[self.tgt_lang]))

        # return_tensors="pt" yields tensors with a leading axis of 1; drop it.
        item = {key: value.squeeze(0) for key, value in source.items()}

        # NOTE(review): padding positions stay in the labels as pad ids; they
        # are not replaced with an ignore index here - confirm that is wanted.
        item["labels"] = target["input_ids"].squeeze(0)

        return item


def create_finetuned_model(model, train_df, test_df, src_lang, tgt_lang, tokenizer, save_path,
                           num_epochs=30, batch_size=4, learning_rate=2e-5):
    """Fine-tune ``model`` on sentence pairs and save the result to ``save_path``.

    Training runs for at most ``num_epochs`` epochs with AdamW and a linear
    learning-rate schedule without warm-up. After every epoch the mean loss on
    ``test_df`` is printed; from the fourth epoch on, training stops as soon as
    that loss fails to improve on the previous epoch's value.

    Args:
        model: Model to fine-tune; it is moved to the GPU when one is available.
        train_df: DataFrame with the training sentence pairs.
        test_df: DataFrame with the validation sentence pairs.
        src_lang: Column name of the source sentences.
        tgt_lang: Column name of the target sentences.
        tokenizer: Tokenizer applied to both columns.
        save_path: Directory the model is saved to. The tokenizer is saved
            there as well only when ``src_lang == 'ain'``.
        num_epochs: Maximum number of training epochs.
        batch_size: Batch size for both data loaders.
        learning_rate: Initial AdamW learning rate.

    Raises:
        ValueError: If ``train_df`` or ``test_df`` has no rows.
    """
    # Fail before any expensive work; an empty validation set would otherwise
    # surface as a division by zero after a whole training epoch.
    if len(train_df) == 0 or len(test_df) == 0:
        raise ValueError("train_df and test_df must each contain at least one row.")

    # assign GPU to the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    if tokenizer.pad_token_id is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

        # The new token needs an embedding row; grow the matrix only when the
        # vocabulary actually outgrew it so an over-sized matrix is not shrunk.
        if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
            model.resize_token_embeddings(len(tokenizer))

    model.config.pad_token_id = tokenizer.pad_token_id

    # Stringify the text columns on copies so the caller's frames stay untouched
    text_columns = [src_lang, tgt_lang]
    train_df = train_df.copy()
    test_df = test_df.copy()
    train_df[text_columns] = train_df[text_columns].astype(str)
    test_df[text_columns] = test_df[text_columns].astype(str)

    # Datasets
    train_dataset = TranslationDataset(train_df, src_lang, tgt_lang, tokenizer)
    eval_dataset = TranslationDataset(test_df, src_lang, tgt_lang, tokenizer)

    # DataLoader (pinned memory is only requested when batches go to a GPU)
    use_pinned_memory = device.type == "cuda"
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=use_pinned_memory)
    eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False, pin_memory=use_pinned_memory)

    # Params
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    total_steps = len(train_dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    model.train()
    previous_eval_loss = float('inf')

    for epoch in range(num_epochs):
        for batch in train_dataloader:
            optimizer.zero_grad()
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

        # Validation step
        model.eval()
        eval_loss = 0
        with torch.no_grad():
            for batch in eval_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                eval_loss += outputs.loss.item()

        eval_loss /= len(eval_dataloader)
        print(f"Epoch {epoch+1} completed: Validation Loss: {eval_loss}")
        model.train()

        # Early stopping logic: the first three epochs always run to completion
        if epoch > 2 and eval_loss >= previous_eval_loss:
            print("Early stopping triggered.")
            break

        previous_eval_loss = eval_loss

    # Leave the model in inference mode so later generation is not affected by
    # training-only behaviour such as dropout.
    model.eval()

    # Save model & tokenizer
    model.save_pretrained(save_path)

    if src_lang == 'ain':
        tokenizer.save_pretrained(save_path)

    # Release memory and cache
    gc.collect()
    torch.cuda.empty_cache()

    print('Done')


# Tokenize function
def tokenize(texts, tokenizer):
    """Encode a batch of source/target texts for sequence-to-sequence training.

    Args:
        texts: Mapping with a ``'source'`` and a ``'target'`` entry.
        tokenizer: Callable applied to each entry with a maximum length of 128,
            truncation, ``"max_length"`` padding and ``"pt"`` tensors.

    Returns:
        The tokenizer output for the source texts, extended with a ``"labels"``
        entry holding the ``"input_ids"`` of the tokenized target texts.
    """
    source_texts, target_texts = texts['source'], texts['target']

    # Both sides are encoded with exactly the same settings.
    encode_options = {
        "max_length": 128,
        "truncation": True,
        "padding": "max_length",
        "return_tensors": "pt",
    }

    encoded = tokenizer(source_texts, **encode_options)
    encoded_targets = tokenizer(target_texts, **encode_options)

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

## 2: Translate

In [None]:
def translate(text, src_lang, forward_model, forward_tokenizer, tgt_lang, backward_model, backward_tokenizer):
    """Translate a single text with the model matching ``src_lang``.

    Args:
        text: Text to translate.
        src_lang: ``'ain'`` selects the forward model/tokenizer, ``'jpn'`` the
            backward model/tokenizer.
        forward_model: Model used when ``src_lang == 'ain'``.
        forward_tokenizer: Tokenizer paired with ``forward_model``.
        tgt_lang: Accepted for signature compatibility; not used.
        backward_model: Model used when ``src_lang == 'jpn'``.
        backward_tokenizer: Tokenizer paired with ``backward_model``.

    Returns:
        The decoded first generated sequence, special tokens removed.

    Raises:
        ValueError: If ``src_lang`` is neither ``'ain'`` nor ``'jpn'``.
    """
    if src_lang == 'ain':
        model = forward_model
        tokenizer = forward_tokenizer

    elif src_lang == 'jpn':
        model = backward_model
        tokenizer = backward_tokenizer

    else:
        raise ValueError("src_lang must be 'ain' or 'jpn'.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # A model that was just fine-tuned may still be in training mode; switch
    # to inference mode so training-only layers do not perturb the output.
    model.eval()

    # Tokenize
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True, padding=True).to(device)

    # Translate
    translated_tokens = model.generate(
        **inputs,
        max_length=128,
        num_beams=4,
        early_stopping=True,
        decoder_start_token_id=model.config.decoder_start_token_id,
    )

    # Decode
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

    return translated_text


def translate_batch(texts, model, tokenizer, max_length=1024, num_beams=4):
    """Translate a list of strings with ``model`` in a single batch.

    Args:
        texts: List of strings to translate.
        model: Generation model; it is moved to the GPU when one is available.
        tokenizer: Tokenizer paired with ``model``.
        max_length: Limit applied both to the tokenized inputs and to the
            generated sequences.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        List of decoded translations, one per input text; an empty list when
        ``texts`` is empty.

    Raises:
        TypeError: If ``texts`` is not a list or contains a non-string element.
    """
    if not isinstance(texts, list):
        raise TypeError("texts must be a list of strings")
    if not all(isinstance(text, str) for text in texts):
        raise TypeError("Each element in texts must be a string")

    # Nothing to translate: avoid handing an empty batch to the tokenizer.
    if not texts:
        return []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Enable gradient checkpointing: forward-pass results are discarded and
    # recomputed when gradients are needed during the backward pass.
    # NOTE(review): this looks relevant to training only; it is kept so the
    # model is left configured exactly as before - confirm it is still wanted.
    model.gradient_checkpointing_enable()

    # Generate in inference mode even if the model was left in training mode.
    model.eval()

    # Tokenize
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
        add_special_tokens=False
    ).to(device)

    # Translate; mixed precision is only switched on when running on a GPU
    with torch.amp.autocast(device_type=device.type, enabled=device.type == "cuda"):
        translated_tokens = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            decoder_start_token_id=model.config.decoder_start_token_id
        )

    # Decode
    translated_texts = tokenizer.batch_decode(
        translated_tokens,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
        errors='ignore')

    return translated_texts

## 3: Cyclic translate

In [None]:
# ain-jpn-ain
def cyclic_translate(df, src_lang, forward_model, forward_tokenizer, backward_model, backward_tokenizer, batch_size=32):
    """Translate a corpus column forward and then back again, batch by batch.

    Each batch of ``df[src_lang]`` is translated with the forward model and the
    result is translated back with the backward model.

    Args:
        df: Corpus DataFrame; it is modified in place and also returned.
        src_lang: Name of the column holding the texts to translate.
        forward_model: Model for the first translation step.
        forward_tokenizer: Tokenizer paired with ``forward_model``.
        backward_model: Model for the back-translation step.
        backward_tokenizer: Tokenizer paired with ``backward_model``.
        batch_size: Number of texts handled per ``translate_batch`` call.

    Returns:
        ``df`` with the forward translations in ``'translated_jpn'`` and the
        back-translations in ``'backtranslated_ain'`` (these column names are
        fixed, whatever ``src_lang`` is).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    forward_model.to(device)
    backward_model.to(device)

    # Every cell is stringified so non-text values cannot break tokenization
    src_texts = [str(text) for text in df[src_lang].tolist()]

    total_batches = -(-len(src_texts) // batch_size)  # ceiling division

    # Report roughly every 1000 texts; the floor of 1 keeps the modulo below
    # valid when batch_size exceeds 1000.
    report_every = max(1, 1000 // batch_size)

    translated_texts = []
    backtranslated_texts = []

    for batch_no, start in enumerate(range(0, len(src_texts), batch_size), start=1):

        if (batch_no - 1) % report_every == 0 or batch_no == total_batches:
            print(f"Processing batch {batch_no} / {total_batches}")

        batch_texts = src_texts[start:start + batch_size]

        # Translate from ain to jpn
        translated_batch = translate_batch(batch_texts, forward_model, forward_tokenizer)

        # Back translate from jpn to ain
        backtranslated_batch = translate_batch(translated_batch, backward_model, backward_tokenizer)

        translated_texts.extend(translated_batch)
        backtranslated_texts.extend(backtranslated_batch)

    # add to df
    df['translated_jpn'] = translated_texts
    df['backtranslated_ain'] = backtranslated_texts

    # Release memory and cache
    gc.collect()
    torch.cuda.empty_cache()

    return df

In [None]:
# Retry up to 3 times with exponential backoff
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
)
def translate_with_retry(text, src, dest):
    """Translate ``text`` from ``src`` to ``dest`` using a fresh ``Translator``.

    The ``retry`` decorator is configured to stop after three attempts and to
    wait between attempts with an exponential back-off bounded to 4-10.

    Args:
        text: Text to translate.
        src: Source language code passed to the translator.
        dest: Destination language code passed to the translator.

    Returns:
        Whatever ``Translator.translate`` returns for the request.
    """
    # A new client is built on every call, so each attempt starts clean.
    return Translator().translate(text, src=src, dest=dest)

# jpn-eng-jpn
def cyclic_google_translate(df):
    """Round-trip every ``'jpn'`` sentence through English via ``translate_with_retry``.

    Results are written row by row, so rows handled before a failure keep
    their translations.

    Args:
        df: DataFrame with a ``'jpn'`` column; it is modified in place.

    Returns:
        ``df`` with the English translation in ``'translated_eng'`` and the
        translation back to Japanese in ``'backtranslated_jpn'``.
    """
    total = len(df)

    # Progress is tracked by row position: index labels need not start at 0 or
    # be consecutive, and a label can never equal len(df) on a default index.
    for position, (label, row) in enumerate(df.iterrows()):
        original = row['jpn']

        translated = translate_with_retry(original, src='ja', dest='en')
        back_translated = translate_with_retry(translated.text, src='en', dest='ja')

        df.loc[label, 'translated_eng'] = translated.text
        df.loc[label, 'backtranslated_jpn'] = back_translated.text

        if position % 5000 == 0 or position + 1 == total:
            print(position + 1, '/', total)

    # Release memory and cache
    gc.collect()
    torch.cuda.empty_cache()

    print('done')

    return df

## 4: Evaluation

In [None]:
def evaluate(df):
    """Score the back-translations in ``df`` against the original sentences.

    ``df['backtranslated_ain']`` is used as the hypotheses and ``df['ain']`` as
    the single reference set; both columns are converted to strings first.

    Args:
        df: DataFrame with ``'ain'`` and ``'backtranslated_ain'`` columns.

    Returns:
        Dict with the keys ``'BLEU'``, ``'chrF'`` and ``'TER'``, each holding
        the ``score`` attribute of the corresponding sacrebleu corpus result.
    """
    reference_sets = [df['ain'].astype(str).tolist()]
    hypotheses = df['backtranslated_ain'].astype(str).tolist()

    # Metric label -> sacrebleu corpus-level scoring function
    scorers = (
        ('BLEU', sacrebleu.corpus_bleu),
        ('chrF', sacrebleu.corpus_chrf),
        ('TER', sacrebleu.corpus_ter),
    )

    return {
        label: scorer(hypotheses, reference_sets).score
        for label, scorer in scorers
    }

## 5: Merge backtranslated text to original df

In [None]:
def merge_backtranslated_text(df, original_df, cycle, dataset):
    """Append back-translated rows from ``df`` to ``original_df``.

    Args:
        df: DataFrame holding the translation results of one cycle.
        original_df: Corpus the new rows are appended to. Its cycle-marker
            column is set to ``'0'`` in place.
        cycle: Cycle number; stored as a string on the appended rows.
        dataset: ``'ain'`` uses ``translated_jpn`` / ``backtranslated_ain`` and
            the marker ``src_backtranslated_cycles``; ``'jpn'`` uses
            ``translated_eng`` / ``backtranslated_jpn`` and the marker
            ``tgt_backtranslated_cycles``.

    Returns:
        ``original_df`` followed by the renamed rows taken from ``df``, with a
        fresh index. When ``df`` lacks one of the required translation columns,
        ``df`` itself is returned unchanged.

    Raises:
        ValueError: If ``dataset`` is neither ``'ain'`` nor ``'jpn'``.
    """
    # Per-dataset layout: required columns, columns to copy, their new names,
    # and the name of the cycle-marker column.
    if dataset == 'ain':
        required = ('backtranslated_ain', 'translated_jpn')
        selected = ['no.', 'translated_jpn', 'backtranslated_ain']
        renamed = ['no.', 'jpn', 'ain']
        cycle_column = 'src_backtranslated_cycles'
    elif dataset == 'jpn':
        required = ('backtranslated_jpn', 'translated_eng')
        selected = ['no.', 'jpn', 'translated_eng', 'backtranslated_jpn']
        renamed = ['no.', 'jpn', 'eng', 'jpn-bktr']
        cycle_column = 'tgt_backtranslated_cycles'
    else:
        raise ValueError("dataset must be 'ain' or 'jpn'.")

    # Without translation results there is nothing to merge.
    if any(column not in df.columns for column in required):
        return df

    additions = df[selected].copy()
    additions.columns = renamed

    # Tag the new rows with the cycle that produced them ...
    additions[cycle_column] = str(cycle)

    # ... and mark every row of the original corpus as cycle '0'.
    # NOTE(review): this overwrites any cycle markers already present in
    # original_df - confirm it never carries rows from earlier cycles.
    original_df[cycle_column] = '0'

    # Sentence numbers are left-padded with zeros to eight characters.
    additions['no.'] = additions['no.'].astype(str).apply(lambda number: number.zfill(8))

    return pd.concat([original_df, additions], ignore_index=True)

## 6: Create Path

In [None]:
def create_path(csv_prefix, direction, cycle):
    """Build the model and data file paths for one training cycle.

    Args:
        csv_prefix: Sub-directory name used below ``models/``, ``temp/`` and
            ``translation_result/``.
        direction: Label placed at the start of every file or model name.
        cycle: Cycle number, appended as ``c{cycle}``.

    Returns:
        A 5-tuple ``(forward_model_path, backward_model_path, train_file,
        test_file, result_file)``.
    """
    source, target = 'ain', 'jpn'

    # Forward is ain-jpn, backward is jpn-ain.
    model_paths = tuple(
        f'models/{csv_prefix}/{direction}_{first}-{second}_c{cycle}'
        for first, second in ((source, target), (target, source))
    )

    # Train/test splits and the translation result of this cycle.
    data_paths = (
        f'temp/{csv_prefix}/{direction}_c{cycle}_train.txt',
        f'temp/{csv_prefix}/{direction}_c{cycle}_test.txt',
        f'translation_result/{csv_prefix}/{direction}_c{cycle}.txt',
    )

    return (*model_paths, *data_paths)