<a href="https://colab.research.google.com/github/nonotoy/poysuwop/blob/main/04_Poysuwop_Cyclic_Translation_MBart25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Poysuwop_2 / MBart25

## 1: Initialise

### Library

In [None]:
! pip install transformers[torch] datasets sentencepiece sacremoses sacrebleu mecab-python3 unidic-lite nltk tenacity
! pip install accelerate -U
! pip install googletrans==3.1.0a0
! pip install importnb

In [2]:
# Library
import glob
import json
import re
import collections
import os
import sys

import pandas as pd
import torch
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader
from transformers import (
    pipeline,
    MBartForConditionalGeneration,
    MBart50Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    RobertaTokenizerFast,
    GenerationConfig
)
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import sacrebleu
import MeCab
import gc

from googletrans import Translator
from tenacity import retry, stop_after_attempt, wait_exponential

# Work from the project folder on Google Drive so that the relative file names
# used by later cells (corpus, result files, model directories) resolve there.
# NOTE(review): no cell in view mounts Drive at /content/drive — confirm it is
# mounted before this line runs, otherwise os.chdir raises FileNotFoundError.
os.chdir('/content/drive/MyDrive/Colab Notebooks/Poysuwop')

In [5]:
# Install from Poysuwop.ipynb
from importnb import imports
with imports("ipynb"):
    import Poysuwop as poysuwop

from modules import ainPreprocess

### Dataset

In [3]:
# Language codes; used in output file names and passed to the poysuwop helpers
src_lang = 'ain'
tgt_lang = 'jpn'

# Load dataset: one sentence pair per line, tab-separated as
#   <line no.> \t <Ainu text> \t <Japanese text>
file_path = 'poysuwop_corpus.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

data = [line.strip().split('\t') for line in lines]

# Keep only rows that carry all three fields. Blank lines and rows with a
# missing column would otherwise raise IndexError when indexing parts[2].
rows = [parts[:3] for parts in data if len(parts) >= 3]
skipped = len(data) - len(rows)
if skipped:
    print(f'Skipped {skipped} malformed line(s) in {file_path}')

# zip(*rows) yields nothing for an empty corpus, which cannot be unpacked into
# three names, so fall back to three empty columns in that case.
line_no, ain_txt, jpn_txt = zip(*rows) if rows else ((), (), ())

# Cleanse dataset
# TODO: remove refrain words (sakehe) and similar spans enclosed in {} — not implemented yet


# Store to df
df = pd.DataFrame({
    'no.': line_no,
    'ain': ain_txt,
    'jpn': jpn_txt
})

# Untouched snapshot of the raw corpus; later cells merge back-translated text into it
original_df = df.copy()

### Tokenizer

In [4]:
# Forward:  Ain -> Jpn
# Backward: Jpn -> Ain

# The forward (Ainu-source) side uses a RoBERTa tokenizer loaded from
# "AinuBERTTokenizer"; the backward (Japanese-source) side uses the mBART-50
# tokenizer with Japanese set as its source language.
# NOTE(review): the backward tokenizer comes from facebook/mbart-large-50 while
# the model cells load facebook/mbart-large-cc25, and the forward tokenizer is
# not an mBART tokenizer at all — confirm the vocabularies / embedding sizes
# are reconciled inside the poysuwop helpers.
forward_tokenizer = RobertaTokenizerFast.from_pretrained("AinuBERTTokenizer")
backward_tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang='ja_XX')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

### Model

In [None]:
# Setup model
# Base checkpoint: pretrained facebook/mbart-large-cc25 weights. `model` is the
# object later cells pass to cyclic_translate and create_finetuned_model.
model_name = 'mbart-large-cc25'
model = MBartForConditionalGeneration.from_pretrained(f'facebook/{model_name}')

# csv prefix
# Prefix shared by the result files and fine-tuned model directories that
# later cells write and read.
csv_prefix = 'MBart25'

## 2: Vanilla

#### Setup

In [None]:
# Cycle 0: the pretrained checkpoint without any fine-tuning
cycle = 0

# Setup model
# Loads the same checkpoint as the "Model" cell again, so `model` holds freshly
# loaded pretrained weights whenever this cell is (re-)run.
model_name = 'mbart-large-cc25'
model = MBartForConditionalGeneration.from_pretrained(f'facebook/{model_name}')

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

#### Cyclic Translate

In [None]:
# Cyclic translate
# The same pretrained `model` is passed for both the forward and the backward
# direction; only the tokenizers differ.
# NOTE(review): `df` is overwritten with the helper's return value, so
# re-running this cell feeds the previous result back in — confirm that is
# acceptable for poysuwop.cyclic_translate.
df = poysuwop.cyclic_translate(
    df,
    src_lang,
    model,
    forward_tokenizer,
    model,
    backward_tokenizer
)

# Save csv
# Written with an explicit '\n' line terminator; readers of this file pass the
# same lineterminator to pd.read_csv.
df.to_csv(f'{csv_prefix}_backtranslated_{src_lang}_{cycle}.txt', index=False, encoding='utf-8', sep='\t', lineterminator='\n')

In [None]:
# A hand-rolled loader used to live here, disabled by wrapping it in a string
# literal. It rebuilt `df` from the saved back-translation file one row at a
# time with pd.concat and then promoted the first row to column names.
# It was removed because the bare string was still evaluated (and echoed as the
# cell's output) and because the evaluation cell below loads the file with
# pd.read_csv(..., sep='\t', lineterminator='\n') instead.

#### Evaluation

In [None]:
# Reload the vanilla (cycle 0) round-trip output as a dataframe.
# The file name must match the one the cyclic-translate cell passes to
# df.to_csv ('<prefix>_backtranslated_<lang>_<cycle>.txt'); the previous name
# omitted the '_backtranslated' part and pointed at a file no cell writes.
df = pd.read_csv(
    f'{csv_prefix}_backtranslated_{src_lang}_{cycle}.txt',
    sep='\t',
    lineterminator='\n',  # same terminator the file was written with
)

# Evaluate
# poysuwop.evaluate returns a mapping indexed by metric name
score = poysuwop.evaluate(df)

print(f"BLEU score: {score['BLEU']}")
print(f"TER score: {score['TER']}")
print(f"chrF score: {score['chrF']}")

BLEU score: 11.688427818534283
TER score: 82.33352479719255
chrF score: 26.61296174853196


## 3: Source language corpus augmentation only

### 1: 1st Cycle

#### Setup

In [None]:
# First fine-tuning cycle of the source-only (unidir) experiment
cycle = 1

# Setup model path
# Directories the fine-tuning cells save to and the cyclic-translate cell loads
# from; one per translation direction, tagged with the cycle number.
forward_model_path = f'./{csv_prefix}-finetuned-{src_lang}-{tgt_lang}-unidir_{cycle}'
backward_model_path = f'./{csv_prefix}-finetuned-{tgt_lang}-{src_lang}-unidir_{cycle}'

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

#### Finetune Ain -> Jpn

In [None]:
# Finetune Ain -> Jpn
# Passes the base `model`, the current `df`, the (source, target) language pair
# and the forward tokenizer; the last argument is the directory the
# cyclic-translate cell later loads with from_pretrained.
# NOTE(review): the same `model` object is also handed to the Jpn -> Ain cell.
# If create_finetuned_model trains it in place, that second run starts from
# these fine-tuned weights rather than the pretrained ones — confirm.
poysuwop.create_finetuned_model(
    model,
    df,
    src_lang,
    tgt_lang,
    forward_tokenizer,
    forward_model_path # save_path
)

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/26496 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/6625 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.7497,0.647615
2,0.5058,0.456668
3,0.4023,0.374301
4,0.3131,0.31914
5,0.2479,0.288352
6,0.2096,0.263475
7,0.1694,0.247097
8,0.1421,0.241117
9,0.1185,0.238958
10,0.0967,0.237148


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameter

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


#### Finetune Jpn -> Ain

In [None]:
# Finetune Jpn -> Ain
# Mirror of the Ain -> Jpn cell: language arguments are swapped and the
# backward tokenizer / backward save directory are used.
# NOTE(review): `model` is the same object the Ain -> Jpn cell was given; if the
# helper mutates it, this run does not start from pretrained weights — confirm.
poysuwop.create_finetuned_model(
    model,
    df,
    tgt_lang,
    src_lang,
    backward_tokenizer,
    backward_model_path # save_path
)

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/26496 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/6625 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.5913,0.506247
2,1.6819,0.431265


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


#### Cyclic translate

In [None]:
# Load the two checkpoints saved by the fine-tuning cells (forward first, then backward)
forward_model, backward_model = (
    MBartForConditionalGeneration.from_pretrained(checkpoint_dir)
    for checkpoint_dir in (forward_model_path, backward_model_path)
)

# Cyclic translate: each direction gets its own model/tokenizer pair
df = poysuwop.cyclic_translate(
    df, src_lang,
    forward_model, forward_tokenizer,
    backward_model, backward_tokenizer,
)

# Save the result as a tab-separated file for the evaluation and merge cells
df.to_csv(
    f'{csv_prefix}_unidir_{src_lang}_{cycle}.txt',
    sep='\t',
    encoding='utf-8',
    index=False,
)

#### Evaluation

In [None]:
# Read back the tab-separated result written by the cyclic-translate cell
df = pd.read_csv(
    f'{csv_prefix}_unidir_{src_lang}_{cycle}.txt',
    sep='\t',
)

# Score it and report BLEU, TER and chrF, one per line
score = poysuwop.evaluate(df)
print(
    f"BLEU score: {score['BLEU']}",
    f"TER score: {score['TER']}",
    f"chrF score: {score['chrF']}",
    sep='\n',
)

BLEU score: 29.901222208801194
chrF score: 48.20237564963054
TER score: 58.661116837541606


#### Merge backtranslated text to original dataframe

In [6]:
# Reload backtranslated_ain.txt as dataframe
# `cycle` is set explicitly so this cell can be run without the cycle-1 setup cell.
cycle = 1

df = pd.read_csv(f'{csv_prefix}_unidir_{src_lang}_{cycle}.txt', sep='\t')

# Merge backtranslated text to original df
# `original_df` is the raw-corpus snapshot taken in the dataset cell; the
# merged frame replaces `df` and is what the next cycle fine-tunes on.
df = poysuwop.merge_backtranslated_text(df, original_df, cycle, src_lang)

#### Test

##### Ain -> Jpn

In [None]:
def tokenize_japanese(text):
    """Return `text` segmented by MeCab in wakati (space-separated) output mode."""
    return MeCab.Tagger("-Owakati").parse(text).strip()


# Sample pair: Ainu source sentence and its Japanese reference translation
sample_text = ["teeta okay aynu utar opitta kira wa isam.", "昔いた人たちはみんな逃げていなくなった。"]
source_text, gold_translation = sample_text

# Generate translation
# NOTE(review): `translate` is not defined in any visible cell — confirm where
# it comes from (it may be meant to be a poysuwop helper).
translated_text = translate(source_text, 'ain', 'jpn')

# Segment hypothesis and reference the same way before scoring
tokenized_translation = tokenize_japanese(translated_text)
tokenized_gold = tokenize_japanese(gold_translation)

# BLEU over the single segmented sentence pair
bleu = sacrebleu.corpus_bleu([tokenized_translation], [[tokenized_gold]])

print("Translated text:", translated_text)
print("BLEU score:", bleu.score)

Translated text: 昔のことたち、人間たちはみんな逃げてしまったのだ。
BLEU score: 25.748661016289674


In [None]:
# Spot check (Ain -> Jpn): print the model output; the reference translation is in the trailing "Gold" comment.
# NOTE(review): `translate` is not defined in any visible cell — confirm where it comes from.
source_text = "teeta okay aynu utar opitta kira wa isam."
print(translate(source_text, 'ain', 'jpn')) # Gold: 昔いた人たちはみんな逃げていなくなった。

昔の人間たちはみんな逃げてしまいました。


In [None]:
# Spot check (Ain -> Jpn): the "(lit.)" comment gives an alternative form of the source sentence, "Gold" the reference translation.
source_text = "kotan kor kamuy oka an ruwe kuþ nukar." #(lit.) kotan kor kamuy ku=nukar.
print(translate(source_text, 'ain', 'jpn')) # Gold: フクロウ/村神がいることを私は見た。

村の神様がいるのを見ていました。


In [None]:
# Spot check (Ain -> Jpn): same sentence as the previous cell but with "aþ" in place of "kuþ".
source_text = "kotan kor kamuy oka an ruwe aþ nukar." #(lit.) kotan kor kamuy a=nukar.
print(translate(source_text, 'ain', 'jpn')) # Gold: 我はフクロウ/村神がいることを見た。

村の神がいるのを私は見ていました。


In [None]:
# Spot check (Ain -> Jpn): print the model output; the reference translation is in the trailing "Gold" comment.
source_text = "kotan kor kamuy nukar ka somo ki."
print(translate(source_text, 'ain', 'jpn')) # Gold: (彼は) フクロウ/村神を見ていない。

村の神を見ることもありません。


In [None]:
# Spot check (Ain -> Jpn): print the model output; the reference translation is in the trailing "Gold" comment.
source_text = "pis ta okkaypo utar uwekarpa wa caranke kor an."
print(translate(source_text, 'ain', 'jpn')) # Gold: 浜辺で若者たちが集まって談判をしていました。

浜で若者たちが集まってきて談判をしていました。


In [None]:
# Spot check (Ain -> Jpn) on a longer sentence; "(lit.)" gives an alternative wording of the source, "Gold" the reference translation.
source_text = "sisam mosir un hosippa hi ora a=oyamokte itak patek ye yak aþ ye." #(lit.) panampe sisammosir un hosippa hi ora a=oyamokte oruspe patek ye yak a=ye.
print(translate(source_text, 'ain', 'jpn')) # Gold: (彼は) 和人のところから帰ってきた時から、おかしな話ばかり言っているそうだ。

和人の帰ってきたことを何度も言葉ばかりだと言いました。


##### Jpn -> Ain

In [None]:
# Sample pair, stored as [Ainu, Japanese]; here Japanese is the source and
# Ainu the reference translation
sample_text = ["teeta okay aynu utar opitta kira wa isam.", "昔いた人たちはみんな逃げていなくなった。"]
gold_translation, source_text = sample_text

# Generate translation
# NOTE(review): `translate` is not defined in any visible cell — confirm where
# it comes from (it may be meant to be a poysuwop helper).
translated_text = translate(source_text, 'jpn', 'ain')

# BLEU on the raw strings (no MeCab segmentation on the Ainu side)
bleu = sacrebleu.corpus_bleu([translated_text], [[gold_translation]])

print("Translated text:", translated_text)
print("BLEU score:", bleu.score)

Translated text: teeta kane oka utar opitta kira wa isam
BLEU score: 47.750342648354646


### 2: 2nd Cycle

#### Setup

In [7]:
# Second fine-tuning cycle of the source-only (unidir) experiment
cycle = 2

# Setup model path
# Same naming scheme as cycle 1; only the trailing cycle number changes.
forward_model_path = f'./{csv_prefix}-finetuned-{src_lang}-{tgt_lang}-unidir_{cycle}'
backward_model_path = f'./{csv_prefix}-finetuned-{tgt_lang}-{src_lang}-unidir_{cycle}'

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

#### Finetune Ain -> Jpn

In [None]:
# Finetune Ain -> Jpn
# `df` here is the frame produced by the cycle-1 merge cell (corpus plus
# back-translated text); the result is saved to the cycle-2 forward directory.
# NOTE(review): `model` is the same object every earlier fine-tuning cell
# received; if create_finetuned_model trains in place, this run does not start
# from the pretrained checkpoint — confirm.
poysuwop.create_finetuned_model(
    model,
    df,
    src_lang,
    tgt_lang,
    forward_tokenizer,
    forward_model_path # save_path
)

  self.pid = os.fork()


#### Finetune Jpn -> Ain

In [None]:
# Finetune Jpn -> Ain
# Mirror of the Ain -> Jpn cell with the language arguments swapped, the
# backward tokenizer, and the cycle-2 backward save directory.
# NOTE(review): shares `model` with the forward fine-tuning cell — confirm the
# helper does not carry weights over between the two runs.
poysuwop.create_finetuned_model(
    model,
    df,
    tgt_lang,
    src_lang,
    backward_tokenizer,
    backward_model_path # save_path
)

  self.pid = os.fork()


Epoch 1 completed.
Validation Loss: 0.859958898153526
Epoch 2 completed.
Validation Loss: 0.6078454085423026
Epoch 3 completed.
Validation Loss: 0.48875267675369666
Epoch 4 completed.
Validation Loss: 0.4167750763619437
Epoch 5 completed.
Validation Loss: 0.3916988589893942
Epoch 6 completed.
Validation Loss: 0.386222485972244


#### Cyclic translate

In [None]:
# Load the two cycle-2 checkpoints saved by the fine-tuning cells (forward first, then backward)
forward_model, backward_model = (
    MBartForConditionalGeneration.from_pretrained(checkpoint_dir)
    for checkpoint_dir in (forward_model_path, backward_model_path)
)

# Cyclic translate: each direction gets its own model/tokenizer pair
df = poysuwop.cyclic_translate(
    df, src_lang,
    forward_model, forward_tokenizer,
    backward_model, backward_tokenizer,
)

# Save the result as a tab-separated file for the evaluation and merge cells
df.to_csv(
    f'{csv_prefix}_unidir_{src_lang}_{cycle}.txt',
    sep='\t',
    encoding='utf-8',
    index=False,
)

#### Evaluation


In [None]:
# Read back the tab-separated result written by the cyclic-translate cell
df = pd.read_csv(
    f'{csv_prefix}_unidir_{src_lang}_{cycle}.txt',
    sep='\t',
)

# Score it and report BLEU, TER and chrF, one per line
score = poysuwop.evaluate(df)
print(
    f"BLEU score: {score['BLEU']}",
    f"TER score: {score['TER']}",
    f"chrF score: {score['chrF']}",
    sep='\n',
)

#### Merge backtranslated text to original dataframe

In [None]:
# Merge backtranslated text to original df
# Uses the `df` loaded by the evaluation cell above and the raw-corpus snapshot
# `original_df` from the dataset cell; the merged frame replaces `df`.
df = poysuwop.merge_backtranslated_text(df, original_df, cycle, src_lang)

### 3: Summary

In [None]:
print('Simple IBT: IBT only using source and target language')

# NOTE: this loop rebinds the notebook-level `cycle` and `df` on every
# iteration; cells that run afterwards see the values from the last iteration.
for cycle in range(3):
    print(f"Cycle {cycle}")

    # Pick the result file for this cycle.
    if cycle == 0:
        # Cycle 0 is the vanilla run, whose cell saves its output as
        # '<prefix>_backtranslated_<lang>_0.txt' with an explicit '\n'
        # terminator — no cell writes a '_unidir_' file for cycle 0.
        result_path = f'{csv_prefix}_backtranslated_{src_lang}_{cycle}.txt'
        read_options = {'lineterminator': '\n'}
    else:
        # Fine-tuned cycles are saved by the unidir cyclic-translate cells.
        result_path = f'{csv_prefix}_unidir_{src_lang}_{cycle}.txt'
        read_options = {}

    df = pd.read_csv(result_path, sep='\t', **read_options)

    # Evaluate
    score = poysuwop.evaluate(df)

    print(f"BLEU score: {score['BLEU']}")
    print(f"TER score: {score['TER']}")
    print(f"chrF score: {score['chrF']}")
    print('\n')

## 4: Source & Target language corpus augmentation

### 1: 1st Cycle / jpn -> eng -> jpn

In [None]:
cycle = 1

# Round-trip the Japanese side through Google Translate (jpn -> eng -> jpn per the section heading).
# NOTE(review): `df` is whatever frame was assigned last — the summary loop
# above rebinds it on every iteration — so confirm this is the corpus the
# round trip is meant to run on.
df_jej = poysuwop.cyclic_google_translate(df)

# Saved under a fixed 'google_' prefix; the "jpn" dataset cell reads this file back.
df_jej.to_csv(f'google_backtranslated_jpn_{cycle}.txt', index=False, encoding='utf-8', sep='\t')

### 2: 2nd Cycle

#### Dataset

##### ain

In [None]:
cycle = 1

# `original_df` is deliberately NOT re-derived from `df` here. By this point
# `df` has been rebound by earlier cells (merge results, evaluation reloads,
# the summary loop), so `df.copy()` would replace the raw-corpus snapshot taken
# in the dataset cell with whatever happened to run last.

# Load the cycle-1 Ainu round-trip output as a dataframe
df_ain = pd.read_csv(f'{csv_prefix}_unidir_{src_lang}_{cycle}.txt', sep='\t')

# Merge backtranslated text to original df
df_ain = poysuwop.merge_backtranslated_text(df_ain, original_df, cycle, 'ain')

# Keep only the Ainu side; the Japanese side is prepared separately in df_jpn
df_ain = df_ain.drop(columns=['jpn'])

# Sort rows by line number
df_ain = df_ain.sort_values(by=['no.'])

##### jpn

In [None]:
cycle = 1

# `original_df` is deliberately NOT re-derived from `df` here. By this point
# `df` has been rebound by earlier cells, so `df.copy()` would replace the
# raw-corpus snapshot taken in the dataset cell with whatever ran last.

# Load the Google round-trip output for the Japanese side as a dataframe
df_jpn = pd.read_csv(f'google_backtranslated_jpn_{cycle}.txt', sep='\t')

# Merge backtranslated text to original df
df_jpn = poysuwop.merge_backtranslated_text(df_jpn, original_df, cycle, 'jpn')

# Keep only the Japanese side: drop the Ainu text and the English pivot column
df_jpn = df_jpn.drop(columns=['ain', 'eng'])

# Sort rows by line number
df_jpn = df_jpn.sort_values(by=['no.'])

##### combine

In [None]:
# Pair every Ainu variant with every Japanese variant that shares a line
# number (outer join, so unmatched rows on either side are kept), then order
# the rows by line number and by the two cycle columns, all ascending.
df_combined = (
    df_ain
    .merge(df_jpn, on='no.', how='outer')
    .sort_values(['no.', 'src_backtranslated_cycles', 'tgt_backtranslated_cycles'])
)

# Running 1-based counter of the variants within each line number
df_combined['subno.'] = df_combined.groupby('no.').cumcount() + 1

#### Setup

In [None]:
# Cycle 2 of the source-and-target (bidir) augmentation experiment
cycle = 2

# Setup model path
# Same scheme as the unidir paths but tagged 'bidir', so the two experiments
# save their checkpoints to different directories.
forward_model_path = f'./{csv_prefix}-finetuned-{src_lang}-{tgt_lang}-bidir_{cycle}'
backward_model_path = f'./{csv_prefix}-finetuned-{tgt_lang}-{src_lang}-bidir_{cycle}'

#### Finetune Ain -> Jpn

In [None]:
# Finetune Ain -> Jpn
# Trains on `df_combined` (the Ainu and Japanese variants joined on line
# number) and saves to the bidir forward directory.
# NOTE(review): `model` is the same object the earlier fine-tuning cells
# received; if create_finetuned_model trains in place, this run does not start
# from the pretrained checkpoint — confirm.
poysuwop.create_finetuned_model(
    model,
    df_combined,
    src_lang,
    tgt_lang,
    forward_tokenizer,
    forward_model_path # save_path
)

#### Finetune Jpn -> Ain

In [None]:
# Finetune Jpn -> Ain
# Mirror of the Ain -> Jpn cell with the language arguments swapped, the
# backward tokenizer, and the bidir backward save directory.
# NOTE(review): shares `model` with the forward fine-tuning cell — confirm the
# helper does not carry weights over between the two runs.
poysuwop.create_finetuned_model(
    model,
    df_combined,
    tgt_lang,
    src_lang,
    backward_tokenizer,
    backward_model_path # save_path
)

#### Cyclic translate

In [None]:
# Load the two bidir checkpoints saved by the fine-tuning cells (forward first, then backward)
forward_model, backward_model = (
    MBartForConditionalGeneration.from_pretrained(checkpoint_dir)
    for checkpoint_dir in (forward_model_path, backward_model_path)
)

# Cyclic translate over the combined frame; the result is bound to `df`
df = poysuwop.cyclic_translate(
    df_combined, src_lang,
    forward_model, forward_tokenizer,
    backward_model, backward_tokenizer,
)

# Save the result as a tab-separated file for the evaluation cell
df.to_csv(
    f'{csv_prefix}_bidir_{src_lang}_{cycle}.txt',
    sep='\t',
    encoding='utf-8',
    index=False,
)

#### Evaluation

In [None]:
cycle = 2

# Read back the tab-separated bidir result written by the cyclic-translate cell
df = pd.read_csv(f'{csv_prefix}_bidir_{src_lang}_{cycle}.txt', sep='\t')

# Score it; kept under its own name so the comparison cell below, which
# reuses `score`, does not overwrite it
score_bidir = poysuwop.evaluate(df)

print(
    f"BLEU score: {score_bidir['BLEU']}",
    f"TER score: {score_bidir['TER']}",
    f"chrF score: {score_bidir['chrF']}",
    sep='\n',
)

BLEU score: 43.32152835601154
TER score: 48.7512681809805
chrF score: 58.26124745968379


##### Compare with Simple IBT

In [None]:
print('Simple IBT: IBT only using source and target language\n')

# NOTE: this loop rebinds the notebook-level `cycle` and `df` on every iteration.
for cycle in range(3):
    print(f"Cycle {cycle}")

    # Pick the result file for this cycle.
    if cycle == 0:
        # Cycle 0 is the vanilla run, whose cell saves its output as
        # '<prefix>_backtranslated_<lang>_0.txt' with an explicit '\n'
        # terminator — no cell writes a '_unidir_' file for cycle 0.
        result_path = f'{csv_prefix}_backtranslated_{src_lang}_{cycle}.txt'
        read_options = {'lineterminator': '\n'}
    else:
        # Fine-tuned cycles are saved by the unidir cyclic-translate cells.
        result_path = f'{csv_prefix}_unidir_{src_lang}_{cycle}.txt'
        read_options = {}

    df = pd.read_csv(result_path, sep='\t', **read_options)

    # Evaluate
    score = poysuwop.evaluate(df)

    print(f"BLEU score: {score['BLEU']}")
    print(f"TER score: {score['TER']}")
    print(f"chrF score: {score['chrF']}")

Simple IBT: IBT only using source and target language
Cycle 0
BLEU score: 11.688427818534283
TER score: 82.33352479719255
chrF score: 26.61296174853196


Cycle 1
BLEU score: 29.901222208801194
TER score: 58.661116837541606
chrF score: 48.20237564963054


Cycle 2
BLEU score: 52.76071606795754
TER score: 40.08166097114446
chrF score: 65.60754391897386


