<a href="https://colab.research.google.com/github/nonotoy/poysuwop/blob/main/02_Poysuwop_Cyclic_Translation_AJA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Poysuwop_Nr2 / Cyclic Translation Ain -> Jpn -> Ain

## MBart

### 0_Library install

In [None]:
!pip install transformers[torch] datasets sentencepiece sacremoses sacrebleu mecab-python3 unidic-lite
!pip install accelerate -U

In [2]:
# Library
import glob
import json
import re
import collections
import os
import sys

import pandas as pd
import torch
from transformers import (
    pipeline,
    MBartForConditionalGeneration,
    MBart50Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    RobertaTokenizerFast,
    GenerationConfig
)
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import sacrebleu
import MeCab

# Change Current Directory
os.chdir('/content/drive/MyDrive/Colab Notebooks/Poysuwop')

# Load preprocess module
from modules import ainPreprocess

### 1_Load Dataset

In [3]:
# Setup - model & tokenizer
model_name = 'facebook/mbart-large-50-many-to-many-mmt'
tokenizer = MBart50Tokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Load dataset
file_path = 'poysuwop_corpus.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

data = [line.strip().split('\t') for line in lines]

line_no, ain_txt, jpn_txt = zip(*[(parts[0], parts[1], parts[2]) for parts in data])

# Store to df
df = pd.DataFrame({
    'no.': line_no,
    'ain': ain_txt,
    'jpn': jpn_txt
})

dataset = Dataset.from_pandas(df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

### 2_Train translator

In [None]:
def train_translator(src_lang, tgt_lang, cycle=0)

    # Load tokenizer
    if src_lang == 'ain' and tgt_lang == 'jpn':
        tokenizer = RobertaTokenizerFast.from_pretrained("AinuBERTTokenizer")

    elif src_lang == 'jpn' and tgt_lang == 'ain':
        tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang='ja_XX')

    else:
        raise ValueError("src_lang and/or tgt_lang must be 'ain' or 'jpn'.")

    # Load model
    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Split train & test data
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    train_data = {
        "source": train_df[src_lang].tolist(),
        "target": train_df[tgt_lang].tolist()
    }

    eval_data = {
        "source": test_df[src_lang].tolist(),
        "target": test_df[tgt_lang].tolist()
    }

    # Convert to Dataset
    train_dataset = DatasetDict({'train': Dataset.from_dict(train_data)})
    eval_dataset = DatasetDict({'eval': Dataset.from_dict(eval_data)})

    tokenized_train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=["source", "target"])
    tokenized_eval_dataset = eval_dataset.map(tokenize, batched=True, remove_columns=["source", "target"])

    # Generation Config
    generation_config = GenerationConfig(
        max_length=200,
        early_stopping=True,
        num_beams=5,
        forced_eos_token_id=2
    )

    # Trainer Arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir = './results',
        evaluation_strategy = 'epoch',
        learning_rate = 2e-5,
        per_device_train_batch_size = 4,
        per_device_eval_batch_size = 4,
        weight_decay = 0.01,
        save_total_limit = 2,
        num_train_epochs = 100,
        gradient_accumulation_steps = 4,
        fp16 = True if torch.cuda.is_available() else False,
        load_best_model_at_end = True,
        metric_for_best_model = 'eval_loss',
        save_strategy = 'epoch'
    )

    # Early stopping callback
    early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

    # Trainer
    trainer = Seq2SeqTrainer(
        model = model,
        args=training_args,
        train_dataset=tokenized_train_dataset['train'],
        eval_dataset=tokenized_eval_dataset['eval'],
        tokenizer=tokenizer,
        callbacks=[early_stopping_callback],
        generation_config=generation_config
    )

    # Training
    trainer.train()

    # Evaluate
    trainer.evaluate()

    # Save finetuned model & tokenizer
    save_path = './{0}_{1}_finetuned_model_{2}'.format(src_lang, tgt_lang, cycle)
    model.save_pretrained(save_path)

    if src_lang == 'ain':
        tokenizer.save_pretrained(save_path)

    print("Model saved at:", save_path)
    print('------------------')


# Tokenize function
def tokenize(examples):
    inputs = examples['source']
    targets = examples['target']

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length", return_tensors="pt")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Train
train_translator('ain', 'jpn', cycle=0)
train_translator('jpn', 'ain', cycle=0)

### 3_Set up Translate formula

In [22]:
# Setup - model & tokenizer
model_aj = MBartForConditionalGeneration.from_pretrained("./ain_jpn_finetuned_model_0")
tokenizer_aj = RobertaTokenizerFast.from_pretrained("AinuBERTTokenizer")

model_ja = MBartForConditionalGeneration.from_pretrained("./jpn_ain_finetuned_model_0")
tokenizer_ja = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang='ja_XX')

# Setup - GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_aj.to(device)
model_ja.to(device)

def translate(text, src_lang, tgt_lang):

    if src_lang == 'ain':
        model = model_aj
        tokenizer = tokenizer_aj

    elif src_lang == 'jpn':
        model = model_ja
        tokenizer = tokenizer_ja

    else:
        raise ValueError("src_lang must be 'ain' or 'jpn'.")

    # Tokenize
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True).to(device)

    # Translate
    translated_tokens = model.generate(**inputs,
                                       max_length=128,
                                       num_beams=4,
                                       early_stopping=True,
                                       decoder_start_token_id=model.config.decoder_start_token_id
                                       )

    # Decode
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

    return translated_text

#### Test

##### Ain -> Jpn

In [23]:
# Sample text
sample_text = ["teeta okay aynu utar opitta kira wa isam.", "昔いた人たちはみんな逃げていなくなった。"]

# Sample source text and corresponding reference translation
source_text = sample_text[0]
gold_translation = sample_text[1]

# Generate translation
translated_text = translate(source_text, 'ain', 'jpn')

def tokenize_japanese(text):
    mecab = MeCab.Tagger("-Owakati")
    return mecab.parse(text).strip()

# Tokenize
tokenized_translation = tokenize_japanese(translated_text)
tokenized_gold = tokenize_japanese(gold_translation)

# BLEU
bleu = sacrebleu.corpus_bleu([tokenized_translation], [[tokenized_gold]])

print("Translated text:", translated_text)
print("BLEU score:", bleu.score)

Translated text: 昔のことたち、人間たちはみんな逃げてしまったのだ。
BLEU score: 25.748661016289674


In [None]:
source_text = "teeta okay aynu utar opitta kira wa isam."
print(translate(source_text, 'ain', 'jpn')) # Gold: 昔いた人たちはみんな逃げていなくなった。

昔の人間たちはみんな逃げてしまいました。


In [None]:
source_text = "kotan kor kamuy oka an ruwe kuþ nukar." #(lit.) kotan kor kamuy ku=nukar.
print(translate(source_text, 'ain', 'jpn')) # Gold: フクロウ/村神がいることを私は見た。

村の神様がいるのを見ていました。


In [None]:
source_text = "kotan kor kamuy oka an ruwe aþ nukar." #(lit.) kotan kor kamuy a=nukar.
print(translate(source_text, 'ain', 'jpn')) # Gold: 我はフクロウ/村神がいることを見た。

村の神がいるのを私は見ていました。


In [None]:
source_text = "kotan kor kamuy nukar ka somo ki."
print(translate(source_text, 'ain', 'jpn')) # Gold: (彼は) フクロウ/村神を見ていない。

村の神を見ることもありません。


In [None]:
source_text = "pis ta okkaypo utar uwekarpa wa caranke kor an."
print(translate(source_text, 'ain', 'jpn')) # Gold: 浜辺で若者たちが集まって談判をしていました。

浜で若者たちが集まってきて談判をしていました。


In [None]:
source_text = "sisam mosir un hosippa hi ora a=oyamokte itak patek ye yak aþ ye." #(lit.) panampe sisammosir un hosippa hi ora a=oyamokte oruspe patek ye yak a=ye.
print(translate(source_text, 'ain', 'jpn')) # Gold: (彼は) 和人のところから帰ってきた時から、おかしな話ばかり言っているそうだ。

和人の帰ってきたことを何度も言葉ばかりだと言いました。


##### Jpn -> Ain

In [24]:
# Sample text
sample_text = ["teeta okay aynu utar opitta kira wa isam.", "昔いた人たちはみんな逃げていなくなった。"]

# Sample source text and corresponding reference translation
source_text = sample_text[1]
gold_translation = sample_text[0]

# Generate translation
translated_text = translate(source_text, 'jpn', 'ain')

# BLEU
bleu = sacrebleu.corpus_bleu([translated_text], [[gold_translation]])

print("Translated text:", translated_text)
print("BLEU score:", bleu.score)

Translated text: teeta kane oka utar opitta kira wa isam
BLEU score: 47.750342648354646


### 4_Cyclic translate

In [None]:
# Run 3. Translate formula at first

# Language
src_lang = 'ain'
tgt_lang = 'jpn'

# Set lists from corpus dataframe
texts_no = df['no.'].tolist()
src_texts = df['ain'].tolist()

for i, original in enumerate(src_texts):

    # Translate from ain to jpn
    translated_text = translate(original, src_lang, tgt_lang)

    # Back translate from jpn to ain
    backtranslated_text = translate(translated_text, tgt_lang, src_lang)

    # add to df
    df.loc[i, 'translated_jpn'] = translated_text
    df.loc[i, 'backtranslated_ain'] = backtranslated_text

    if i % 5000 == 0 or i == len(src_texts):
        print(i, '/', len(src_texts))

df.to_csv('backtranslated_ain.txt', index=False, encoding='utf-8', sep='\t')

0 / 33121
100 / 33121
200 / 33121
300 / 33121
400 / 33121
500 / 33121
600 / 33121
700 / 33121
800 / 33121
900 / 33121
1000 / 33121
1100 / 33121
1200 / 33121
1300 / 33121
1400 / 33121
1500 / 33121
1600 / 33121
1700 / 33121
1800 / 33121
1900 / 33121
2000 / 33121
2100 / 33121
2200 / 33121
2300 / 33121
2400 / 33121
2500 / 33121
2600 / 33121
2700 / 33121
2800 / 33121
2900 / 33121
3000 / 33121
3100 / 33121
3200 / 33121
3300 / 33121
3400 / 33121
3500 / 33121
3600 / 33121
3700 / 33121
3800 / 33121
3900 / 33121
4000 / 33121
4100 / 33121
4200 / 33121
4300 / 33121
4400 / 33121
4500 / 33121
4600 / 33121
4700 / 33121
4800 / 33121
4900 / 33121
5000 / 33121
5100 / 33121
5200 / 33121
5300 / 33121
5400 / 33121
5500 / 33121
5600 / 33121
5700 / 33121
5800 / 33121
5900 / 33121
6000 / 33121
6100 / 33121
6200 / 33121
6300 / 33121
6400 / 33121
6500 / 33121
6600 / 33121
6700 / 33121
6800 / 33121
6900 / 33121
7000 / 33121
7100 / 33121
7200 / 33121
7300 / 33121
7400 / 33121
7500 / 33121
7600 / 33121
7700 / 3312

# Archive

### 1_Library

In [None]:
import os
import random
import math

import torch
import torch.nn as nn
import torch.optim as optim
from datasets import Dataset
import datasets
from torch.nn.utils.rnn import pad_sequence
import spacy
import numpy as np
from transformers import TFMT5ForConditionalGeneration

# Change Current Directory
os.chdir('/content/drive/MyDrive/Colab Notebooks/Poysuwop')

# Load preprocess module
#sys.path.append('/content/drive/MyDrive/Colab Notebooks/Poysuwop')
from modules import ainPreprocess

### 2_Tokenizers

In [None]:
from tokenizers import Tokenizer
from transformers import RobertaTokenizerFast, AutoTokenizer

ain_tokenizer = RobertaTokenizerFast.from_pretrained("AinuBERTTokenizer")
input_string = ainPreprocess.preprocess("ohonno somo unukar=an")
print(ain_tokenizer(input_string))

jpn_tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
# print(jpn_tokenizer("久しぶりだね"))

{'input_ids': [0, 2272, 394, 4100, 272, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/258k [00:00<?, ?B/s]

#### データ読み込み

In [None]:
from datasets import Dataset
import pandas as pd

# read txt file
df_ain = pd.read_csv("poysuwop_ain.txt", sep="\t", header=None)
df_jpn = pd.read_csv("poysuwop_jpn.txt", sep="\t", header=None)

# rename columns
df_ain.columns = ["ain"]
df_jpn.columns = ["jpn"]

def gen():
    for i in range(len(df_ain)):
        yield {
            'ain': df_ain["ain"][i],
            'jpn': df_jpn["jpn"][i]
        }

ds = Dataset.from_generator(gen)
for example in ds:
    print(example)

print(len(ds))

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
{'ain': 'okamkir', 'jpn': 'わざと'}
{'ain': 'sisam utari', 'jpn': '和人'}
{'ain': 'sisam', 'jpn': '和人'}
{'ain': 'hempara ne yakka raykoraci patek kþ an wa', 'jpn': 'いつでも、死ぬような思いばかりして暮らしているよ。'}
{'ain': 'kurmat', 'jpn': '和人の女'}
{'ain': 'iyoyra ruy', 'jpn': '物忘れが激しい、忘れっぽい'}
{'ain': 'oyra', 'jpn': '…を忘れる'}
{'ain': 'ponno', 'jpn': 'わずかに'}
{'ain': 'mimaraha', 'jpn': '…の余った残り'}
{'ain': 'hon utursam', 'jpn': '横腹'}
{'ain': 'taan topenpe usaraye yan', 'jpn': 'このお菓子を別々にしなさい。'}
{'ain': 'paykar an kor upas ru wa apkas ka icakkere', 'jpn': '春になると雪が解けて歩くのも汚い（？）。'}
{'ain': 'aa toan matkaci ramuan hawe', 'jpn': 'ああ、あの娘は利口だ。'}
{'ain': 'uosurpa', 'jpn': '離婚する'}
{'ain': 'kasi opiwki', 'jpn': '助けに行く'}
{'ain': 'tusunike', 'jpn': 'リス、キネズミ'}
{'ain': 'pirka', 'jpn': '立派だ'}
{'ain': 'na ponno enþ kore', 'jpn': 'もうちょっとください。'}
{'ain': 'kama or wakka aþ omare wa aþ sesekka akusu tane pa at na yanke yan', 'jpn': '鉄瓶に水を入れて沸かしたたら、もう湯気が立っているから、鉄瓶を（火から）あげなさい。'}
{'ain': 'kesukuran

#### no use - dataloader

In [None]:
import torch
from torch.utils.data.dataset import random_split
import torchdata.datapipes as dp
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torchtext.transforms as T
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset

def tokenize(text):
    # テキストを単語に分割する処理。この例ではスペースで分割。
    return text.split()

# データパイプラインの定義
datapipe_ain = dp.iter.FileOpener(['poysuwop_ain.txt'], mode='rt'). \
                   readlines(return_path=False). \
                   map(tokenize). \
                   flatten()

# データ読み込み、単語分割(日本語)
datapipe_jpn = dp.iter.FileOpener(['poysuwop_jpn.txt'], mode='rt'). \
                   readlines(return_path=False). \
                   map(tokenize). \
                   flatten()

# アイヌ語・日本語ペアに
datapipe = datapipe_ain.zip(datapipe_jpn)

# 単語辞書作成(アイヌ語)
ain_vocab = build_vocab_from_iterator(datapipe_ain, specials=('<unk>', '<pad>', '<s>', '</s>'))
ain_vocab.set_default_index(ain_vocab['<unk>'])

# 単語辞書作成(日本語)
jpn_vocab = build_vocab_from_iterator(datapipe_jpn, specials=('<unk>', '<pad>', '<s>', '</s>'))
jpn_vocab.set_default_index(jpn_vocab['<unk>'])

# transform生成
ain_transform = T.Sequential(
    T.VocabTransform(ain_vocab),
    T.AddToken(ain_vocab['<s>'], begin=True),
    T.AddToken(ain_vocab['</s>'], begin=False),
    T.ToTensor(padding_value=ain_vocab['<pad>'])
)

jpn_transform = T.Sequential(
    T.VocabTransform(jpn_vocab),
    T.AddToken(jpn_vocab['<s>'], begin=True),
    T.AddToken(jpn_vocab['</s>'], begin=False),
    T.ToTensor(padding_value=jpn_vocab['<pad>'])
)

# ミニバッチ時のデータ変換関数
def collate_batch(batch):
    ens = ain_transform([src for (src, trg) in batch])
    jas = jpn_transform([trg for (src, trg) in batch])
    return ens, jas

# mapに変換
ds = to_map_style_dataset(datapipe)
# DataLoader設定
# data_loader = DataLoader(ds, shuffle=True, batch_size=3, collate_fn=collate_batch)

# 元のデータセットのサイズを取得
dataset_size = len(ds)

# 訓練セットと検証セットのサイズを指定
train_size = int(dataset_size * 0.8)
val_size = dataset_size - train_size

# データセットをランダムに分割
train_dataset, val_dataset = random_split(ds, [train_size, val_size])

# 訓練用 DataLoader
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=3, collate_fn=collate_batch)

# 検証用 DataLoader
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=3, collate_fn=collate_batch)

In [None]:

import torch
from torch.utils.data.dataset import random_split

# アイヌ語・日本語ペアに
datapipe = datapipe_ain.zip(datapipe_jpn)

# 単語辞書作成(アイヌ語)
ain_vocab = build_vocab_from_iterator(datapipe_ain, specials=('<unk>', '<pad>', '<s>', '</s>'))
ain_vocab.set_default_index(ain_vocab['<unk>'])

# 単語辞書作成(日本語)
jpn_vocab = build_vocab_from_iterator(datapipe_jpn, specials=('<unk>', '<pad>', '<s>', '</s>'))
jpn_vocab.set_default_index(jpn_vocab['<unk>'])

# transform生成
ain_transform = T.Sequential(
    T.VocabTransform(ain_vocab),
    T.AddToken(ain_vocab['<s>'], begin=True),
    T.AddToken(ain_vocab['</s>'], begin=False),
    T.ToTensor(padding_value=ain_vocab['<pad>'])
)

jpn_transform = T.Sequential(
    T.VocabTransform(jpn_vocab),
    T.AddToken(jpn_vocab['<s>'], begin=True),
    T.AddToken(jpn_vocab['</s>'], begin=False),
    T.ToTensor(padding_value=jpn_vocab['<pad>'])
)

# ミニバッチ時のデータ変換関数
def collate_batch(batch):
    ens = ain_transform([src for (src, trg) in batch])
    jas = jpn_transform([trg for (src, trg) in batch])
    return ens, jas

# mapに変換
ds = to_map_style_dataset(datapipe)
# DataLoader設定
# data_loader = DataLoader(ds, shuffle=True, batch_size=3, collate_fn=collate_batch)

# 元のデータセットのサイズを取得
dataset_size = len(ds)

# 訓練セットと検証セットのサイズを指定
train_size = int(dataset_size * 0.8)
val_size = dataset_size - train_size

# データセットをランダムに分割
train_dataset, val_dataset = random_split(ds, [train_size, val_size])

# 訓練用 DataLoader
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=3, collate_fn=collate_batch)

# 検証用 DataLoader
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=3, collate_fn=collate_batch)

In [None]:
"""
.get_stoi()
Returns:
    Dictionary mapping tokens to indices.
"""

ain_vocab.get_stoi()

{'yupta': 2849,
 'yupnatara': 2848,
 'yokohama': 2844,
 'yen': 2843,
 'yaysamne': 2836,
 'yayramsitne': 2835,
 'yayramnuyna': 2834,
 'yaynumwen': 2832,
 'yaykosiramsuypa': 2831,
 'yaykomismu': 2830,
 'yaykane': 2829,
 'yayeinukuri': 2825,
 'yayciskar': 2824,
 'yaycire': 2823,
 'yayasis': 2822,
 'yaunu': 2819,
 'yatupoknuma': 2817,
 'yasa': 2815,
 'yas': 2814,
 'yar': 2812,
 'yaoskep': 2811,
 'yamsu': 2810,
 'yamni': 2809,
 'yakayaka': 2807,
 'yaka': 2806,
 'yaetaye': 2805,
 'yacipocipoci': 2804,
 'y': 2803,
 'wose': 2802,
 'woruncikap': 2801,
 'wo': 2800,
 'weysampe': 2799,
 'wenpurikor': 2798,
 'wenkur': 2796,
 'wenipokas': 2795,
 'wempe': 2794,
 'wakasa': 2792,
 'uwotutanpa': 2791,
 'uwepakita': 2786,
 'uwenucaktek': 2785,
 'yuptek': 2850,
 'uwenewsar': 2784,
 'uwekari': 2783,
 'uweerepak': 2782,
 'uwa': 2781,
 'utuyaskarap': 2780,
 'uturunpe': 2779,
 'utursam': 2778,
 'utura': 2776,
 'utuman': 2774,
 'utka': 2771,
 'utekanpa': 2770,
 'utaspa': 2767,
 'utarihi': 2765,
 'utare': 2764,

In [None]:
jpn_vocab.get_stoi()

{'～を～にひっかける': 4510,
 '～を（家に）入れる;～を（場所）に入れる': 4505,
 '～を飲む;酒を飲む': 4502,
 '～を隠す': 4499,
 '～を陸に上げる': 4498,
 '～を選ぶ': 4495,
 '～を連れて行く': 4494,
 '～を負ぶう': 4484,
 '～を読む': 4483,
 '～を見失う、～がみつからない': 4481,
 '～を置く;～を置く;～を置く': 4476,
 '～を縛る': 4475,
 '～を締める': 4473,
 '～を終える': 4472,
 '～を立てる': 4470,
 '～を突く': 4468,
 '～を穴の中に入れる（？）': 4465,
 '～を研ぐ': 4463,
 '～を煮る': 4458,
 '～を沸かす;～を暖める': 4455,
 '～を沈める': 4454,
 '～を殺す;～を殺す': 4451,
 '～を止める': 4450,
 '～を欠く': 4449,
 '～を書く': 4448,
 '～を数える;': 4446,
 '～を支払う、～を行かせる、～をやる': 4445,
 '～を掻く': 4444,
 '～を捨てる': 4441,
 '～を捕らえる': 4440,
 '～を拾い集める;～を取る': 4437,
 '～を拭く': 4436,
 '～を投げる': 4434,
 '～を打つ': 4433,
 '～を憐れむ': 4432,
 '～を思う': 4430,
 '～を心配する、～を大事にする': 4427,
 '～を射る;（矢を）射る': 4422,
 '～を射る': 4421,
 '～を始める': 4418,
 '～を壊す': 4416,
 '～を回す': 4415,
 '～を噛む': 4414,
 '～を呼ぶ;～を招待する': 4413,
 '～を取る': 4411,
 '～を取り逃がす、～に遅れる;遅い': 4410,
 '～を切る（複）;～を切る（単）': 4407,
 '～を分かちあう;～を分ける': 4405,
 '～を出す': 4404,
 '～を嫌がる': 4419,
 '～を伸ばす': 4400,
 '～を乾かす': 4399,
 '～を不思議に思う': 4397,
 '～をゆるめる': 4396,
 '～をもらう': 4395,
 '

### Vanilla Transformer Model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Embedding, Dropout, LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Load the tokenizers
source_tokenizer = RobertaTokenizerFast.from_pretrained("AinuBERTTokenizer")
target_tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")

import pandas as pd
from sklearn.model_selection import train_test_split

# read txt file
df_ain = pd.read_csv("poysuwop_ain.txt", sep="\t", header=None)
df_jpn = pd.read_csv("poysuwop_jpn.txt", sep="\t", header=None)

# rename columns
df_ain.columns = ["ain"]
df_jpn.columns = ["jpn"]

# combine dataframes
df = pd.concat([df_ain, df_jpn], axis=1)

# split data into train and validation sets
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

# ハイパーパラメータの設定
num_layers = 6
d_model = 25 #128
dff = 1024 #512
num_heads = 16 #8
dropout_rate = 0.1
batch_size = 32
max_seq_length = 100
source_vocab_size = source_tokenizer.vocab_size
target_vocab_size = target_tokenizer.vocab_size

# create generator functions for train and validation data
def train_dataset():
    for _, row in train_data.iterrows():
        decoder_input = target_tokenizer.encode(row['jpn'], max_length=max_seq_length, padding='max_length', truncation=True)
        target_sequence = decoder_input[1:]
        target_sequence.append(0)  # パディングトークン
        target_one_hot = tf.keras.utils.to_categorical(target_sequence, num_classes=target_vocab_size)
        yield {
            'ain': source_tokenizer.encode(row['ain'], max_length=max_seq_length, padding='max_length', truncation=True),
            'jpn': decoder_input,
            'target': target_one_hot
        }

def val_dataset():
    for _, row in val_data.iterrows():
        decoder_input = target_tokenizer.encode(row['jpn'], max_length=max_seq_length, padding='max_length', truncation=True)
        target_sequence = decoder_input[1:]
        target_sequence.append(0)  # パディングトークン
        target_one_hot = tf.keras.utils.to_categorical(target_sequence, num_classes=target_vocab_size)
        yield {
            'ain': source_tokenizer.encode(row['ain'], max_length=max_seq_length, padding='max_length', truncation=True),
            'jpn': decoder_input,
            'target': target_one_hot
        }

# データセットの作成
train_dataset = tf.data.Dataset.from_generator(
    train_dataset,
    output_signature={
        'ain': tf.TensorSpec(shape=(max_seq_length,), dtype=tf.int32),
        'jpn': tf.TensorSpec(shape=(max_seq_length,), dtype=tf.int32),
        'target': tf.TensorSpec(shape=(max_seq_length, target_vocab_size), dtype=tf.float32)
    }
)

val_dataset = tf.data.Dataset.from_generator(
    val_dataset,
    output_signature={
        'ain': tf.TensorSpec(shape=(max_seq_length,), dtype=tf.int32),
        'jpn': tf.TensorSpec(shape=(max_seq_length,), dtype=tf.int32),
        'target': tf.TensorSpec(shape=(max_seq_length, target_vocab_size), dtype=tf.float32)
    }
)

train_dataset = train_dataset.batch(batch_size)
val_dataset = val_dataset.batch(batch_size)

# Transformerのエンコーダーレイヤー
def transformer_encoder_layer(inputs, d_model, num_heads, dff, rate=0.1):
    attention_output = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(inputs, inputs)
    attention_output = Dropout(rate)(attention_output)
    out1 = LayerNormalization(epsilon=1e-6)(inputs + attention_output)

    ffn_output = Dense(dff, activation='relu')(out1)
    ffn_output = Dense(d_model)(ffn_output)
    ffn_output = Dropout(rate)(ffn_output)
    out2 = LayerNormalization(epsilon=1e-6)(out1 + ffn_output)

    return out2

def transformer_decoder_layer(inputs, enc_output, d_model, num_heads, dff, rate=0.1):
    attention_output1 = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(inputs, inputs)
    attention_output1 = Dropout(rate)(attention_output1)
    out1 = LayerNormalization(epsilon=1e-6)(inputs + attention_output1)

    attention_output2 = MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(out1, enc_output)
    attention_output2 = Dropout(rate)(attention_output2)
    out2 = LayerNormalization(epsilon=1e-6)(out1 + attention_output2)

    ffn_output = Dense(dff, activation='relu')(out2)
    ffn_output = Dense(d_model)(ffn_output)
    ffn_output = Dropout(rate)(ffn_output)
    out3 = LayerNormalization(epsilon=1e-6)(out2 + ffn_output)

    return out3

# Transformerモデルの構築
def build_transformer_model(source_vocab_size, target_vocab_size, num_layers, d_model, num_heads, dff, max_seq_length, rate=0.1):
    inputs = Input(shape=(max_seq_length,), name='ain')
    dec_inputs = Input(shape=(max_seq_length,), name='jpn')

    enc_embedding = Embedding(source_vocab_size, d_model)(inputs)
    enc_outputs = enc_embedding

    for _ in range(num_layers):
        enc_outputs = transformer_encoder_layer(enc_outputs, d_model, num_heads, dff, rate)

    dec_embedding = Embedding(target_vocab_size, d_model)(dec_inputs)
    dec_outputs = dec_embedding

    for _ in range(num_layers):
        dec_outputs = transformer_decoder_layer(dec_outputs, enc_outputs, d_model, num_heads, dff, rate)

    outputs = Dense(target_vocab_size, activation='softmax')(dec_outputs)

    model = Model([inputs, dec_inputs], outputs, name='transformer')

    return model

# モデルのコンパイル
model = build_transformer_model(source_vocab_size, target_vocab_size, num_layers, d_model, num_heads, dff, max_seq_length, dropout_rate)
model.compile(optimizer=Adam(beta_1=0.9, beta_2=0.98, epsilon=1e-9), loss='categorical_crossentropy', metrics=['accuracy'])

# モデルの訓練
model.fit(train_dataset.map(lambda x: ({"ain": x["ain"], "jpn": x["jpn"]}, x["target"])),
          epochs=20,
          validation_data=val_dataset.map(lambda x: ({"ain": x["ain"], "jpn": x["jpn"]}, x["target"])))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7dc9280ecd90>

In [None]:
# モデルの保存
save_path = "./translation_model"
tf.keras.models.save_model(model, save_path)

In [None]:
# モデルのロード
loaded_model = tf.keras.models.load_model(save_path)

In [None]:
# テスト用の入力文
test_sentence = "ohonno somo unukar þan"
# test_sentence = "Itak=as aynu itak anu?"

# 入力文をトークン化
test_input = source_tokenizer.encode(test_sentence, max_length=max_seq_length, padding='max_length', truncation=True)
test_input = np.expand_dims(test_input, axis=0)

# デコーダーの初期入力
decoder_input = target_tokenizer.encode("", max_length=max_seq_length, padding='max_length', truncation=True)
output = tf.expand_dims(decoder_input, 0)

# 翻訳ループ
for i in range(max_seq_length):
    predictions = loaded_model([test_input, output[:, :max_seq_length]], training=False)
    predictions = predictions[:, i, :]
    predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

    if predicted_id == target_tokenizer.eos_token_id:
        break

    output = tf.concat([output, tf.reshape(predicted_id, (1, 1))], axis=-1)

# トークンを文字列に変換
predicted_sentence = target_tokenizer.decode(tf.squeeze(output[:, :max_seq_length], axis=0).numpy())
print("Input: ", test_sentence)
print("Translation: ", predicted_sentence)

Input:  ohonno somo unukar þan
Translation:  [CLS] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


### MT5 Fine-tuning

In [None]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Trainer, TrainingArguments

model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")

In [None]:
train_test_split = ds.train_test_split(test_size=0.2, seed=42)

# 分割されたデータセットを取得
train_dataset = train_test_split["train"]
validation_dataset = train_test_split["test"]

# データセットを結合
dataset = datasets.DatasetDict({"train": train_dataset, "validation": validation_dataset})

# トークナイズ関数を定義
def tokenize_function(examples):
    inputs = examples["ain"]
    targets = examples["jpn"]
    model_inputs = ain_tokenizer(inputs, max_length=128, padding="max_length", truncation=True)
    labels = jpn_tokenizer(targets, max_length=128, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Arguments
training_args = TrainingArguments(
    output_dir="./mt5_results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialise Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Train
trainer.train()



Map:   0%|          | 0/4038 [00:00<?, ? examples/s]

Map:   0%|          | 0/1010 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,35.3891,16.195009
2,11.7136,4.856948
3,5.5575,3.505664


TrainOutput(global_step=1515, training_loss=17.429033008739108, metrics={'train_runtime': 1699.9015, 'train_samples_per_second': 7.126, 'train_steps_per_second': 0.891, 'total_flos': 3631311209889792.0, 'train_loss': 17.429033008739108, 'epoch': 3.0})

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def translate(text):
    input_ids = ain_tokenizer(text, return_tensors="pt", max_length=128, padding="max_length", truncation=True)
    input_ids = input_ids.to(device)  # 入力テンソルをモデルと同じデバイスに移動
    output = model.generate(**input_ids, max_length=128, num_beams=4, early_stopping=True)
    translated_text = jpn_tokenizer.decode(output[0], skip_special_tokens=True)
    return translated_text

input_text = "ohonno somo unukar þan"
translated_text = translate(input_text)
print("Input:", input_text)
print("Translation:", translated_text)

Input: ohonno somo unukar þan
Translation: [UNK] [UNK] [UNK] [UNK]


In [None]:
output_dir = '/content/drive/MyDrive'
model.save_pretrained(output_dir)

In [None]:
dataset = load_dataset("csv", data_files="train.csv")
dataset = dataset["train"].shuffle(seed=42)

def preprocess_function(examples):
    padding = "max_length"
    max_length = 200

    inputs = [ex for ex in examples["Text"]]
    targets = [ex for ex in examples["Expected"]]
    model_inputs = tokenizer(inputs, max_length=max_length, padding=padding, truncation=True)
    labels = tokenizer(targets, max_length=max_length, padding=padding, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
train_dataset = dataset.map(preprocess_function, batched=True, desc="Running tokenizer")

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=tokenizer.pad_token_id,
    pad_to_multiple_of=64,
    return_tensors="np")

tf_train_dataset = model.prepare_tf_dataset(
    train_dataset,
    collate_fn=data_collator,
    batch_size=8,
    shuffle=True)

In [None]:
model.compile(optimizer=Adam(3e-5))
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
model.fit(tf_train_dataset, epochs=10, callbacks=[early_stopping])