# Poysuwop_Nr1 / Preprocess & Tokenizer
Preprocess Ainu texts and create the bilingual (ain/jpn) corpus

## 1_Preprocess

### 0. Library install

In [1]:
# Library
import glob
import json
import re
import collections
import os
import sys
import unicodedata
import pprint

# Change Current Directory
os.chdir('/content/drive/MyDrive/Colab Notebooks/Poysuwop')

# Load preprocess module
from modules import ainPreprocess

#### Test preprocess module

In [2]:
# -- ex. --
sentence = 'an=an'
print(sentence)
print(ainPreprocess.preprocess(sentence))

# -- ex. --
sentence = 'ohonno somo unukar=an'
print(sentence)
print(ainPreprocess.preprocess(sentence))

# -- ex. --
# sentence = 'ク・ネㇷ゚キ　ヒ　カ　エン・ココパンパ　コㇿカ　ネ　ワ　アンペ　カ　ケシㇼキラㇷ゚　ワ　ク・ネㇷ゚キ　ルスイ　ワ　ケシㇼキラㇷ゚　ワ　クス　（ク・ネㇷ゚キ）　チセ　ソイ　ペカ　クネㇷ゚キ　コㇿ　カン。'
# sentence = 'ペウレクﾙ　ネ　コﾛカ　ア・アイヌコﾛ'
sentence = '“Shirokanipe ranran pishkan, konkanipe ranran pishkan.” arian rekpo chiki kane petesoro sapash aine, ainukotan enkashike chikush kor shichorpokun inkarash ko teeta wenkur tane nishpa ne, teeta nishpa tane wenkur ne kotom shiran.'
print(sentence)
print(ainPreprocess.preprocess(sentence))

# -- ex. --
sentence = 'vv yayán ainu ku=ne ruwe ne korka, ainu itak ani Transformer[123] ku=kor_ rushuy un!'
print(sentence)
print(ainPreprocess.preprocess(sentence))

an=an
an þan
ohonno somo unukar=an
ohonno somo unukar þan
“Shirokanipe ranran pishkan, konkanipe ranran pishkan.” arian rekpo chiki kane petesoro sapash aine, ainukotan enkashike chikush kor shichorpokun inkarash ko teeta wenkur tane nishpa ne, teeta nishpa tane wenkur ne kotom shiran.
sirokanipe ranran piskan konkanipe ranran piskan arian rekpo ciki kane petesoro sapas aine ainukotan enkasike cikus kor sicorpokun inkaras ko teeta wenkur tane nispa ne teeta nispa tane wenkur ne kotom siran
vv yayán ainu ku=ne ruwe ne korka, ainu itak ani Transformer[123] ku=kor_ rushuy un!
yayan ainu kuþ ne ruwe ne korka ainu itak ani transformer kuþ kor rusuy un


### Load Dataset

In [7]:
# Fetch sentence using the 'ain' tag from .json file
# ['ain']       romanized Ainu sentence
# ['jpn']       Japanese sentence

def alphabet_check(text):
    return all(char.isascii() and (char.isalpha() or char.isspace() or char in ["þ", "'", "=", ",", ".", "?", "!",";"]) for char in text)

# For a parallel corpus for ain-jpn translatiom model training
all_corpus = []

aj_parallel_corpus = []
aj_ain_texts = []
aj_jpn_texts = []
aj_ain_raw_texts = []

# For a parallel corpus for jpn-eng fine-tuning
je_parallel_corpus = []

verbalslip_pattern = re.compile(r'(^| [a-zA-Z]*)(\.|x|X){2,}')
jpn_end_pattern = re.compile(r'[。？！]$')

id = 0
cnt = 0

for file_path in glob.glob('/content/drive/MyDrive/Colab Notebooks/Poysuwop/corpus_json/*.json'):
    print('------')
    print(file_path)

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for key, value in data.items():

        if key not in {'code', 'title'}:
            if isinstance(value, dict):
                ain_sentence = value.get('ain', '')
                jpn_sentence = value.get('jpn', '')
                eng_sentence = value.get('eng', '')
            else:
                # Handle cases where value is not a dictionary
                print(f"Skipping non-dictionary value for key: {key}")
                continue # Skip to the next iteration

            # Prepare ain-jpn dataset for ain-jpn translation model training
            if ain_sentence and jpn_sentence:

                # Verbal slip: Words with slips should be removed, since it may produce non-existing words
                # This procedure should be omitted from input to Transformer model in order not to create a non-sentence
                if verbalslip_pattern.search(ain_sentence):
                    ain_sentence = re.sub(verbalslip_pattern, ' ', ain_sentence).strip()

                new_ain_sentence = ainPreprocess.preprocess(ain_sentence)

                # Unique ID
                row_id = str(id).zfill(8)

                # Prepare jpn-eng dataset for fine-tuning jpn-eng translation model
                if eng_sentence and jpn_end_pattern.search(jpn_sentence):

                    all_corpus.append(f"{row_id}\t{new_ain_sentence}\t{jpn_sentence}\t{eng_sentence}")
                    #je_parallel_corpus.append(f"{jpn_sentence}\t{eng_sentence}")

                    print(f"{row_id}\t{new_ain_sentence}\t{jpn_sentence}\t{eng_sentence}")
                    cnt += 1

                else:

                    all_corpus.append(f"{row_id}\t{new_ain_sentence}\t{jpn_sentence}")

                #aj_parallel_corpus.append(f"{new_ain_sentence}\t{jpn_sentence}")
                #aj_ain_texts.append(new_ain_sentence)
                #aj_jpn_texts.append(jpn_sentence)
                #aj_ain_raw_texts.append(ain_sentence)

                id += 1

with open('poysuwop_corpus.txt', 'w', encoding='utf-8') as file:
    for text in all_corpus:
        file.write(text + '\n')

------
/content/drive/MyDrive/Colab Notebooks/Poysuwop/corpus_json/AinCps003_TUFS_AA_10.json
------
/content/drive/MyDrive/Colab Notebooks/Poysuwop/corpus_json/AinCps003_TUFS_AA_32.json
------
/content/drive/MyDrive/Colab Notebooks/Poysuwop/corpus_json/AinCps003_TUFS_AA_01.json
------
/content/drive/MyDrive/Colab Notebooks/Poysuwop/corpus_json/AinCps003_TUFS_AA_24.json
------
/content/drive/MyDrive/Colab Notebooks/Poysuwop/corpus_json/AinCps003_TUFS_AA_18.json
------
/content/drive/MyDrive/Colab Notebooks/Poysuwop/corpus_json/AinCps003_TUFS_AA_07.json
------
/content/drive/MyDrive/Colab Notebooks/Poysuwop/corpus_json/AinCps003_TUFS_AA_26.json
------
/content/drive/MyDrive/Colab Notebooks/Poysuwop/corpus_json/AinCps003_TUFS_AA_22.json
------
/content/drive/MyDrive/Colab Notebooks/Poysuwop/corpus_json/AinCps003_TUFS_AA_31.json
------
/content/drive/MyDrive/Colab Notebooks/Poysuwop/corpus_json/AinCps003_TUFS_AA_19.json
------
/content/drive/MyDrive/Colab Notebooks/Poysuwop/corpus_json/Ain

In [None]:
'''
with open('poysuwop_aj_corpus.txt', 'w', encoding='utf-8') as file:
    for text in aj_parallel_corpus:
        file.write(text + '\n')

with open('poysuwop_ain.txt', 'w', encoding='utf-8') as file:
    for text in aj_ain_texts:
        file.write(text + '\n')

with open('poysuwop_jpn.txt', 'w', encoding='utf-8') as file:
    for text in aj_jpn_texts:
        file.write(text + '\n')

with open('poysuwop_je_corpus.txt', 'w', encoding='utf-8') as file:
    for text in je_parallel_corpus:
        file.write(text + '\n')
'''

In [None]:
def check_TBRcharacters(texts):

    ##### Check charcters used in the dataset
    charlist = [char for text in texts for char in list(text)]
    charlist = [k for k, v in collections.Counter(charlist).items() if v > 1]
    charlist_TBR = ''.join(str(x) for x in charlist)
    char_TBR = re.sub(r"[a-zÞA-Z0-9 =]","",charlist_TBR)
    print("Charcaters to be removed: {0}".format(char_TBR))

# check characters to be removed
check_TBRcharacters(aj_ain_raw_texts)

#### Dataset Check

In [None]:
# -- ex. --
print(aj_parallel_corpus[1])
#print(preprocessed_texts[1])

#### Affix Marker Check

In [None]:
# 人称接辞マーカー = の両端に文字が残っているものの抽出
for i in range(len(texts)):
    if re.search(r'\S+=\S+', texts[i]):
        print(texts[i])

In [None]:
for i in range(len(texts)):
    if re.search(r'[lqvx]', texts[i]):
        print(texts[i])

## 2_Tokenizer

In [None]:
! pip install transformers tokenizers
! pip install git+https://github.com/huggingface/transformers.git@main accelerate

In [None]:
from tokenizers import ByteLevelBPETokenizer

paths = ["poysuwop_ain.txt"]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Customize training
tokenizer.train(files=paths, vocab_size=30_522, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

tokenizer.save_model("AinuBERTTokenizer")

#### Check Tokenizer

In [None]:
from transformers import RobertaTokenizerFast
tokenizer = RobertaTokenizerFast.from_pretrained("AinuBERTTokenizer")

sentence = "ohonno somo unukar=an"
sentence = ainPreprocess.preprocess(sentence)
tokenizer(sentence, padding='max_length', max_length=512)

In [None]:
from collections import Counter

texts = [sentence]  # トークナイズするテキストのリスト
tokenized_outputs = [tokenizer(text) for text in texts]  # 各テキストをトークナイズして結果をリストに格納

all_input_ids = []
for output in tokenized_outputs:
    all_input_ids.extend(output['input_ids'])  # 各トークナイゼーション結果からinput_idsを結合

print(all_input_ids)  # 結合されたinput_idsのリストを表示