# Poysuwop_Nr1 / Preprocess
Preprocess Ainu texts and create the bilingual (ain/jpn) corpus

### 0. Library import and setup

In [None]:
# Library
import glob
import json
import re
import collections
import os
import sys

# Work from the project folder on the mounted Google Drive, so that the
# relative import below and the relative output file names used later in
# this notebook resolve against it.
os.chdir('/content/drive/MyDrive/Colab Notebooks/Poysuwop')

# Load the project's preprocessing module (modules/ainPreprocess).
# The sys.path tweak is kept disabled; presumably the chdir above already
# makes the `modules` package importable -- confirm.
#sys.path.append('/content/drive/MyDrive/Colab Notebooks/Poysuwop')
from modules import ainPreprocess

#### Test preprocess module

In [None]:


# Quick smoke test of ainPreprocess.preprocess on a few sample sentences.
# Alternative kana samples, kept disabled:
# sentence = 'ク・ネㇷ゚キ　ヒ　カ　エン・ココパンパ　コㇿカ　ネ　ワ　アンペ　カ　ケシㇼキラㇷ゚　ワ　ク・ネㇷ゚キ　ルスイ　ワ　ケシㇼキラㇷ゚　ワ　クス　（ク・ネㇷ゚キ）　チセ　ソイ　ペカ　クネㇷ゚キ　コㇿ　カン。'
# sentence = 'ペウレクﾙ　ネ　コﾛカ　ア・アイヌコﾛ'
sample_sentences = (
    'an=an',
    'ohonno somo unukar=an',
    '“Shirokanipe ranran pishkan, konkanipe ranran pishkan.” arian rekpo chiki kane petesoro sapash aine, ainukotan enkashike chikush kor shichorpokun inkarash ko teeta wenkur tane nishpa ne, teeta nishpa tane wenkur ne kotom shiran.',
    'vv yayán ainu ku=ne ruwe ne korka, ainu itak ani Transformer[123] ku=kor_ rushuy un!',
)

# Print each input followed by its preprocessed form.
for sentence in sample_sentences:
    print(sentence)
    print(ainPreprocess.preprocess(sentence))

an=an
an þan
ohonno somo unukar=an
ohonno somo unukar þan
“Shirokanipe ranran pishkan, konkanipe ranran pishkan.” arian rekpo chiki kane petesoro sapash aine, ainukotan enkashike chikush kor shichorpokun inkarash ko teeta wenkur tane nishpa ne, teeta nishpa tane wenkur ne kotom shiran.
sirokanipe ranran piskan konkanipe ranran piskan arian rekpo ciki kane petesoro sapas aine ainukotan enkasike cikus kor sicorpokun inkaras ko teeta wenkur tane nispa ne teeta nispa tane wenkur ne kotom siran
vv yayán ainu ku=ne ruwe ne korka, ainu itak ani Transformer[123] ku=kor_ rushuy un!
yayan ainu kuþ ne ruwe ne korka ainu itak ani transformer kuþ kor rusuy un


### Load Dataset

In [None]:
# Fetch sentence using the 'ain' tag from .json file
# ['ain']       romanized Ainu sentence
# ['ain-kana']  Ainu sentence transcribed in Ainu Kana
# ['jpn']       Japanese sentence

def alphabetCheck(text):
    """Return True when every character of *text* is ASCII.

    NOTE: an earlier form of this check ended in ``... or 'þ' or "'"``.
    Bare string literals are always truthy, so the letter/whitespace test
    never took effect and only non-ASCII characters (kana, accented
    letters, 'þ') were ever rejected. That effective contract is kept and
    written out directly, so sentences containing '=', digits or ASCII
    punctuation are still accepted.

    Args:
        text: Sentence to inspect.

    Returns:
        bool: True if *text* is empty or consists only of ASCII characters,
        False as soon as a non-ASCII character is found.
    """
    return all(char.isascii() for char in text)

# Parallel lists, appended to in lockstep (index i refers to the same entry
# in all four):
#   corpus_texts            "<preprocessed Ainu>\t<Japanese>"
#   ain_raw_texts           Ainu sentence before ainPreprocess.preprocess
#                           (verbal slips already removed)
#   ain_preprocessed_texts  Ainu sentence after ainPreprocess.preprocess
#   jpn_texts               Japanese sentence
corpus_texts = []
ain_raw_texts = []
ain_preprocessed_texts = []
jpn_texts = []

# Verbal slip: a run of two or more '.', 'x' or 'X', together with the
# letters directly in front of it (at sentence start or after a space).
verbalslip_pattern = r'(^| [a-zA-Z]*)(\.|x|X){2,}'

for f in glob.glob('/content/drive/MyDrive/Colab Notebooks/Poysuwop/corpus_json/*.json'):

    # Read explicitly as UTF-8 and close the handle as soon as parsing is
    # done, instead of relying on the platform default encoding and on
    # garbage collection.
    with open(f, encoding='utf-8') as json_file:
        data = json.load(json_file)

    for key, value in data.items():

        # 'code' and 'title' are skipped; every other key is treated as a
        # sentence entry.
        if key in ('code', 'title'):
            continue

        ain_sentence = value['ain'] if 'ain' in value else ''
        jpn_sentence = value['jpn'] if 'jpn' in value else ''

        # Keep only entries that have both sides and whose Ainu side passes
        # alphabetCheck.
        if not ain_sentence or not jpn_sentence or not alphabetCheck(ain_sentence):
            continue

        # Verbal slip: words with slips are removed, since they may produce
        # non-existing words.
        # NOTE(review): the original remark says this step should be omitted
        # for text fed to the Transformer model so as not to create a
        # non-sentence -- confirm how that applies to this corpus.
        ain_sentence, slip_count = re.subn(verbalslip_pattern, ' ', ain_sentence)
        if slip_count:
            # Collapse the whitespace left behind by the removal.
            ain_sentence = ' '.join(ain_sentence.split())

        new_ain_sentence = ainPreprocess.preprocess(ain_sentence)

        corpus_texts.append(new_ain_sentence + '\t' + jpn_sentence)
        ain_preprocessed_texts.append(new_ain_sentence)
        jpn_texts.append(jpn_sentence)
        ain_raw_texts.append(ain_sentence)

In [None]:
def check_TBRcharacters(texts):
    """Print the characters in *texts* that fall outside the allowed set.

    Only characters occurring more than once across all texts are
    considered. From those, everything matched by ``[a-zÞA-Z0-9 =]`` is
    dropped and the remainder is printed on one line, in order of first
    appearance.

    Args:
        texts: Iterable of strings to scan.
    """
    # Frequency of every character over the whole dataset.
    frequency = collections.Counter(ch for text in texts for ch in text)

    # Keep characters seen at least twice, preserving first-seen order.
    repeated = ''.join(map(str, (ch for ch, count in frequency.items() if count > 1)))

    # Strip the characters that are allowed to stay.
    char_TBR = re.sub(r"[a-zÞA-Z0-9 =]", "", repeated)
    print("Charcaters to be removed: {0}".format(char_TBR))


# Report which characters in the Ainu sentences collected before
# ainPreprocess.preprocess is applied (ain_raw_texts) are still to be removed.
check_TBRcharacters(ain_raw_texts)

Charcaters to be removed: .,'?()_!/"-:;[]


In [None]:
# Write each list to its own UTF-8 text file, one entry per line.
# The files are produced in this order: combined corpus, preprocessed Ainu,
# raw Ainu, Japanese.
output_files = (
    ('poysuwop_corpus.txt', corpus_texts),
    ('poysuwop_ain.txt', ain_preprocessed_texts),
    ('poysuwop_ain_raw.txt', ain_raw_texts),
    ('poysuwop_jpn.txt', jpn_texts),
)

for filename, lines in output_files:
    with open(filename, 'w', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in lines)

#### Check

In [None]:
# -- ex. --
# Show the second corpus entry: preprocessed Ainu and Japanese, separated
# by a tab.
print(corpus_texts[1])
# print(ain_preprocessed_texts[1])  # disabled: Ainu side only

kþ erampewtek	私はわからない


#### Affix Marker Check

In [None]:
# Extract sentences in which the personal-affix marker '=' still has
# non-space characters on both of its sides.
# NOTE(review): this scans ain_preprocessed_texts, on the assumption that
# the check targets preprocessed output; the name `texts` used here before
# is not defined anywhere in this notebook -- confirm the intended list.
for ain_text in ain_preprocessed_texts:
    if re.search(r'\S+=\S+', ain_text):
        print(ain_text)

NameError: name 'texts' is not defined

In [None]:
# Print sentences that contain any of the letters l, q, v or x
# (presumably letters that should not occur in romanized Ainu -- confirm).
# NOTE(review): this scans ain_preprocessed_texts; the name `texts` used
# here before is not defined anywhere in this notebook -- confirm the
# intended list.
for ain_text in ain_preprocessed_texts:
    if re.search(r'[lqvx]', ain_text):
        print(ain_text)