In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# !huggingface-cli login

In [3]:
from typing import cast
import datasets

dataset = cast(datasets.DatasetDict, datasets.load_dataset("aynumosir/ainu-corpora"))

Using the latest cached version of the dataset since aynumosir/ainu-corpora couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/mkpoli/.cache/huggingface/datasets/aynumosir___ainu-corpora/default/0.0.0/81c26c21bbd662ae55371f334e0796a99c8b3d61 (last modified on Sat Dec 28 16:27:18 2024).


In [4]:
import json

from collections import defaultdict
from itertools import chain
from typing import cast, TypedDict
from utils.corpus import CorpusItem
from utils.lemmatize import lemmatize
from utils.tokenize import tokenize
from tqdm.notebook import tqdm

class Text(TypedDict):
    book: str
    title: str
    url: str
    pronoun: str
    author: str
    dialect: str
    text: str
    translation: str
    label: str

sentences_by_book: defaultdict[str, list[CorpusItem]] = defaultdict(list)

with open("../dictionary/output/combined_part_of_speech.json", "r") as f:
    part_of_speech = json.load(f)

with open("../dictionary/output/combined_glosses.json", "r") as f:
    glosses = json.load(f)

data = list(chain(dataset["test"], dataset["train"]))

for text in tqdm(data):
    text = cast(Text, text)
    words = tokenize(text["text"])

    poses: list[list[str]] = [part_of_speech.get(w, []) for w in words]

    lemmatized: list[list[tuple[str, dict[str, str]]]] = [
        [lemmatize(w, p) for p in ps] for w, ps in zip(words, poses) for p in ps
    ]

    sentences_by_book[text["book"]].append(
        {
            "translation": text["translation"],
            "sentence": text["text"],
            "words": words,
            "lemmas": [[lemma for lemma, _ in lemmas] for lemmas in lemmatized],
            "features": [
                [features for _, features in lemmas] for lemmas in lemmatized
            ],
            "part_of_speech": poses,
            "glosses": [glosses.get(w, []) for w in words],
        }
    )

sentences_by_book.keys()

  0%|          | 0/176935 [00:00<?, ?it/s]

dict_keys(['アイヌ語アーカイブ', 'アイヌ語アーカイブ音声資料', 'アイヌ民族文化センター研究紀要', 'アイヌ神謡集', 'アイヌ語鵡川方言日本語‐アイヌ語辞典', '平取町アイヌ口承文芸', 'アイヌタイムズ', 'AA研アイヌ語資料', 'アイヌ語口承文芸コーパス', 'アイヌ語會話字典', '鍋沢元蔵筆録ノート', 'しずないのアイヌご（入門）', 'カムイユカㇻを聞いてアイヌ語を学ぶ', '浅井タケ昔話全集I,II', '北海道立アイヌ民族文化研究センター紀要', 'アイヌ語・アイヌ文化研究の課題', '幌別のアイヌ語（中級）', 'アイヌ語音声資料', '千歳のアイヌ語（中級）', '千徳太郎治のピウスツキ宛書簡', 'からふとのアイヌご（入門）', '石狩川のアイヌ語（中級）', '十勝のアイヌ語（中級）', '沙流のアイヌ語（中級）', 'アイヌ口承文芸テキスト集', '美幌のアイヌ語（中級）', 'いしかりがわのアイヌご（入門）', '千歳のアイヌ語（初級）', '幌別のアイヌ語（初級）', '十勝のアイヌ語（初級）', 'ニューエクスプレスプラス アイヌ語', 'アイヌ語ラジオ講座テキスト', '白沢ナベと小田イトの会話', 'びほろのあいぬご（入門）', 'カラフトのアイヌ語（初級）', 'アイヌ語復興に関わる諸問題', 'とかちのアイヌご（入門）', '第27回アイヌ語弁論大会', 'アコㇿイタㇰ', '美幌のアイヌ語（初級）', 'プラハ宣言', 'さるのアイヌご（入門）', 'ほろべつのアイヌご（入門）', '石狩川のアイヌ語（初級）', '沙流のアイヌ語（初級）', 'カラフトのアイヌ語（中級）', 'ウポポイ館内展示', '静内のアイヌ語（中級）', '静内のアイヌ語（初級）', 'ニューエクスプレス・スペシャル 日本語の隣人たち I+II', '萱野茂の国会演説', 'ちとせのアイヌご（入門）', 'ピウスツキ記念碑'])

In [5]:
sentences = print(sum(len(v) for v in sentences_by_book.values()))

176935


In [6]:
for sentence in (
    sentences_by_book["萱野茂の国会演説"][0:3]
    + sentences_by_book["平取町アイヌ口承文芸"][-3:]
    + sentences_by_book["AA研アイヌ語資料"][55 : 55 + 3]
    + sentences_by_book["沙流のアイヌ語（中級）"][10:13]
):
    print("sentence: ", sentence["sentence"])
    print("translation: ", sentence["translation"])
    print("words: ", sentence["words"])
    print("lemmas: ", sentence["lemmas"])
    print("features: ", sentence["features"])
    print("part of speech: ", sentence["part_of_speech"])
    print("glosses: ", sentence["glosses"])
    print("-" * 100)

sentence:  ponno ne kusu ci=koykokanu wa un=kore yan.
translation:  少しですので、私のアイヌ語にお耳を傾けてくださいますようお願い申し上げる次第です。
words:  ['ponno', 'ne', 'kusu', 'ci=', 'koykokanu', 'wa', 'un=', 'kore', 'yan', '.']
lemmas:  [['ne', 'ne', 'ne', 'ne', 'ne'], ['ne', 'ne', 'ne', 'ne', 'ne'], ['ne', 'ne', 'ne', 'ne', 'ne'], ['ne', 'ne', 'ne', 'ne', 'ne'], ['ne', 'ne', 'ne', 'ne', 'ne'], ['kusu', 'kusu'], ['kusu', 'kusu'], ['ci=', 'ci='], ['ci=', 'ci='], ['wa'], ['un='], ['kore'], ['yan']]
features:  [[{}, {'Valency': '-1'}, {}, {'Valency': '0'}, {}], [{}, {'Valency': '-1'}, {}, {'Valency': '0'}, {}], [{}, {'Valency': '-1'}, {}, {'Valency': '0'}, {}], [{}, {'Valency': '-1'}, {}, {'Valency': '0'}, {}], [{}, {'Valency': '-1'}, {}, {'Valency': '0'}, {}], [{'Valency': '0'}, {'Valency': '0'}], [{'Valency': '0'}, {'Valency': '0'}], [{'Person': '1', 'Number': 'Sing', 'Clusivity': 'Ex', 'Case': 'Nom', 'Valency': '-1'}, {'Person': '1', 'Number': 'Sing', 'Clusivity': 'Ex', 'Case': 'Nom', 'Valency': '-1'}], [{'Person': '1

In [7]:
import pickle

with open("output/annotated_translated_tokenized_corpus_by_book.pkl", "wb") as f:
    pickle.dump(sentences_by_book, f)

## Extract words

In [8]:
import pickle
import regex as re
from utils.corpus import CorpusItem

with open("output/annotated_translated_tokenized_corpus_by_book.pkl", "rb") as file:
    corpus: dict[str, list[CorpusItem]] = pickle.load(file)

In [9]:
# Create dictionary of words by book
from utils.tokenize import is_word
words_by_book = {}
for book, sentences in corpus.items():
    words_by_book[book] = [
        word
        for sentence in sentences
        for word in sentence["words"]
        if is_word(word)
    ]

# Create combined words from all books
combined_words = [
    word
    for words in words_by_book.values()
    for word in words
]

SAKHALIN_BOOKS = {
    "からふとのアイヌご（入門）",
    "カラフトのアイヌ語（中級）", 
    "カラフトのアイヌ語（初級）",
    "ニューエクスプレス・スペシャル 日本語の隣人たち I+II",
    "ピウスツキ記念碑",
    "千徳太郎治のピウスツキ宛書簡",
    "浅井タケ昔話全集I,II",
    "アイヌ語・アイヌ文化研究の課題",
}

MODERN_WORDS = {"アイヌタイムズ", "プラハ宣言", "ウポポイ館内展示", "萱野茂の国会演説"}

# Create combined words from traditional Hokkaido books only
combined_words_hokkaido_traditional = [
    word
    for book, words in words_by_book.items()
    for word in words
    if book not in SAKHALIN_BOOKS and book not in MODERN_WORDS
]

In [10]:
from collections import Counter

word_counts = Counter(combined_words)

with open("output/ainu_words_all.tsv", "w") as f:
    for word, count in word_counts.most_common():
        f.write(f"{word}\t{count}\n")
with open("output/ainu_words_hokkaido_traditional.tsv", "w") as f:
    for word, count in word_counts.most_common():
        f.write(f"{word}\t{count}\n")

In [11]:
from pathlib import Path

WORDS_BY_BOOK_DIR = Path("output") / "words_by_book"
WORDS_BY_BOOK_DIR.mkdir(parents=True, exist_ok=True)

for book, sentences in corpus.items():
    flattened_words = [word for sentence in sentences for word in sentence["words"]]
    word_counts = Counter(flattened_words)
    with open(WORDS_BY_BOOK_DIR / f"{book}.tsv", "w") as f:
        for word, count in word_counts.most_common():
            f.write(f"{word}\t{count}\n")

In [14]:
from utils.tokenize import is_word

lemmas_by_book = {}
for book, sentences in corpus.items():
    lemmas_by_book[book] = [
        lemma[0]
        for sentence in sentences
        for lemma in sentence["lemmas"]
        if is_word(lemma[0])
        if lemma
    ]

combined_lemmas = [
    lemma for lemmas in lemmas_by_book.values() for lemma in lemmas
]

combined_lemmas_hokkaido_traditional = [
    lemma for book, lemmas in lemmas_by_book.items() for lemma in lemmas
    if book not in SAKHALIN_BOOKS and book not in MODERN_WORDS
]

with open("output/ainu_lemmas_all.tsv", "w") as f:
    for lemma, count in Counter(combined_lemmas).most_common():
        f.write(f"{lemma}\t{count}\n")

with open("output/ainu_lemmas_hokkaido_traditional.tsv", "w") as f:
    for lemma, count in Counter(combined_lemmas_hokkaido_traditional).most_common():
        f.write(f"{lemma}\t{count}\n")