In [1]:
import pickle

from utils.corpus import RawSentence

# Load the per-book corpus: a mapping from book title to its list of sentences.
# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source.
with open("output/annotated_translated_tokenized_corpus_by_book.pkl", "rb") as file:
    corpus: dict[str, list[RawSentence]] = pickle.load(file)

# Flatten every sentence of every book into one corpus-wide token list.
# The book titles are not needed here, so only the values are iterated.
combined_words = [
    token
    for book_sentences in corpus.values()
    for entry in book_sentences
    for token in entry["words"]
]

# Corpus-wide summary: tokens are all occurrences, types are distinct words.
total_tokens = len(combined_words)
total_types = len(set(combined_words))
print(f"Total tokens: {total_tokens}")
print(f"Total types: {total_types}")
print()

# Same two figures, reported separately for each book.
for book, sentences in corpus.items():
    flattened_words = []
    for sentence in sentences:
        flattened_words.extend(sentence["words"])

    token_count = len(flattened_words)
    type_count = len(set(flattened_words))

    print(f"Book: {book}")
    print(f"Tokens: {token_count}")
    print(f"Types: {type_count}")
    print()

Total tokens: 1136193
Total types: 33607

Book: アイヌ語鵡川方言日本語‐アイヌ語辞典
Tokens: 54828
Types: 5239

Book: アイヌ語アーカイブ
Tokens: 505373
Types: 11361

Book: アイヌタイムズ
Tokens: 122271
Types: 9425

Book: 平取町アイヌ口承文芸
Tokens: 125787
Types: 6709

Book: アイヌ語アーカイブ音声資料
Tokens: 30357
Types: 2804

Book: AA研アイヌ語資料
Tokens: 87673
Types: 5292

Book: 浅井タケ昔話全集I,II
Tokens: 51195
Types: 3782

Book: 石狩川のアイヌ語（中級）
Tokens: 1175
Types: 418

Book: アイヌ語會話字典
Tokens: 11681
Types: 2222

Book: アイヌ語口承文芸コーパス
Tokens: 63453
Types: 3129

Book: 石狩川のアイヌ語（初級）
Tokens: 858
Types: 342

Book: 十勝のアイヌ語（初級）
Tokens: 616
Types: 250

Book: アイヌ語ラジオ講座テキスト
Tokens: 10284
Types: 1522

Book: アイヌ民族文化センター研究紀要
Tokens: 14877
Types: 1527

Book: アイヌ語音声資料
Tokens: 1321
Types: 544

Book: 十勝のアイヌ語（中級）
Tokens: 1235
Types: 387

Book: 沙流のアイヌ語（初級）
Tokens: 505
Types: 219

Book: カラフトのアイヌ語（中級）
Tokens: 1714
Types: 563

Book: 美幌のアイヌ語（初級）
Tokens: 1599
Types: 356

Book: 鍋沢元蔵筆録ノート
Tokens: 14444
Types: 2155

Book: アコㇿイタㇰ
Tokens: 1600
Types: 343

Book: 千徳太郎治のピウスツキ宛書簡
Tokens: 18

In [3]:
from collections import Counter

# Count how often each word occurs across the whole corpus.
word_counts = Counter(combined_words)

# Write one "word<TAB>count" row per distinct word, most frequent first.
# The encoding is pinned to UTF-8: without it, open() falls back to the
# platform's locale encoding, so the file contents would vary between machines
# and writing a non-ASCII word could raise UnicodeEncodeError on a non-UTF-8
# locale.
with open("output/ainu_words_all.tsv", "w", encoding="utf-8") as f:
    for word, count in word_counts.most_common():
        f.write(f"{word}\t{count}\n")

In [4]:
from pathlib import Path
# Directory that receives one frequency table per book; created on demand so
# the cell can be re-run safely.
WORDS_BY_BOOK_DIR = Path('output') / 'words_by_book'
WORDS_BY_BOOK_DIR.mkdir(parents=True, exist_ok=True)

for book, sentences in corpus.items():
    # All tokens of this book, in corpus order.
    flattened_words = [word for sentence in sentences for word in sentence["words"]]
    # NOTE: this rebinds `word_counts`, so after this cell runs the name holds
    # the counts of the last book rather than the corpus-wide counts.
    word_counts = Counter(flattened_words)
    # The book title is used verbatim as the file name.
    # NOTE(review): assumes titles contain no path separators -- confirm.
    # The encoding is pinned to UTF-8 so the output does not depend on the
    # platform's locale encoding (which can fail on non-ASCII words).
    with open(WORDS_BY_BOOK_DIR / f"{book}.tsv", "w", encoding="utf-8") as f:
        # One "word<TAB>count" row per distinct word, most frequent first.
        for word, count in word_counts.most_common():
            f.write(f"{word}\t{count}\n")
