# Generate combined part of speech data

In [4]:
from pathlib import Path
import json
from typing import TypedDict

# Directory holding the manually maintained input file (manual_gloss.json).
INPUT_DIR = Path("input")
# Directory the generated JSON files are read from and the combined result is written to.
OUTPUT_DIR = Path("output")

## Load data

### Load Wiktionary POS data

In [5]:
# Load Wiktionary POS data (a JSON object mapping each lemma to a list of POS tags).
# The encoding is passed explicitly so the file is decoded as UTF-8 on every
# platform instead of falling back to the locale's default encoding.
with open(OUTPUT_DIR / "wiktionary_ainu_part_of_speech.json", "r", encoding="utf-8") as f:
    wiktionary_pos = json.load(f)

print("Wiktionary entries:", len(wiktionary_pos))
# Preview the first 12 entries (the loop stops once index 11 has been printed).
for i, (lemma, pos) in enumerate(wiktionary_pos.items()):
    print(f"{lemma}\t{pos}")
    if i > 10:
        break

Wiktionary entries: 2229
wan	['num']
tu	['num']
rak	['verb']
ci	['verb']
mi	['verb']
on	['verb']
ona	['noun']
o	['verb']
ay	['noun']
oro	['noun']
he	['parti']
i	['prefix']


### Load FF Ainu POS data

In [6]:
# Load FF Ainu POS data (a JSON list of entries, each with "kana" and "pos" fields).
# The data contains kana, so the encoding is passed explicitly: relying on the
# locale default would fail or produce mojibake on non-UTF-8 platforms.
with open(OUTPUT_DIR / "ff-ainu-saru-terms.json", "r", encoding="utf-8") as f:
    ff_saru_terms = json.load(f)

print("FF Saru entries:", len(ff_saru_terms))
# Preview the first 12 entries (the loop stops once index 11 has been printed).
for i, entry in enumerate(ff_saru_terms):
    print(f"{entry['kana']}\t{entry['pos']}")
    if i > 10:
        break

FF Saru entries: 975
ア	人接
ア	自
アアン	助動
アイヌ	名
アイヌイタク	名
アイヌフラ	名
アイネ	接助
アエプ	名
アオイペプ	名
アオカ	代名
アキヒ	名
アク	名


### Load manual POS data

In [7]:
# Load manual POS data, keeping only the "poses" field of each entry (keyed by lemma).
# The encoding is passed explicitly so the file is decoded as UTF-8 on every
# platform instead of falling back to the locale's default encoding.
with open(INPUT_DIR / "manual_gloss.json", "r", encoding="utf-8") as f:
    manual_glosses = {k: v["poses"] for k, v in json.load(f).items()}

print("Manual:", len(manual_glosses))
# Preview at most the first 12 entries (the loop stops once index 11 has been printed).
for i, (lemma, pos) in enumerate(manual_glosses.items()):
    print(f"{lemma}\t{pos}")
    if i > 10:
        break

Manual: 2
k=	['pers']
c=	['pers']


## Combine POS data

In [8]:
# Map FF Ainu POS tags to Wiktionary format
POS_MAP = {
    "名": "noun",
    "自": "verb",
    "他": "verb",
    "複他": "verb",
    "完": "verb",
    "助動": "auxverb",
    "副": "adv",
    "後副": "postpadv",
    "連体": "adnom",
    "間投": "interj",
    "形名": "nmlz",
    "代名": "pronoun",
    "位名": "noun",
    "格助": "parti",
    "副助": "parti",
    "終助": "parti",
    "接助": "parti",
    "人接": "pers",
    "接頭": "prefix",
    "接尾": "suffix",
    "位名＋格助": "colloc",
    "名＋格助": "colloc",
    "動": "verb"
}

# Combine POS data: Wiktionary takes priority, FF Saru fills in everything else.
# NOTE(review): manual_glosses is loaded in an earlier cell but is not merged
# here — confirm whether it is meant to be part of the combined output.
combined_pos = {}

# Add Wiktionary entries. Each list is copied so that merging FF tags below can
# never mutate the lists held by wiktionary_pos.
for lemma, pos_list in wiktionary_pos.items():
    combined_pos[lemma] = list(pos_list or [])

# Lemmas for which Wiktionary already provides at least one POS; FF data is
# ignored for these. This set is fixed before the FF loop starts, so an FF
# entry is never skipped merely because an earlier FF entry had the same kana.
wiktionary_covered = {lemma for lemma, pos_list in wiktionary_pos.items() if pos_list}

# Add FF Saru entries. The same kana can appear several times with different
# POS (homographs), so tags are accumulated per kana instead of keeping only
# the first entry seen.
for entry in ff_saru_terms:
    kana = entry["kana"]
    if kana in wiktionary_covered:
        continue

    merged_poses = combined_pos.setdefault(kana, [])
    # A single entry may carry several tags separated by a full-width slash.
    for pos in entry["pos"].split("／"):
        mapped_pos = POS_MAP.get(pos)
        if mapped_pos is None:
            print(f"No mapping found for {entry['kana']} with POS {entry['pos']}")
        elif mapped_pos not in merged_poses:
            # Several FF tags map to the same target tag; keep each only once.
            merged_poses.append(mapped_pos)

print("Combined entries:", len(combined_pos))
# Preview the first 12 entries (the loop stops once index 11 has been printed).
for i, (lemma, pos) in enumerate(combined_pos.items()):
    print(f"{lemma}\t{pos}")
    if i > 10:
        break

Combined entries: 3186
wan	['num']
tu	['num']
rak	['verb']
ci	['verb']
mi	['verb']
on	['verb']
ona	['noun']
o	['verb']
ay	['noun']
oro	['noun']
he	['parti']
i	['prefix']


In [9]:
# Save combined POS data.
# ensure_ascii=False writes kana and other non-ASCII characters verbatim, so the
# file must be opened as UTF-8 explicitly; with the locale's default encoding the
# write can raise UnicodeEncodeError or produce a file that is not valid UTF-8 JSON.
with open(OUTPUT_DIR / "combined_part_of_speech.json", "w", encoding="utf-8") as f:
    json.dump(combined_pos, f, ensure_ascii=False, indent=2)