# Generate combined part of speech data

In [1]:
from pathlib import Path
import json
from typing import TypedDict

OUTPUT_DIR = Path("output")
INPUT_DIR = Path("input")

# Expand more pos data, from wiktionary by ain-verb template and from Tamurax

## Load data

### Load Wiktionary POS data

In [2]:
from collections import Counter

# Load Wiktionary POS data
with open(OUTPUT_DIR / "wiktionary_ainu_part_of_speech.json", "r") as f:
    wiktionary_pos = json.load(f)

# TODO: Relfect transitivity

# noun: 1415
# verb: 612
# adv: 81
# parti: 40
# num: 34
# suffix: 27
# prefix: 26
# pron: 22
# interj: 22
# auxverb: 17
# root: 13
# conj: 11
# adnom: 8
# pronoun: 2
# postpadv: 2
# colloc: 2
# determiner: 1
# rel: 1

WIKTIONARY_XPOS_MAP = {
    "noun": "n",
    "verb": "v",
    "conj": "cconj",
    "pron": "pron",
    "adv": "adv",
    "parti": "parti",
    "num": "num",
    "suffix": "sfx",
    "prefix": "pfx",
    "pron": "pron",
    "interj": "intj",
    "auxverb": "auxv",
    "root": "root",
    "adnom": "adn",
    "postpadv": "postp",
    "colloc": "colloc",
    "determiner": "adn",
    "rel": "rel",

}

c = Counter()
for word, pos in wiktionary_pos.items():
    for p in pos:
        c[p] += 1

for pos, count in c.most_common():
    print(f"{pos}: {count}")

print("Wiktionary entries:", len(wiktionary_pos))
for i, (lemma, pos) in enumerate(wiktionary_pos.items()):
    print(f"{lemma}\t{pos}")
    if i > 10:
        break

# word -> list of pos
mapped_wiktionary_pos: dict[str, list[str]] = {
    word: [WIKTIONARY_XPOS_MAP[pos] if pos in WIKTIONARY_XPOS_MAP else pos for pos in pos_list]
    for word, pos_list in wiktionary_pos.items()
}


noun: 1415
verb: 612
adv: 81
parti: 40
num: 34
suffix: 27
prefix: 26
pron: 22
interj: 22
auxverb: 17
root: 13
conj: 11
adnom: 8
pronoun: 2
postpadv: 2
colloc: 2
determiner: 1
rel: 1
Wiktionary entries: 2229
wan	['num']
tu	['num']
rak	['verb']
ci	['verb']
mi	['verb']
on	['verb']
ona	['noun']
o	['verb']
ay	['noun']
oro	['noun']
he	['parti']
i	['prefix']


### Load FF Ainu POS data

In [3]:
# Load FF Ainu POS data
with open(OUTPUT_DIR / "ff-ainu-saru-terms.json", "r") as f:
    ff_saru_terms = json.load(f)

print("FF Saru entries:", len(ff_saru_terms))
for i, entry in enumerate(ff_saru_terms):
    print(f"{entry['kana']}\t{entry['pos']}")
    if i > 10:
        break

FF Saru entries: 975
ア	人接
ア	自
アアン	助動
アイヌ	名
アイヌイタク	名
アイヌフラ	名
アイネ	接助
アエプ	名
アオイペプ	名
アオカ	代名
アキヒ	名
アク	名


## Load manual POS data

In [4]:
with open(INPUT_DIR / "manual_gloss.json", "r") as f:
    manual_glosses = {k: v["poses"] for k, v in json.load(f).items()}

print("Manual:", len(manual_glosses))
for i, (lemma, pos) in enumerate(manual_glosses.items()):
    print(f"{lemma}\t{pos}")
    if i > 10:
        break

Manual: 2
k=	['pers']
c=	['pers']


## Combine POS data

In [5]:
# Map FF Ainu POS tags to Wiktionary format
# | UPOS                | XPOS   | JAPANESE |
# | ------------------- | ------ | -------- |
# | VERB                | vi     | 自動詞   |
# | VERB                | vt     | 他動詞   |
# | VERB                | vd     | 複他動詞 |
# | VERB                | vc     | 完全動詞 |
# | VERB                | v      | 動詞     |
# | AUX                 | auxv   | 助動詞   |
# | AUX                 | cop    | 繋辞     |
# | NOUN                | n      | 名詞     |
# | NOUN                | nl     | 位置名詞 |
# | NOUN                | nmlz   | 形式名詞 |
# | PRON                | pron   | 代名詞   |
# | PROPN               | propn  | 固有名詞 |
# | DET                 | adn    | 連体詞   |
# | ADV                 | adv    | 副詞     |
# | CCONJ / SCONJ / ADV | cconj  | 接続詞   |
# | POST                | post   | 助詞     |
# | PART                | sfp    | 終助詞   |
# | PART                | pers   | 人称接辞 |
# | INTJ                | intj   | 間投詞   |
# | SCONJ               | sconj  | 接続助詞 |
# | SCONJ               | padv   | 後置副詞 |
# | -                   | sfx    | 接尾辞   |
# | -                   | pfx    | 接頭辞   |
# | -                   | root   | 語根     |
# | ADP                 | advp   | 副助詞   |
# | ADP                 | postp  | 格助詞   |
# | ADP                 | parti  | 助詞     |
# | PRON / DET / NOUN   | int    | 疑問詞   |
# | NUM                 | num    | 数詞     |
# | PUNCT               | punct  | 記号     |
# | -                   | colloc | 連語     |
# | -                   | idiom  | 慣用句   |


POS_MAP = {
    "名": "n",
    "自": "vi", 
    "他": "vt",
    "複他": "vd",
    "完": "vc",
    "助動": "auxv",
    "繋辞": "cop",
    "副": "adv",
    "後副": "padv",
    "連体": "adn",
    "間投": "intj",
    "形名": "nmlz",
    "代名": "pron",
    "位名": "nl",
    "格助": "postp",
    "副助": "advp",
    "終助": "sfp",
    "接助": "sconj",
    "人接": "pers",
    "接頭": "pfx",
    "接尾": "sfx",
    "位名＋格助": "colloc",
    "名＋格助": "colloc",
    "動": "verb"
}

# Combine POS data
combined_pos = {}

# Add Wiktionary entries
for lemma, pos_list in mapped_wiktionary_pos.items():
    combined_pos[lemma] = pos_list

# Add FF Saru entries
for entry in ff_saru_terms:
    if entry["kana"] in combined_pos and combined_pos[entry["kana"]]:
        continue

    mapped_poses = []
    for pos in entry["pos"].split("／"):
        mapped_pos = POS_MAP.get(pos)
        if mapped_pos:
            mapped_poses.append(mapped_pos)
        else:
            print(f"No mapping found for {entry['kana']} with POS {entry['pos']}")

    combined_pos[entry["kana"]] = mapped_poses

print("Combined entries:", len(combined_pos))
for i, (lemma, pos) in enumerate(combined_pos.items()):
    print(f"{lemma}\t{pos}")
    if i > 10:
        break

Combined entries: 3186
wan	['num']
tu	['num']
rak	['v']
ci	['v']
mi	['v']
on	['v']
ona	['n']
o	['v']
ay	['n']
oro	['n']
he	['parti']
i	['pfx']


In [6]:
# Save combined POS data
with open(OUTPUT_DIR / "combined_part_of_speech.json", "w") as f:
    json.dump(combined_pos, f, ensure_ascii=False, indent=2)

with open("../utils/src/utils/data/combined_part_of_speech.json", "w") as f:
    json.dump(combined_pos, f, ensure_ascii=False, indent=2)


In [7]:
import json
from collections import Counter
with open(OUTPUT_DIR / "combined_part_of_speech.json", "r") as f:
    combined_pos = json.load(f)

c = Counter()
for word, poses in combined_pos.items():
    for pos in poses:
        c[pos] += 1

for word, count in c.most_common():
    print(f"{word}: {count}")


n: 1840
v: 612
vi: 183
adv: 133
vt: 123
parti: 40
num: 34
pron: 34
adn: 31
auxv: 30
intj: 29
nl: 29
pfx: 28
sfx: 27
sconj: 19
advp: 14
root: 13
padv: 12
cconj: 11
vd: 10
pers: 9
postp: 7
vc: 7
sfp: 6
colloc: 5
nmlz: 5
pronoun: 2
rel: 1
verb: 1
