In [8]:
import json

from pathlib import Path
import numpy as np
from tqdm import tqdm
import pickle
import os

root_dir = Path(os.path.abspath("__file__")).parent
dataset_dir = root_dir / "data"
ids_file_dir = dataset_dir / "ids_mod.tsv"
dic_dir = dataset_dir / "dictionary.txt"
pickle_dir = root_dir / "pickle"

print(f"IDS path\t: {ids_file_dir}")
print(f"Dictionary path\t: {dic_dir}")

with open(pickle_dir / 'atoms.pickle', 'rb') as f:
    atoms = pickle.load(f)

with open(pickle_dir / 'nonatoms_decomposed.pickle', 'rb') as f:
    nonatoms_decomposed = pickle.load(f)

print(f'atoms {len(atoms)}')
print(f'nonatoms_decomposed {len(nonatoms_decomposed)}')

atoms_dict = dict(zip(atoms[:,0], atoms[:,2]))
nonatoms_decomposed_dict = dict(zip(nonatoms_decomposed[:,0], nonatoms_decomposed[:,2]))
decomposed_dict = atoms_dict.copy()
decomposed_dict.update(nonatoms_decomposed_dict)

print(f'decomposed {len(decomposed_dict)}')

IDS path	: /home/shaneoh/workspace/radical-clustering/data/ids_mod.tsv
Dictionary path	: /home/shaneoh/workspace/radical-clustering/data/dictionary.txt
atoms 469
nonatoms_decomposed 88480
decomposed 88949


In [9]:
# 'character', 'definition', 'pinyin', 'decomposition', 'etymology', 'radical', 'matches'
# 'character', 'definition', 'pinyin', 'decomposition',              'radical', 'matches'
# 'character',               'pinyin', 'decomposition', 'etymology', 'radical', 'matches'
# 'character',               'pinyin', 'decomposition',              'radical', 'matches'

dict_list = []
keys = ('character', 'definition', 'pinyin', 'decomposition', 'etymology', 'radical', 'matches')
with open(dic_dir) as f:
    for row in f:
        char_info = json.loads(row)
        for key in keys:
            try:
                char_info[key]
            except:
                char_info[key] = None
        dict_list.append(char_info)

print(f'dict {len(dict_list)}')

type_dict = {}

for d in dict_list:
    if d['etymology'] is not None:
        type_name = d['etymology']['type']
        if type_name in type_dict:
            type_dict[type_name] += 1
        else:
            type_dict[type_name] = 1

type_dict['none'] = len(dict_list) - sum(type_dict.values())

print(type_dict)

print("\nexample")
print(dict_list[2])

dict 9574
{'ideographic': 1840, 'pictographic': 227, 'pictophonetic': 6966, 'none': 541}

example
{'character': '⺊', 'pinyin': [], 'decomposition': '⿰丨？', 'etymology': {'type': 'ideographic', 'hint': 'A crack on an oracle bone; compare 卜'}, 'radical': '⺊', 'matches': [[0], None], 'definition': None}


In [10]:
ideo_dict_list   = (d for d in dict_list if d['etymology'] is not None and d['etymology']['type'] == 'ideographic')
picto_dict_list  = (d for d in dict_list if d['etymology'] is not None and d['etymology']['type'] == 'pictographic')
picpho_dict_list = (d for d in dict_list if d['etymology'] is not None and d['etymology']['type'] == 'pictophonetic')

In [13]:
for type in type_dict:
    count = 0
    for d in (d for d in dict_list if d['etymology'] is not None and d['etymology']['type'] == type):
        try:
            decomposed_dict[hex(ord(d['character']))]
            count += 1
        except:
            pass
    print(f'{type} {count}')

ideographic 1840
pictographic 226
pictophonetic 6966
none 0
