In [20]:
import json

from pathlib import Path
import numpy as np
from tqdm import tqdm
import pickle
import os

root_dir = Path(os.path.abspath("__file__")).parent
dataset_dir = root_dir / "data"
ids_file_dir = dataset_dir / "ids_mod.tsv"
dic_dir = dataset_dir / "dictionary.txt"
pickle_dir = root_dir / "pickle"

print(f"IDS path\t: {ids_file_dir}")
print(f"Dictionary path\t: {dic_dir}")

with open(pickle_dir / 'atoms.pickle', 'rb') as f:
    atoms = pickle.load(f)

with open(pickle_dir / 'nonatoms_decomposed.pickle', 'rb') as f:
    nonatoms_decomposed = pickle.load(f)

print(f'atoms {len(atoms)}')
print(f'nonatoms_decomposed {len(nonatoms_decomposed)}')

atoms_dict = dict(zip(atoms[:,0], atoms[:,2]))
nonatoms_decomposed_dict = dict(zip(nonatoms_decomposed[:,0], nonatoms_decomposed[:,2]))
decomposed_dict = atoms_dict.copy()
decomposed_dict.update(nonatoms_decomposed_dict)

print(f'decomposed {len(decomposed_dict)}')

IDS path	: /home/shane/workspace/radical-clustering/data/ids_mod.tsv
Dictionary path	: /home/shane/workspace/radical-clustering/data/dictionary.txt
atoms 469
nonatoms_decomposed 88480
decomposed 88949


In [21]:
# 'character', 'definition', 'pinyin', 'decomposition', 'etymology', 'radical', 'matches'
# 'character', 'definition', 'pinyin', 'decomposition',              'radical', 'matches'
# 'character',               'pinyin', 'decomposition', 'etymology', 'radical', 'matches'
# 'character',               'pinyin', 'decomposition',              'radical', 'matches'

dict_list = []
keys = ('character', 'definition', 'pinyin', 'decomposition', 'etymology', 'radical', 'matches')
with open(dic_dir) as f:
    for row in f:
        char_info = json.loads(row)
        for key in keys:
            try:
                char_info[key]
            except:
                char_info[key] = None
        dict_list.append(char_info)

print(f'dict {len(dict_list)}')

type_dict = {}

for d in dict_list:
    if d['etymology'] is not None:
        type_name = d['etymology']['type']
        if type_name in type_dict:
            type_dict[type_name] += 1
        else:
            type_dict[type_name] = 1

type_dict['none'] = len(dict_list) - sum(type_dict.values())

print(type_dict)

print("\nexample")
print(dict_list[2])

dict 9574
{'ideographic': 1840, 'pictographic': 227, 'pictophonetic': 6966, 'none': 541}

example
{'character': '⺊', 'pinyin': [], 'decomposition': '⿰丨？', 'etymology': {'type': 'ideographic', 'hint': 'A crack on an oracle bone; compare 卜'}, 'radical': '⺊', 'matches': [[0], None], 'definition': None}


In [22]:
ideo_dict_list   = (d for d in dict_list if d['etymology'] is not None and d['etymology']['type'] == 'ideographic')
picto_dict_list  = (d for d in dict_list if d['etymology'] is not None and d['etymology']['type'] == 'pictographic')
picpho_dict_list = (d for d in dict_list if d['etymology'] is not None and d['etymology']['type'] == 'pictophonetic')

In [23]:
for type in type_dict:
    count = 0
    for d in (d for d in dict_list if d['etymology'] is not None and d['etymology']['type'] == type):
        try:
            decomposed_dict[hex(ord(d['character']))]
            count += 1
        except:
            pass
    print(f'{type} {count}')

ideographic 1840
pictographic 226
pictophonetic 6966
none 0


In [25]:
from functools import reduce

sentences = {}
for type in type_dict:
    if type == 'none':
        continue
    decomposed = {}
    for d in (d for d in dict_list if d['etymology'] is not None and d['etymology']['type'] == type):
        try:
            decomposed[hex(ord(d['character']))] = [hex(ord(c)) for c in decomposed_dict[hex(ord(d['character']))]]
        except:
            pass
    sentences[type] = decomposed

for k in sentences:
    key_length = len(sentences[k])
    unique_length = len(set(reduce(lambda a, b: a+b, sentences.values())))

    f = open(f"radical-{k}", "w")
    f.write(f"*Verticies {key_length+unique_length} {key_length}\n")

    print(key_length, unique_length) 

# f = open("radical-bipartite.net", "w")
# f.write(f"*Verticies {nonatoms_decomposed_size+unique_atoms_size} {nonatoms_decomposed_size}\n")
# for i, v in enumerate(V, start=1):
#     if i <= nonatoms_decomposed_size:
#         f.write(f"{i}\t\"{v}\"\t0.0\t0.0\t0.0\tellipse\tic\tRed")
#     else:
#         unique_atoms_enum_dict[v] = i
#         f.write(f"{i}\t\"{v}\"\t0.0\t0.0\t0.0\tbox\tic\tGreen")
#     f.write("\n")

# f.write("*Arcs\n*Edges\n")
# for i, ids in enumerate(nonatoms_decomposed[:,2], start=1):
#     for c in ids:
#         f.write(f"{i}\t{unique_atoms_enum_dict[hex(ord(c))]}\t1\n")

# f.close()


['0x2e8a', '0x2ff1', '0x2ff1', '0x4e36', '0x4e00', '0x2ff4', '0x56d7', '0x53e3', '0x2ff1', '0x5eff', '0x2ffb', '0x5dfe', '0x2ff0', '0x5165', '0x5165', '0x2ff0', '0x2ff1', '0x2ff1', '0x2e8a', '0x2ff5', '0x5182', '0x4e00', '0x8c37', '0x53c8', '0x2ff1', '0x2ff1', '0x65e5', '0x4e00', '0x5bf8', '0x2ff1', '0x2ffa', '0x200ca', '0x2ff0', '0x2ff0', '0x2461', '0x4e36', '0x2ff0', '0x2461', '0x4e36', '0x2ffa', '0x200ca', '0x2ff0', '0x2ff0', '0x2461', '0x4e36', '0x2ff0', '0x2461', '0x4e36', '0x2ff8', '0x5e7f', '0x2ff1', '0x2ffb', '0x7532', '0x4e00', '0x4e00', '0x2ffa', '0x5ef4', '0x2462', '0x2ff1', '0x2ff1', '0x767d', '0x5c0f', '0x5f61', '0x2ff0', '0x624c', '0x2ff7', '0x531a', '0x5dfe', '0x2ff1', '0x65e5', '0x2ff1', '0x2ff0', '0x2ff0', '0x2461', '0x4e36', '0x2ff0', '0x2461', '0x4e36', '0x706c', '0x2ff0', '0x74dc', '0x74dc', '0x2ff2', '0x2ff1', '0x2ff0', '0x2461', '0x4e36', '0x5c0f', '0x8a00', '0x2ff1', '0x2ff0', '0x2461', '0x4e36', '0x5c0f', '0x4e00', '0x2ff1', '0x4e28', '0x4e00', '0x2ff1', '0x4e00

TypeError: unsupported operand type(s) for +: 'dict' and 'dict'