# In [3]:
import io
import icu
import json

# Base names of the Unihan database files to convert; each one is read from
# rawdata/<name>.txt and written to data/<name>.json.
databases = [
    "Unihan_DictionaryIndices",
    "Unihan_DictionaryLikeData",
    "Unihan_IRGSources",
    "Unihan_NumericValues",
    "Unihan_OtherMappings",
    "Unihan_RadicalStrokeCounts",
    "Unihan_Readings",
    "Unihan_Variants",
]

# Twelve code points between U+FA0E and U+FA29 that the (currently disabled)
# compatibility-block filter in the conversion loop checks against.
normalPUAs = [
    0xFA0E, 0xFA0F, 0xFA11, 0xFA13,
    0xFA14, 0xFA1F, 0xFA21, 0xFA23,
    0xFA24, 0xFA27, 0xFA28, 0xFA29,
]

# convert U+... to a real character
def ucode_to_char(ucode):
    """Turn a ``U+XXXX`` token into the character it names.

    Tokens that do not begin with ``U+`` are returned unchanged.  When the
    token contains ``<``, only the hex digits in front of it are converted;
    the ``<`` and everything after it are appended to the result as-is.

    Raises ``ValueError`` if the text after ``U+`` cannot be parsed as a
    hexadecimal number.
    """
    if not ucode.startswith("U+"):
        return ucode
    # Split the hex digits from an optional "<..." suffix; when there is no
    # "<", both the separator and the suffix come back as empty strings.
    hex_digits, marker, suffix = ucode[2:].partition("<")
    return chr(int(hex_digits, 16)) + marker + suffix

# Convert every Unihan text database under rawdata/ into a compact JSON file
# under data/, shaped as {character: {field name: [value, ...]}}.
for database in databases:
    # The with-statement closes the file on exit, so no explicit close().
    with open("rawdata/" + database + ".txt", "r", encoding="utf8") as f:
        lines = f.read().splitlines()

    dbObj = {}
    for line in lines:
        # ignore blank and comment lines
        if line == "" or line.startswith("#"):
            continue

        # each record is: code point <TAB> field name <TAB> value
        tokens = line.split("\t")
        hanChar = ucode_to_char(tokens[0])
        field = tokens[1]

        # NOTE: a filter for the CJK Compatibility Ideographs blocks (based on
        # icu.Char.ublock_getCode and normalPUAs) was sketched here but is
        # disabled, so every code point found in the file is kept.

        # a field may hold several space-separated values, except kDefinition,
        # whose text is kept whole as a single value
        if field == "kDefinition":
            values = [tokens[2]]
        else:
            values = [ucode_to_char(value) for value in tokens[2].split(" ")]

        # create the character's record on first sight, then set the field
        dbObj.setdefault(hanChar, {})[field] = values

    # ensure_ascii=False writes the characters themselves rather than \u
    # escapes; the separators drop all optional whitespace.
    with open("data/" + database + ".json", "w", encoding="utf-8") as j:
        json.dump(dbObj, j, ensure_ascii=False, separators=(",", ":"))