In [1]:
from pathlib import Path
import numpy as np
from tqdm import tqdm
import pickle
import os

root_dir = Path(os.path.abspath("__file__")).parent
dataset_dir = root_dir / "data"
ids_file_dir = dataset_dir / "ids_mod.tsv"
dic_dir = dataset_dir / "cedict_ts.u8"
pickle_dir = root_dir / "pickle"

print(f"IDS path\t: {ids_file_dir}")
print(f"Dictionary path\t: {dic_dir}")

IDS path	: /home/shane/workspace/radical-clustering/data/ids_mod.tsv
Dictionary path	: /home/shane/workspace/radical-clustering/data/cedict_ts.u8


In [2]:
# # ⿰ 	U+2FF0 	Ideographic description character left to right
# # ⿱ 	U+2FF1 	Ideographic description character above to below
# # ⿲ 	U+2FF2 	Ideographic description character left to middle and right
# # ⿳ 	U+2FF3 	Ideographic description character above to middle and below
# # ⿴ 	U+2FF4 	Ideographic description character full surround
# # ⿵ 	U+2FF5 	Ideographic description character surround from above
# # ⿶ 	U+2FF6 	Ideographic description character surround from below
# # ⿷ 	U+2FF7 	Ideographic description character surround from left
# # ⿸ 	U+2FF8 	Ideographic description character surround from upper left
# # ⿹ 	U+2FF9 	Ideographic description character surround from upper right
# # ⿺ 	U+2FFA 	Ideographic description character surround from lower left
# # ⿻ 	U+2FFB 	Ideographic description character overlaid

tokens = [
    ['U+2FF0', '⿰', '⿰'],
    ['U+2FF1', '⿱', '⿱'],
    ['U+2FF2', '⿲', '⿲'],
    ['U+2FF3', '⿳', '⿳'],
    ['U+2FF4', '⿴', '⿴'],
    ['U+2FF5', '⿵', '⿵'],
    ['U+2FF6', '⿶', '⿶'],
    ['U+2FF7', '⿷', '⿷'],
    ['U+2FF8', '⿸', '⿸'],
    ['U+2FF9', '⿹', '⿹'],
    ['U+2FFA', '⿺', '⿺'],
    ['U+2FFB', '⿻', '⿻'],
]

tokens = np.array(tokens, dtype='object')

ids_file = open(ids_file_dir, 'r')
ids_file = ids_file.readlines()[2:]
ids_file = [row[:-1].split('\t') for row in ids_file]

temp_ids_file = []
for row in ids_file:
    temp_ids_file.append(row[:3])
ids_file = np.array(temp_ids_file, dtype='object')

tokens[:,0] = list(map(lambda x: '0x' + x[2:].lower(), tokens[:,0]))
ids_file[:,0] = list(map(lambda x: '0x' + x[2:].lower(), ids_file[:,0]))

atoms = []
nonatoms = []
for row in ids_file:
    if len(set(row[2]) & set(tokens[:,2])) == 0:
        atoms.append(row)
    else:
        row[2] = row[2].split('[')[0]
        nonatoms.append(row)
atoms = np.array(atoms)
atoms = np.concatenate((atoms, tokens), axis=0)
nonatoms = np.array(nonatoms)

with open(pickle_dir / 'atoms.pickle', 'wb') as f:
    pickle.dump(atoms, f)

with open(pickle_dir / 'nonatoms.pickle', 'wb') as f:
    pickle.dump(nonatoms, f)

print(f"atoms: {len(atoms)}")
print(f"nonatoms: {len(nonatoms)}")

atoms

atoms: 469
nonatoms: 88480


array([['0x3b1', 'α', 'α'],
       ['0x2113', 'ℓ', 'ℓ'],
       ['0x2460', '①', '①'],
       ...,
       ['0x2ff9', '⿹', '⿹'],
       ['0x2ffa', '⿺', '⿺'],
       ['0x2ffb', '⿻', '⿻']], dtype=object)

In [3]:
def decompose(sequence):
    decomposed_sequence = ""
    for c in sequence:
        decomposed = False
        for row in nonatoms:
            if c == row[1]:
                decomposed_sequence += decompose(row[2])
                decomposed = True
                break
        if not decomposed:
            decomposed_sequence += c
    return decomposed_sequence

for i in tqdm(range(len(nonatoms))):
    nonatoms[i][2] = decompose(nonatoms[i][2])

  0%|          | 82/88480 [00:05<1:38:19, 14.98it/s]

In [None]:
with open(pickle_dir / 'nonatoms_decomposed.pickle', 'wb') as f:
    pickle.dump(nonatoms, f)

In [None]:
# with open('atoms.pickle', 'wb') as f:
#     pickle.dump(atoms, f)

In [None]:
# with open('decomposed.pickle', 'rb') as f:
#     decomposed = pickle.load(f)