### Build vocab for TKH MTH datasets

In [None]:
path = 'datasets/tkh-mth2k2'

import glob
vocab = set()

# Collect every distinct character that occurs in the first comma-separated
# field of each line of the text-line label files found anywhere under `path`.
for label_path in glob.glob(f'{path}/**/*.txt', recursive=True):
    # Only paths containing 'label_textline' are label files of interest.
    if 'label_textline' not in label_path:
        continue
    # Decode explicitly instead of relying on the platform's locale default.
    # NOTE(review): assumes the label files are UTF-8 encoded - confirm.
    with open(label_path, 'r', encoding='utf-8') as f:
        for line in f:
            # set.update() with a string adds its characters one by one.
            vocab.update(line.strip().split(',')[0])

In [None]:
# Number of distinct characters collected into `vocab`.
len(vocab)

### Build dataset files

In [1]:
import re

# Map the second tab-separated column of each ids.txt line to its third
# column, with any "[...]" span removed from the latter.
# Lines starting with '#' are skipped.
ids_dict = {}
# NOTE(review): assumes ids.txt is UTF-8 encoded - confirm.
with open('ids.txt', 'r', encoding='utf-8') as f:
    for line in f:
        if line.startswith('#'):
            continue
        fields = line.strip().split('\t')
        # Column index 2 is read below, so at least three fields are needed;
        # shorter lines are skipped rather than raising IndexError.
        if len(fields) < 3:
            continue
        ids_dict[fields[1]] = re.sub(r'\[.*\]', '', fields[2])

### Recursive IDS

In [2]:
# vocab_ids_dict
def get_full_ids(c, _active=None):
    """Recursively expand `c` into its full decomposition sequence.

    Looks `c` up in the module-level `ids_dict`. When the entry is longer
    than one character and does not contain `c` itself, every character of
    the entry is expanded in turn and the results are concatenated.
    Otherwise `c` is returned unchanged.

    Args:
        c: The character (dictionary key) to expand.
        _active: Internal. Set of characters currently being expanded higher
            up the call stack; callers should leave it at its default.

    Returns:
        The expanded sequence as a string, or `c` itself when it has no
        multi-character entry, refers to itself, or is part of a cycle.
    """
    seq = ids_dict.get(c, c)
    if len(seq) <= 1 or c in seq:
        return c

    active = set() if _active is None else _active
    if c in active:
        # Indirect cycle (c is reached again while it is still being
        # expanded): treat it as atomic instead of recursing forever.
        return c

    active.add(c)
    try:
        return ''.join(get_full_ids(cc, active) for cc in seq)
    finally:
        # Allow the same character to be expanded again in sibling branches.
        active.discard(c)

In [3]:
# Pair every key of ids_dict with its fully expanded sequence.
ids_exp_dict = dict(zip(ids_dict, map(get_full_ids, ids_dict)))

In [13]:
# Keep only the expansions that are no longer than one character.
single = [
    expansion
    for expansion in ids_exp_dict.values()
    if not len(expansion) > 1
]

In [14]:
# Write the entries of `single` to single.txt, one per line, in sorted order.
# UTF-8 is requested explicitly so the output does not depend on the
# platform's locale default encoding.
with open('single.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'{s}\n' for s in sorted(single))

In [18]:
# Write one "key<TAB>expansion" line per entry of ids_exp_dict to ids_exp.txt.
# UTF-8 is requested explicitly so the output does not depend on the
# platform's locale default encoding.
with open('ids_exp.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'{k}\t{v}\n' for k, v in ids_exp_dict.items())


In [None]:
# build vocab: the distinct symbols used across all expanded sequences
vocab_ids = {
    symbol
    for expansion in ids_exp_dict.values()
    for symbol in expansion
}

len(vocab_ids)

In [20]:
# write vocab ids full to file
# One symbol per line in sorted order. The lines are joined with '\n', so the
# file has no trailing newline. UTF-8 is requested explicitly so the output
# does not depend on the platform's locale default encoding.
with open('vocab_ids.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted(vocab_ids)))

### Build encoder

In [1]:
# load vocab ids
# Both files are opened in `with` blocks so the handles are closed promptly,
# and decoded explicitly as UTF-8 rather than with the locale default.
with open('vocab_ids.txt', 'r', encoding='utf-8') as f:
    # Split on '\n' only, so the list index of each symbol is its line number.
    base_vocab = f.read().split('\n')

# Each line of ids_exp.txt is "key<TAB>expansion"; build the key -> expansion map.
ids_dict = {}
with open('ids_exp.txt', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        ids_dict[fields[0]] = fields[1]

In [None]:
from trie_search import Trie, TrieNode

class Vocab:
    """Encode characters as id sequences and decode them back through a trie.

    Attributes set by the constructor:
        id2char: position in `base_vocab` -> symbol.
        char2id: symbol -> position (inverse of `id2char`).
        size: number of entries in `base_vocab`.
        ids_dict: character -> expanded symbol sequence (string).
        ids_dict_rev: expanded symbol sequence -> character.
        trie: `Trie` holding the encoded id sequence of every key of `ids_dict`.
    """

    def __init__(self, base_vocab, ids_dict):
        """Build the lookup tables and index every encodable character.

        Args:
            base_vocab: Sequence of symbols; a symbol's index is its id.
            ids_dict: Mapping from character to its expanded symbol string.
        """
        self.id2char = dict(enumerate(base_vocab))
        self.char2id = {symbol: index for index, symbol in self.id2char.items()}
        self.size = len(base_vocab)
        self.ids_dict = ids_dict
        self.ids_dict_rev = {expansion: char for char, expansion in ids_dict.items()}

        self.trie = Trie()
        for char in ids_dict:
            self.trie.insert(self.encode(char))

    def __len__(self):
        """Return the number of entries in the base vocabulary."""
        return self.size

    def encode(self, c):
        """Return the list of symbol ids for the expansion of character `c`.

        Raises:
            KeyError: If `c` is not in `ids_dict`, or one of its expansion
                symbols is not in `char2id`.
        """
        expansion = self.ids_dict[c]
        return [self.char2id[symbol] for symbol in expansion]

    def decode(self, ids):
        """Map an id sequence back to a character via fuzzy trie search.

        Returns the character whose expansion corresponds to the first result
        of `trie.search_fuzzy(ids, max_distance=5)`, or None when the search
        returns nothing.
        """
        # NOTE(review): assumes each search result's first element is the
        # matched id sequence - confirm against trie_search.
        matches = self.trie.search_fuzzy(ids, max_distance=5)
        if len(matches) == 0:
            return None
        best_ids = matches[0][0]
        expansion = ''.join(self.id2char[i] for i in best_ids)
        return self.ids_dict_rev[expansion]

In [3]:
# Build the encoder from the loaded symbol list and expanded-IDS mapping.
# NOTE: this rebinds `vocab`, which the first section used for a character set.
vocab = Vocab(base_vocab, ids_dict)

In [None]:
# Round-trip check: encode one character to ids, then decode the ids back.
vocab.decode(vocab.encode('閇'))