In [1]:
class TrieNode:
    def __init__(self):
        self.children = {}
        self.token_id = None
        self.token = None
        self.child_token_count = 0

class Trie:
    def __init__(self):
        self.root = TrieNode()

    def insert(self, token, token_id):
        node = self.root
        for char in token:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
            node.child_token_count += 1
        node.token_id = token_id
        node.token = token
        node.child_token_count += 1

    def find_biggest_prefix(self, min_occurence):
        prefixes = {}
        self.find_biggest_prefix_for_node(self.root, prefixes)
        prefixes = {k:v for k, v in prefixes.items() if v > min_occurence}
        prefixes = dict(sorted(prefixes.items(), key=lambda item: item[1], reverse=True))
        return prefixes

    def find_biggest_prefix_for_node(self, node, prefixes):
        for child in node.children.values():
            prefixes[child.token] = child.child_token_count
            self.find_biggest_prefix_for_node(child, prefixes)

In [8]:
import json

with open('data/vocab.json', 'r', encoding='utf-8') as f: tokenizer_config = json.load(f)

suffix = True

trie = Trie()
for k, v in tokenizer_config.items():
    trie.insert(k[::-1] if suffix else k, v)

In [9]:
common_prefixes = trie.find_biggest_prefix(200)
common_prefixes = {(k[::-1] if suffix else k):v for k, v in common_prefixes.items() if len(k) > 0}
common_prefixes

{'s': 7297,
 'e': 5271,
 'd': 4025,
 'n': 3856,
 't': 3720,
 'r': 3038,
 'y': 3014,
 'g': 2747,
 'ed': 2580,
 'l': 2377,
 'ng': 2282,
 'ing': 2129,
 'es': 1842,
 'er': 1802,
 'on': 1709,
 'h': 1106,
 'ion': 1102,
 'a': 1086,
 'al': 1085,
 'm': 1040,
 'nt': 933,
 'ly': 898,
 'c': 883,
 'ts': 881,
 'tion': 877,
 'rs': 860,
 'ns': 794,
 'le': 778,
 'o': 726,
 'k': 710,
 'p': 691,
 'ted': 639,
 'st': 613,
 'an': 602,
 'ers': 592,
 'ent': 585,
 're': 550,
 'te': 544,
 'ce': 519,
 'ic': 510,
 'ation': 503,
 'en': 492,
 'i': 489,
 'ry': 486,
 'ty': 470,
 'in': 459,
 'ons': 451,
 'or': 446,
 've': 433,
 'ting': 422,
 'nd': 419,
 'ne': 414,
 'us': 393,
 'se': 386,
 'ss': 380,
 'b': 366,
 'u': 362,
 'ions': 360,
 'ter': 354,
 'ar': 337,
 'll': 328,
 'w': 328,
 'ies': 327,
 'et': 318,
 'ge': 312,
 'ate': 308,
 'ity': 305,
 'f': 303,
 'ch': 299,
 'ble': 298,
 'ive': 296,
 'nce': 296,
 'el': 296,
 'ds': 296,
 'red': 287,
 '0': 286,
 'ls': 284,
 'tions': 283,
 'ct': 283,
 'th': 274,
 'it': 273,
 'll