In [1]:
from dataclasses import dataclass
from typing import List, Callable, Dict


In [2]:

@dataclass
class Token:
    '''A single token: a <type> tag paired with the <value> text it carries.

    Attributes:
        type (str): The token type.
        value (str): The token value.
    '''
    type: str
    value: str
    # NOTE: a third field, `target: str = ''`, was sketched here but is disabled.

    def __repr__(self):
        # Rendered as (<type>: "<value>").
        return f'({self.type}: "{self.value}")'


@dataclass
class Block:
    '''Class representing a block with an <id> and <content>

    Two blocks compare equal when their ids are equal; <content> takes no
    part in the comparison.

    Attributes:
        id (str): A unique block id, likely a hash of its content.
        content (List[Token]): The block's content; a list of tokens.
    '''
    id: str
    content: List[Token]

    def __eq__(self, other) -> bool:
        # Defer to Python's fallback for non-Block operands instead of
        # failing with AttributeError on a missing `.id` (e.g. `block == None`).
        if not isinstance(other, Block):
            return NotImplemented
        return self.id == other.id

    def __repr__(self):
        # Rendered as "<id>: <token values joined by single spaces>".
        return f'{self.id}: ' + ' '.join(tok.value for tok in self.content)


In [6]:
# Show the custom reprs: a lone Token, then a Block wrapping an equal Token.
print(
    Token('REF', 'George Washington'),
    Block('<some id>', [Token('REF', 'George Washington')]),
    sep='\n',
)

(REF: "George Washington")
<some id>: George Washington


In [7]:
import re

def _find_next_group(text, groups):
    '''Locate the earliest group in <text> that has both an opener and a closer.

    Args:
        text (str): The text to scan.
        groups (dict): Maps a group name to a dict holding its 'open' and
            'close' regex patterns.

    Returns:
        A (start, end, name) tuple with offsets relative to <text>, or None
        when no group both opens and closes inside <text>.
    '''
    openers = []
    for gname, patterns in groups.items():
        opened = re.search(patterns['open'], text)
        if opened:
            openers.append((opened.start(), gname))

    # sorted() is stable, so groups opening at the same offset keep their
    # order in <groups>: the first one listed wins.
    for start, gname in sorted(openers, key=lambda o: o[0]):
        # The closer is searched from the opener's first character onwards.
        closed = re.search(groups[gname]['close'], text[start:])
        if closed:
            return start, start + closed.end(), gname
        # No closer found: this opener is unterminated, try the next one.
    return None


def tokenize(text, groups):
    '''Split <text> into a flat list of tokens.

    Text covered by a group (from its 'open' match through its 'close' match)
    becomes a Token typed with the group's name; everything in between
    becomes a 'TXT' Token. Empty 'TXT' tokens are never emitted. An opener
    that has no closer is not treated as a group and stays part of the
    surrounding text.

    Args:
        text (str): The text to tokenize.
        groups (dict): Maps a group name to a dict holding its 'open' and
            'close' regex patterns.

    Returns:
        list: The tokens in order of appearance.

    Raises:
        ValueError: If a group matches without consuming any text, which
            would otherwise make the scan loop forever.
    '''
    tokens = []

    index = 0
    while True:
        remaining = text[index:]
        found = _find_next_group(remaining, groups)

        if found is None:
            # Nothing left to group: the tail (if any) is plain text.
            if remaining:
                tokens.append(Token('TXT', remaining))
            break

        start, end, gname = found
        if end == 0:
            # Opener and closer both matched the empty string at the current
            # position, so the scan cannot advance.
            raise ValueError(
                "group {0!r} matched without consuming any text".format(gname))

        if start > 0:
            tokens.append(Token('TXT', remaining[:start]))
        tokens.append(Token(gname, remaining[start:end]))
        index += end

    return tokens


In [9]:
# Delimiter patterns per token type: 'open' starts a group, 'close' ends it.
groups = dict(
    # [[...]] references.
    ref=dict(open=r'\[\[', close=r'\]\]'),
    # A backslash up to the last character before whitespace or end of text.
    keyword=dict(open=r'\\', close=r'.(?=\s|$)'),
)
text = r'good \bye [[moon]]! \yay'
tokenize(text, groups)

[(TXT: "good "),
 (keyword: "\bye"),
 (TXT: " "),
 (ref: "[[moon]]"),
 (TXT: "! "),
 (keyword: "\yay")]