In [1]:
# Sample assignment expression used as tokenizer input further down.
text = "foo = 23 + 42 * 10"

In [2]:
# Hand-written (type, value) pairs: one pair per non-whitespace piece of
# the expression 'foo = 23 + 42 * 10'.
tokens = [
    ('NAME', 'foo'),
    ('EQ', '='),
    ('NUM', '23'),
    ('PLUS', '+'),
    ('NUM', '42'),
    ('TIMES', '*'),
    ('NUM', '10'),
]
tokens

[('NAME', 'foo'),
 ('EQ', '='),
 ('NUM', '23'),
 ('PLUS', '+'),
 ('NUM', '42'),
 ('TIMES', '*'),
 ('NUM', '10')]

In [3]:
import re

In [4]:
# One named group per token kind; the group name is what Match.lastgroup
# reports when that alternative matches.
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'  # letter/underscore, then letters/digits/underscores
NUM = r'(?P<NUM>\d+)'                       # one or more digits
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'                         # run of whitespace

# Single alternation of all token patterns, in the order written here.
master_pat = re.compile(f'{NAME}|{NUM}|{PLUS}|{TIMES}|{EQ}|{WS}')

In [5]:
# Build a scanner over a short sample string. The scanner is stateful: each
# match() call tries master_pat at the current position and, on success,
# moves past the matched text (see the growing spans in the outputs below).
scanner = master_pat.scanner('foo = 42')
scanner.match()

<re.Match object; span=(0, 3), match='foo'>

In [6]:
# `_` is IPython's last-output variable, i.e. the Match shown by the cell run
# just before this one; lastgroup is the name of the group that matched.
# NOTE: depends on execution order -- re-running cells out of sequence changes `_`.
_.lastgroup, _.group()

('NAME', 'foo')

In [7]:
# Second call: resumes at offset 3, where the 'foo' match ended.
scanner.match()

<re.Match object; span=(3, 4), match=' '>

In [8]:
# `_` is the Match displayed by the preceding cell (IPython last output);
# here it is the whitespace token. Execution-order dependent.
_.lastgroup, _.group()

('WS', ' ')

In [9]:
# Third call: continues from offset 4 and matches the '=' sign.
scanner.match()

<re.Match object; span=(4, 5), match='='>

In [10]:
# `_` is the Match displayed by the preceding cell (IPython last output);
# here it is the EQ token. Execution-order dependent.
_.lastgroup, _.group()

('EQ', '=')

In [11]:
# Fourth call: continues from offset 5 and matches the space after '='.
scanner.match()

<re.Match object; span=(5, 6), match=' '>

In [12]:
# `_` is the Match displayed by the preceding cell (IPython last output);
# here it is the second whitespace token. Execution-order dependent.
_.lastgroup, _.group()

('WS', ' ')

In [13]:
# Fifth call: continues from offset 6 and matches the number '42',
# which reaches the end of the 8-character input.
scanner.match()

<re.Match object; span=(6, 8), match='42'>

In [14]:
# `_` is the Match displayed by the preceding cell (IPython last output);
# here it is the NUM token. Execution-order dependent.
_.lastgroup, _.group()

('NUM', '42')

In [15]:
# Input is exhausted, so match() returns None and the cell shows no output.
# This None is the stop signal a tokenizing loop can test for.
scanner.match()

In [16]:
from collections import namedtuple

In [17]:
# Lightweight record for one lexical token: its kind and the matched text.
Token = namedtuple('Token', ('type', 'value'))

In [18]:
def generate_tokens(pat, text, strict=False):
    """Lazily split ``text`` into ``Token(type, value)`` records using ``pat``.

    Args:
        pat: Compiled regular expression whose alternatives are named groups
            (``(?P<NAME>...)``). The name of the group that matched becomes
            the token's ``type``.
        text: The string to tokenize.
        strict: When true, raise ``ValueError`` if scanning stops before the
            end of ``text``. The default (false) keeps the historical
            behaviour: the stream simply ends at the first position where no
            alternative matches, which makes truncated output look like a
            complete tokenization.

    Yields:
        Token: one record per successive match, in input order. Tokens found
        before an unmatched character are still yielded in strict mode; the
        error is raised afterwards.

    Raises:
        ValueError: only when ``strict`` is true and part of ``text`` could
            not be matched. The message names the offending character and
            its index.
    """
    scanner = pat.scanner(text)
    consumed = 0  # end offset of the most recent successful match
    # scanner.match() returns None as soon as nothing matches at the current
    # position; the two-argument form of iter() turns that None into the
    # end of the loop.
    for m in iter(scanner.match, None):
        consumed = m.end()
        yield Token(m.lastgroup, m.group())
    # Stopping short of len(text) means a character matched no alternative.
    if strict and consumed != len(text):
        raise ValueError(
            f'Unexpected character {text[consumed]!r} at index {consumed}'
        )

In [19]:
# Tokenize the short sample and show every token, whitespace included.
for tok in generate_tokens(master_pat, text='foo = 42'):
    print(tok)

Token(type='NAME', value='foo')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='42')


In [20]:
# Lazily drop whitespace tokens from the stream before printing.
# NOTE: `tokens` is a one-shot iterator; it is empty after this loop.
tokens = filter(lambda t: t.type != 'WS', generate_tokens(master_pat, text))

for tok in tokens:
    print(tok)

Token(type='NAME', value='foo')
Token(type='EQ', value='=')
Token(type='NUM', value='23')
Token(type='PLUS', value='+')
Token(type='NUM', value='42')
Token(type='TIMES', value='*')
Token(type='NUM', value='10')


In [21]:
# Operator patterns where one token ('<') is a prefix of another ('<=').
LE = r'(?P<LE><=)'   # two-character operator
LT = r'(?P<LT><)'    # its one-character prefix
EQ = r'(?P<EQ>=)'

In [22]:
# LE is listed before LT: regex alternation takes the first alternative that
# matches, so the longer '<=' has to be tried ahead of its prefix '<'.
master_pat = re.compile(f'{LE}|{LT}|{EQ}')

In [23]:
# Keyword pattern placed ahead of the general identifier pattern.
PRINT = r'(?P<PRINT>print)'
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'

master_pat = re.compile(f'{PRINT}|{NAME}')

# Pitfall demonstration: because PRINT is tried first and matches the leading
# 'print' of 'printer', the identifier is split into PRINT 'print' + NAME 'er'.
for tok in generate_tokens(master_pat, 'printer'):
    print(tok)

Token(type='PRINT', value='print')
Token(type='NAME', value='er')
