In [61]:
import re

In [62]:
# Token patterns: raw regex source strings, meant to be combined into larger patterns.

# Names and literals
IDENT = r"[A-Za-z_]\w*"  # identifier: a letter or underscore, then word characters
INT = r"\d+"  # integer literal: one or more digits
STRING = r'"[^"\\]*(?:\\.[^"\\]*)*"'  # double-quoted string; a backslash escapes the next character

# Operators and punctuation
ASSIGN = r"="
OP = r"[+\-*/]"  # exactly one of + - * /
LPAREN = r"\("
RPAREN = r"\)"
COMMA = r","

# Trivia
COMMENT = r"#.*"  # from '#' through the rest of the line ('.' stops at a newline)
WS = r"[ \t]+"  # a run of spaces and/or tabs

In [63]:
# re.match — the match must begin at position 0 of the string, but the
# pattern only has to cover a prefix, not the whole input.
# Use it when a token is expected right at the start (e.g., lexing the next token).
expr = 'count = 10'
matched = re.match(pattern=IDENT, string=expr)
matched[0]  # index 0 is the whole match, same as matched.group()

'count'

In [64]:
# re.fullmatch — succeeds only when the pattern consumes the entire string.
# Great for validating that a whole line is exactly one kind of statement.
# Pattern pieces are glued together with optional whitespace between them.
pattern_to_match = r'\s*'.join([IDENT, ASSIGN, INT])
bool(re.fullmatch(pattern_to_match, expr))  # a Match is truthy, None is falsy

True

In [65]:
# Counter-example: the trailing ';' is not covered by the pattern, so fullmatch fails.
expr_wrong = 'count = 10;'
bool(re.fullmatch(pattern=pattern_to_match, string=expr_wrong))

False

In [66]:
# re.search — scans the string and returns the first match, wherever it starts.
# Use it when the token's position is unknown (e.g., picking a number out of a comment string).
expr_1 = 'result = 42 + x'
re.search(pattern=INT, string=expr_1)[0]

'42'

In [67]:
# re.findall — collects every match in the string into a list.
# With no capturing groups in the pattern, each item is the matched text itself:
expr_2 = 'sum = add(x, y)'
re.findall(pattern=IDENT, string=expr_2)

['sum', 'add', 'x', 'y']

In [68]:
# With capturing groups, findall returns one tuple per match holding the
# captured pieces — here (left-hand variable, right-hand integer).
# 'c = x' is skipped because its right-hand side is not an integer.
expr_3 = 'a=1; b = 2; c = x'
pattern_to_find = rf'({IDENT})\s*=\s*({INT})'
re.findall(pattern=pattern_to_find, string=expr_3)

[('a', '1'), ('b', '2')]

In [69]:
# Only interested in the whole match? Drop the () groups and findall
# returns plain strings again instead of tuples.
pattern_to_find_all = rf'{IDENT}\s*=\s*{INT}'
re.findall(pattern=pattern_to_find_all, string=expr_3)

['a=1', 'b = 2']

In [70]:
# re.finditer — finds the same matches as findall, but yields Match objects.
# That suits tokenization: each Match carries its text plus start/end offsets.
# Note that IDENT on its own also picks up 'sum' inside the trailing comment.
code = 'total = add(10, x)  # sum'
group_objects = [*re.finditer(IDENT, code)]
for matched_object in group_objects:
    # span() is (start, end); unpacking prints "text start end"
    print(matched_object[0], *matched_object.span())

total 0 5
add 8 11
x 16 17
sum 22 25


In [84]:
# To get the captured groups back from each Match, the pattern must contain
# groups in (). Here IDENT is wrapped in a named group called 'variable'.
# Building on IDENT (instead of retyping the regex) keeps this pattern in
# sync with the identifier definition used by the other cells.
pattern_with_groups = rf'(?P<variable>{IDENT})'
group_objects = list(re.finditer(pattern_with_groups, code))
for matched_object in group_objects:
    print(matched_object.groups())     # tuple of all captured groups, in order
    print(matched_object.groupdict())  # named captures as {group name: matched text}

('total',)
{'variable': 'total'}
('add',)
{'variable': 'add'}
('x',)
{'variable': 'x'}
('sum',)
{'variable': 'sum'}
