In [61]:
import re

In [62]:
# Regex building blocks for a toy language; later cells compose them with rf-strings.
IDENT   = r"[A-Za-z_]\w*"                 # variable / function names: letter or underscore, then word chars
INT     = r"\d+"                           # integer literal
STRING  = r'"[^"\\]*(?:\\.[^"\\]*)*"'      # simple double-quoted string with escapes
ASSIGN  = r"="                             # assignment sign
OP      = r"[+\-*/]"                       # one arithmetic operator: + - * /
LPAREN  = r"\("                            # literal opening parenthesis
RPAREN  = r"\)"                            # literal closing parenthesis
COMMA   = r","                             # argument separator
COMMENT = r"#.*"                           # '#' through end of line ('.' stops at a newline unless re.DOTALL is set)
WS      = r"[ \t]+"                        # run of spaces/tabs (no newlines)

In [63]:
# re.match — the match must start at the beginning of the string, but it may cover only a prefix.
# Use when you expect a token right at the start (e.g., lexing the next token).
expr = 'count = 10'
matched = re.match(IDENT, expr)  # a Match object, or None if the string does not start with IDENT
matched.group()  # the matched text

'count'

In [64]:
# re.fullmatch — the pattern must match the entire string
# Great for validating a whole line is exactly one kind of statement.
pattern_to_match = rf'{IDENT}\s*{ASSIGN}\s*{INT}'  # IDENT = INT, optional whitespace around '='
re.fullmatch(pattern_to_match, expr) is not None  # `expr` comes from the re.match cell above

True

In [65]:
# Counter-example: the trailing ';' is not covered by the pattern, so fullmatch
# rejects the string even though a prefix of it would match.
expr_wrong = 'count = 10;'
re.fullmatch(pattern_to_match, expr_wrong) is not None

False

In [66]:
# re.search — find the first occurrence anywhere
# Use when the token may appear anywhere (e.g., scan a comment string for a number).
expr_1 = 'result = 42 + x'
re.search(INT, expr_1).group()  # search returns None when nothing matches, so .group() would then raise AttributeError

'42'

In [67]:
# re.findall — return all matches as a list of strings/tuples
# If the pattern has no capturing groups, you get a list of strings
# (one whole-match string per occurrence, in order of appearance):
expr_2 = 'sum = add(x, y)'
re.findall(IDENT, expr_2)

['sum', 'add', 'x', 'y']

In [68]:
# If the pattern has capturing groups, you get tuples of captured pieces:
# group 1 captures the left-hand variable, group 2 the right-hand integer.
pattern_to_find = rf'({IDENT})\s*=\s*({INT})'
expr_3 = 'a=1; b = 2; c = x'  # 'c = x' is not reported: its right-hand side is not an INT
re.findall(pattern_to_find, expr_3)

[('a', '1'), ('b', '2')]

In [69]:
# If you only care about the whole match, drop the parentheses: with no capturing
# groups, findall returns plain strings instead of tuples.
pattern_to_find_all = rf'{IDENT}\s*=\s*{INT}'
re.findall(pattern_to_find_all, expr_3)

['a=1', 'b = 2']

In [70]:
# re.finditer — like findall but yields Match objects
# This is perfect for tokenization because you get positions and names.
# Note: IDENT alone also matches words inside the trailing comment ('sum').
code = 'total = add(10, x)  # sum'
group_objects = list(re.finditer(IDENT, code))  # materialise the iterator so it can be looped over
for matched_object in group_objects:
    # matched text, start index, end index (end is exclusive)
    print(matched_object.group(), matched_object.start(), matched_object.end())

total 0 5
add 8 11
x 16 17
sum 22 25


In [84]:
# finditer with a capturing group: each Match exposes the captured pieces.
# groups() returns a tuple of all captured groups; groupdict() maps each named
# group ((?P<name>...)) to its captured text.
# `code` is the sample line defined in the finditer cell above.
pattern_with_groups = rf'(?P<variable>{IDENT})'  # reuse IDENT instead of repeating its regex
group_objects = list(re.finditer(pattern_with_groups, code))
for matched_object in group_objects:
    print(matched_object.groups())
    print(matched_object.groupdict())  # named captures

('total',)
{'variable': 'total'}
('add',)
{'variable': 'add'}
('x',)
{'variable': 'x'}
('sum',)
{'variable': 'sum'}


In [85]:
# re.sub and re.subn — rewrite code
# Rename identifiers: the replacement can be a function that receives each Match
# and returns the text to put in its place.
pattern_to_sub = rf'\b{IDENT}\b'

def replacement(m):
    """Return 'bar' for the identifier 'foo'; leave any other identifier as it is."""
    matched_name = m.group()
    if matched_name == 'foo':
        return 'bar'
    return matched_name

code = 'foo=1; foo+2'
re.sub(pattern_to_sub, replacement, code)

'bar=1; bar+2'

In [86]:
# Reformat function calls into a different syntax
# \g<fn> and \g<args> in the replacement refer to the named groups of `call`.
call = rf'(?P<fn>{IDENT})\s*\(\s*(?P<args>[^)]*)\)'  # args = everything up to the first ')'
code = 'sum(a, b)'
re.sub(call, r'\g<fn>[ \g<args>]', code)

'sum[ a, b]'

In [87]:
# re.subn — same substitution as re.sub, but returns a (new_string, number_of_replacements) tuple
code = 'x=1+2+300'
re.subn(INT, 'NUM', code)

('x=NUM+NUM+NUM', 3)

In [89]:
# re.split — split by token delimiters
abc = 'a, b, c'
pattern_to_split = rf'\s*{COMMA}\s*'  # the comma plus any whitespace around it is consumed as the delimiter
re.split(pattern_to_split, abc)

['a', 'b', 'c']

In [90]:
# re.compile — pre-compile for speed & reuse
# The ^...$ anchors are redundant with .fullmatch() below, which already requires
# the whole string to match; they would only matter with .search() or .match().
ASSIGN_STMT = re.compile(rf"^(?P<lhs>{IDENT})\s*=\s*(?P<rhs>{IDENT}|{INT}|{STRING})$")
m = ASSIGN_STMT.fullmatch('msg = "hello"')
m.groupdict()  # {group name: captured text}

{'lhs': 'msg', 'rhs': '"hello"'}

In [91]:
# Useful flags
# re.IGNORECASE (re.I) — case-insensitive identifiers (if your language allows)
# re.MULTILINE (re.M) — ^ and $ match line starts/ends within a block
# re.DOTALL (re.S) — . matches newlines (helpful in string/comment blocks)
code = '# a\nx=1\n# b'
# Allow only spaces/tabs before '#'. \s also matches '\n', so under re.M a
# pattern starting with ^\s* could begin on a blank line and pull the newline(s)
# before an indented comment into the match.
pattern_to_match = r'^[ \t]*#.*$'
flags = re.M
re.findall(pattern_to_match, code, flags)

['# a', '# b']

## Mini Tasks with Solutions
#### A. Validate a Simple Assignment

In [92]:
# Whole-line rule: <identifier> = <string | integer | identifier>, with optional
# whitespace around '='.
ASSIGN_STMT = rf"^(?P<lhs>{IDENT})\s*=\s*(?P<rhs>{STRING}|{INT}|{IDENT})$"

# Three valid assignments followed by two invalid ones
# (identifier starting with a digit, single-quoted string).
tests = [
    "x=10",
    'title = "Report"',
    "a = b",
    "1x = 5",
    "name = 'oops'",
]

for t in tests:
    is_valid = re.fullmatch(ASSIGN_STMT, t) is not None
    print(t, is_valid)

x=10 True
title = "Report" True
a = b True
1x = 5 False
name = 'oops' False


#### B. Find all function calls and their argument lists

In [95]:
# A call is an identifier followed by a parenthesised argument list.
# The argument text is a sequence of whole string literals and any other
# characters except ')' and '"', so a ')' or ',' inside a quoted string neither
# ends the call nor splits an argument. Nested calls are still not handled:
# the argument list ends at the first ')' outside a string.
CALL = rf'(?P<fn>{IDENT})\s*\(\s*(?P<args>(?:{STRING}|[^)"])*)\)'
# One argument: a run of string literals and non-comma, non-quote characters.
ARG = rf'(?:{STRING}|[^,"])+'
code = 'print("ok"); add(1,2); f ( x , y )'
for mo in re.finditer(CALL, code):
    # strip surrounding whitespace and drop empty pieces (e.g. for "f()")
    args = [a.strip() for a in re.findall(ARG, mo.group('args')) if a.strip()]
    print(mo.group('fn'), "->", args)

print -> ['"ok"']
add -> ['1', '2']
f -> ['x', 'y']


#### C. Strip comments (but keep strings intact)

In [96]:
# Matches either a double-quoted string literal or a comment. Matches are found
# left to right, so a string's opening quote is reached before any '#' inside it
# and the whole literal is consumed as one STRING match.
LINE = rf'{STRING}|{COMMENT}'
def strip_comment(line: str) -> str:
    """Return `line` without its trailing `#` comment, right-stripped.

    A '#' inside a double-quoted string (as matched by STRING) is kept.
    Single-quoted text is not recognised as a string by this pattern.

    Args:
        line: one line of source text.

    Returns:
        The text before the first real comment with trailing whitespace
        removed, or the right-stripped line if it has no comment.
    """
    for mo in re.finditer(LINE, line):
        if mo.group().startswith("#"):
            # first real comment: keep everything before it, then stop
            return line[:mo.start()].rstrip()
        # otherwise the match is a string literal: nothing to cut, keep scanning
    return line.rstrip()

strip_comment('msg = "# not a comment"  # real comment')


'msg = "# not a comment"'

#### D. Rename a variable safely using word boundaries

In [98]:
def rename_var(src: str, old: str, new: str) -> str:
    """Replace every standalone occurrence of the identifier `old` in `src` with `new`.

    `old` is escaped and wrapped in word boundaries, so it is matched literally
    and never as part of a longer word ('foo' does not touch 'foobar').
    The \\b anchors assume `old` starts and ends with a word character, as
    identifiers do.

    Args:
        src: source text to rewrite.
        old: identifier to replace.
        new: replacement text, inserted exactly as given.

    Returns:
        The rewritten text.
    """
    pat = rf"\b{re.escape(old)}\b"
    print(pat)  # show the generated pattern (part of this demo's output)
    # A callable replacement inserts `new` verbatim; a plain string would be
    # treated as a template in which backslashes, \1 and \g<...> are interpreted.
    return re.sub(pat, lambda _match: new, src)

rename_var("foo=1; foobar=2; foo + bar", "foo", "n")


\bfoo\b


'n=1; foobar=2; n + bar'

#### E. Extract all identifiers not in comments/strings

In [99]:
# STRING and COMMENT are matched as whole tokens so that words inside them are
# never seen as identifiers. The \b keeps IDENT from starting in the middle of
# a word, e.g. the 'e5' of '1e5' or the 'x' of '1x'.
TOKEN = rf"{STRING}|{COMMENT}|\b{IDENT}"
def ids_no_strings_comments(code: str):
    """Return the identifiers in `code` that are outside strings and comments.

    Args:
        code: source text; may span several lines.

    Returns:
        A list of identifier strings in order of appearance (duplicates kept).
    """
    return [
        mo.group()
        for mo in re.finditer(TOKEN, code)
        # comment tokens start with '#', string tokens with '"': skip both
        if not mo.group().startswith(("#", '"'))
    ]

ids_no_strings_comments('print("x # y"); # comment\nvalue=10')

['print', 'value']

In [None]:
'''
Quick “when to use what” cheat-sheet

Method	                Best for	                                    Returns
match	                token at start of string	                    Match or None
fullmatch	            validate whole string equals a rule	            Match or None
search	                first occurrence anywhere	                    Match or None
findall	                all occurrences (text or tuples if groups)	    list
finditer	            all occurrences with positions & groups	        iterator of Match
sub	                    rewrite strings (rename, reformat)	            new string
subn	                like sub + count	                            (new_string, count)
split	                split on token delimiters	                    list
compile	                precompile pattern (speed, named groups, reuse)	Pattern object
10) Small practice set (answers below)

Validate lines that are only an identifier:

_ok, _1, a9    |    9a, -x, ""


Find all assignments and return (lhs, rhs) pairs from:

x=1; y = x; title="Hi"; bad = 'nope'


Extract function names from calls:

print("a"); add(1,2); _mix42 ( x )


Replace foo with bar only when it’s a standalone identifier in:

foo=1; foobar=2; foo_x=3; foo + 4


Strip trailing comments but keep # inside strings:

'a#b'  # comment

Answer key (sketch)

re.fullmatch(IDENT, s) → True for _ok, _1, a9; False for 9a, -x, "".

re.findall(rf"({IDENT})\s*=\s*({STRING}|{INT}|{IDENT})", src)
→ [('x','1'), ('y','x'), ('title','"Hi"')]

re.findall(rf"{IDENT}\s*\(", src) and strip the trailing "(" (and any whitespace) from each match, or better, capture the name in a named group:

[mo.group('fn') for mo in re.finditer(rf"(?P<fn>{IDENT})\s*\(", src)]
# ['print','add','_mix42']


re.sub(r"\bfoo\b", "bar", src) → bar=1; foobar=2; foo_x=3; bar + 4

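Function strip_comment above only treats double-quoted text as a string, so for the
single-quoted input it cuts at the first #: result "'a".
Written with double quotes ("a#b"  # comment), the result is '"a#b"'.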
'''