In [2]:
from pprint import pprint, pformat
import re

In [48]:
# Negative lookahead demo: accept only strings that do NOT contain "cat".
# `(?!.*cat)` fails as soon as "cat" can be found anywhere ahead of the
# start of the string; when it succeeds, the trailing `.*` consumes the text.
pattern = r'^(?!.*cat).*'

lines = ["dog", "my cat", "catfish", "dog and cat"]

# Expected results:
#   re.match(pattern, "dog")     -> match object
#   re.match(pattern, "my cat")  -> None

for line in lines:
    has_no_cat = re.match(pattern, line) is not None
    print(f"Matched: {line}" if has_no_cat else f"No match: {line}")

Matched: dog
No match: my cat
No match: catfish
No match: dog and cat


In [49]:
# Negative lookbehind demo: find a "cat" that is NOT immediately preceded by "dog ".
# (?<!...) is a zero-width assertion and captures nothing, so (cat) is group 1.
pattern_negative_lookbehind = r'(?<!dog )(cat)'

test_strings = [
    "dog cat",      # no match: "cat" directly follows "dog "
    "my cat",       # match: preceded by "my "
    "catfish",      # match: nothing precedes "cat"
    "dog and cat",  # match: "and " sits between "dog " and "cat"
]

for s in test_strings:
    m = re.search(pattern_negative_lookbehind, s)
    if m is None:
        print(f"No match: {s}")
        continue
    print(f"Matched: {s}")
    # one output line per captured group, numbered from 1
    for i, group in enumerate(m.groups(), start=1):
        print(f"   Group {i}: {group}")

No match: dog cat
Matched: my cat
   Group 1: cat
Matched: catfish
   Group 1: cat
Matched: dog and cat
   Group 1: cat


In [50]:
# Non-capturing group demo: (?:...) groups the "cat|dog" alternatives without
# creating a numbered group, so m.groups() only holds the two (.*) parts
# (the text before and after the matched word).
non_capture_pattern = r'(.*)(?:cat|dog)(.*)'

lines = ["dog", "my cat", "catfish", "dog and cat"]

for text in lines:
    m = re.search(non_capture_pattern, text)
    if not m:
        print(f"No match: {text}")
        continue
    print(f"Matched: {text}")
    # one output line per captured group, numbered from 1
    for i, group in enumerate(m.groups(), start=1):
        print(f"    Group {i}: {group}")

Matched: dog
    Group 1: 
    Group 2: 
Matched: my cat
    Group 1: my 
    Group 2: 
Matched: catfish
    Group 1: 
    Group 2: fish
Matched: dog and cat
    Group 1: dog and 
    Group 2: 


In [None]:
# Pull the priority digit (1-5) out of a "(P<n>)" tag; letter case is ignored.
p = re.compile(r"\(p([1-5])\)", re.IGNORECASE)

s = "my request leve (P3)"

m = p.search(s)
if m is not None:
    print("matched")
    pprint(m.groups())

In [None]:
# catch string in quotes, including escaped quotes
# https://stackoverflow.com/questions/249791/regex-for-quoted-string-with-escaping-quotes
lines = [
     r'''child_window(title="hello \"world\"", value="hello \"world\"", control_type="Document")''',
     r'''child_window(
         title="multi line\rhello \"world\"\r", 
         value="multi line\rhello \"world\"\r", 
         control_type="Document"
     )''',
]

# Plain English: two quotes surrounding zero or more of "any character that is
# not a quote or a backslash" or "a backslash followed by any character".
# (?:...) is a passive (non-capturing) group, so group 1 is the whole title value.
# The pattern does not depend on the line, so it is compiled once, before the loop.
p = re.compile(r'title="((?:[^"\\]|\\.)*)"', re.IGNORECASE)

for line in lines:
    # get the title value
    if m := p.search(line):
        print("-----------------------------------")
        print(f"original: {line}")
        print("matched")
        # `pprint` here is the function itself (imported with
        # `from pprint import pprint`), so it is called directly;
        # `pprint.pprint(...)` would raise AttributeError.
        pprint(m.groups())
        title = m.group(1)
        print(f"title: {title}")

        print("substitue escaped quotes with single quote in title")
        new_title = title.replace(r'\"', "'")
        print(f"new title: {new_title}")

        print("replace old title with new title in original line, without changing anything else")
        # str.replace is not regex-based, so the quotes need no escaping here,
        # and only the exact `title="..."` text is rewritten (value=... is untouched).
        new_line = line.replace(f'title="{title}"', f'title="{new_title}"')
        print(f"new line: {new_line}")
        
    


-----------------------------------
original: child_window(title="hello \"world\"", value="hello \"world\"", control_type="Document")
matched
('hello \\"world\\"',)
title: hello \"world\"
substitue escaped quotes with single quote in title
new title: hello 'world'
replace old title with new title in original line, without changing anything else
new line: child_window(title="hello 'world'", value="hello \"world\"", control_type="Document")
-----------------------------------
original: child_window(
         title="multi line\rhello \"world\"\r", 
         value="multi line\rhello \"world\"\r", 
         control_type="Document"
     )
matched
('multi line\\rhello \\"world\\"\\r',)
title: multi line\rhello \"world\"\r
substitue escaped quotes with single quote in title
new title: multi line\rhello 'world'\r
replace old title with new title in original line, without changing anything else
new line: child_window(
         title="multi line\rhello 'world'\r", 
         value="multi line\rhel

In [None]:
# match groups enclosed in single or double quotes, with escaped single or double quotes inside
# https://stackoverflow.com/questions/249791/regex-for-quoted-string-with-escaping-quotes

tests = [
    # # double quotes
    r'''before "hello \"world\"" behind''',

    r'''before "multi line double quote
    hello \"world\"" behind''',

    # single quotes
    r"""'hello \'world\''""",

    r"""'multi line single quote
    hello \'world\''""",

    # multiple quoted strings, mixed single and double, multiline
    r'''"hello \"world\"",'one \'up\'', 'two \'down\'' ''',

    r'''
    "hello \"world\"",
    'one \'up\'',
    'two 
    \'down\''
    ''',
]

# Group 1: the opening quote character (" or ').
# Group 2: the body plus the closing quote, so group1 + group2 is the full
#          quoted string.
# Body = any number of either
#   (?!\1)[^\\]  one character that is neither the opening quote nor a backslash
#   \\.          a backslash followed by any character (re.DOTALL lets that
#                character be a newline)
# A backreference cannot be used inside a character class: there, `\1` is the
# octal escape for chr(1). That is why the "not the opening quote" test is
# written as the negative lookahead (?!\1) instead of [^\1...].
p = re.compile(r'''(["'])((?:(?!\1)[^\\]|\\.)*\1)''',
               re.MULTILINE | re.DOTALL)

for t in tests:
    print("===================================")
    print(f"original: {t}")
    if m := p.findall(t): # match repeating non-overlapping patterns
        print("matched")
        print(f"m = {pformat(m)}")
        # each item is a (opening_quote, body_with_closing_quote) tuple
        for g in m:
            print(f"enclosed_stirng={g[0]}{g[1]}")
    else:
        print("no match")
    print()


original: before "hello \"world\"" behind
matched
m = [('"', 'hello \\"world\\""')]
enclosed_stirng="hello \"world\""

original: before "multi line double quote
    hello \"world\"" behind
matched
m = [('"', 'multi line double quote\n    hello \\"world\\""')]
enclosed_stirng="multi line double quote
    hello \"world\""

original: 'hello \'world\''
matched
m = [("'", "hello \\'world\\''")]
enclosed_stirng='hello \'world\''

original: 'multi line single quote
    hello \'world\''
matched
m = [("'", "multi line single quote\n    hello \\'world\\''")]
enclosed_stirng='multi line single quote
    hello \'world\''

original: "hello \"world\"",'one \'up\'', 'two \'down\'' 
matched
m = [('"', 'hello \\"world\\""'), ("'", "one \\'up\\''"), ("'", "two \\'down\\''")]
enclosed_stirng="hello \"world\""
enclosed_stirng='one \'up\''
enclosed_stirng='two \'down\''

original: 
    "hello \"world\"",
    'one \'up\'',
    'two 
    \'down\''
    
matched
m = [('"', 'hello \\"world\\""'),
 ("'", "one \\

In [None]:
# re.findall() demo: collect every run of digits from single-line and
# multi-line input.
compiled = re.compile(r"\d+", re.MULTILINE | re.DOTALL)

tests = [
    "Number 123 and 456.",
    '''
    Number 123
    and 456.
    ''',
]

for t in tests:
    print("-----------------------------------")
    print(f"string={t}")
    m = compiled.findall(t)
    if not m:
        print("no match")
    else:
        # findall returns a plain list of the matched substrings
        print(f"m = {pformat(m)}")
        for g in m:
            print(f"number={g}")
    print()

-----------------------------------
string=Number 123 and 456.
m = ['123', '456']
number=123
number=456

-----------------------------------
string=
    Number 123
    and 456.
    
m = ['123', '456']
number=123
number=456



In [47]:
# my old way to parse repeated pattern - by removing the matched part and re-matching the rest
#
# Step 1: match the leading "<type>=" and keep everything after it.
# Step 2: repeatedly peel "<path> , <next type>=" off the front of the rest;
#         whatever is left when step 2 stops matching is the last path.

string = r'''xpath=//input[@id="user id"],
            css=#user\ id,
            xpath=//tr[class="non exist"]
         '''

# `\s*` matches any run of whitespace ("\n", "\r\n", blanks) with a single
# unambiguous token; nesting alternatives that overlap (e.g. "\n" vs "\s")
# under a `*` gives the regex engine many equivalent ways to backtrack.
locator_compiled_path1 = re.compile(r'\s*(css|xpath)=(.+)', re.MULTILINE|re.DOTALL)
locator_compiled_path2 = re.compile(r'(.+?)\s*,\s*(css|xpath)=', re.MULTILINE|re.DOTALL)

if m1:=locator_compiled_path1.match(string):
    # `locator_type` rather than `type`: assigning to `type` would shadow the
    # builtin for every cell executed afterwards in the same kernel.
    locator_type, path_string = m1.groups()
    print(f'extracted {locator_type}, {path_string}')

    type_paths = []

    while m2:=locator_compiled_path2.match(path_string):
        # `path` belongs to the current type; `next_type` introduces the next path
        path, next_type = m2.groups()
        type_paths.append([locator_type, path])
        print(f'added {locator_type}, {path}')

        locator_type = next_type

        # drop the consumed "<path>, <type>=" prefix and match again
        pos = m2.end()
        path_string = path_string[pos:]

    # no further ", <type>=" separator: the remainder is the final path
    # (trailing whitespace of the input is kept as-is)
    type_paths.append([locator_type, path_string])
    print(f'at last added {locator_type}, {path_string}')

    print(f'{pformat(type_paths)}')

extracted xpath, //input[@id="user id"],
            css=#user\ id,
            xpath=//tr[class="non exist"]
         
added xpath, //input[@id="user id"]
added css, #user\ id
at last added xpath, //tr[class="non exist"]
         
[['xpath', '//input[@id="user id"]'],
 ['css', '#user\\ id'],
 ['xpath', '//tr[class="non exist"]\n         ']]


In [7]:
# parse key=value.
# challenge: the delimiter between key=value pairs may vary.
tests = [
    b'11=ABC123,35=D,54=1',
    b'11=ABC123|35=D|54=1',
    b'11=ABC123;35=D;54=1',
    b'11=ABC123 35=D 54=1',
]

# (?P<name>...) is a named capturing group
# (?P=name) is a backreference to the named capturing group
#
# A delimiter candidate is 1-2 bytes that are not a digit, an ASCII letter
# (a-z, A-Z), "_" or "-". It only counts when it is followed by
# "<digits>=<value>" and then by the very same delimiter again, i.e. the
# delimiter is detected around a pair that has a pair on both sides.
pattern = rb'(?P<delimiter>[^0-9a-zA-Z_-]{1,2})\d+=[^=]+?(?P=delimiter)'
compiled = re.compile(pattern)

for t in tests:
    print("===================================")
    print(f"original: {t}")

    # first find the delimiter
    if m := compiled.search(t):
        print("matched")
        delimiter = m.group('delimiter')
        print(f"delimiter: {delimiter}")

        # now split by the delimiter
        pairs = t.split(delimiter)
        for pair in pairs:
            print(f"pair: {pair}")
    else:
        print("no match")
    

original: b'11=ABC123,35=D,54=1'
matched
delimiter: b','
pair: b'11=ABC123'
pair: b'35=D'
pair: b'54=1'
original: b'11=ABC123|35=D|54=1'
matched
delimiter: b'|'
pair: b'11=ABC123'
pair: b'35=D'
pair: b'54=1'
original: b'11=ABC123;35=D;54=1'
matched
delimiter: b';'
pair: b'11=ABC123'
pair: b'35=D'
pair: b'54=1'
original: b'11=ABC123 35=D 54=1'
matched
delimiter: b' '
pair: b'11=ABC123'
pair: b'35=D'
pair: b'54=1'


In [16]:
# Substitution demo: turn "blue dog" / "blue cat" into "gray ..." while keeping
# the captured animal. "blue hats" is left alone because it is not in the group.

s1 = "the blue dog and blue cat wore blue hats"

# 1) two steps: compile, then substitute; \1 re-inserts the captured animal
pattern = r"blue (dog|cat)"
replacement = r"gray \1"

compiled_pattern = re.compile(pattern)
s2 = compiled_pattern.sub(replacement, s1)
print(f"s1={s1}")
print(f"s2={s2}")

# 2) the same thing as a single module-level call
s3 = re.sub(r"blue (dog|cat)", r"gray \1", s1)
print(f"s3={s3}")

# 3) named group: \g<animal> refers to the group by name instead of by number
p = re.compile(r"blue (?P<animal>dog|cat)")
s4 = p.sub(r"gray \g<animal>", s1)
print(f"s4={s4}")

s1=the blue dog and blue cat wore blue hats
s2=the gray dog and gray cat wore blue hats
s3=the gray dog and gray cat wore blue hats
s4=the gray dog and gray cat wore blue hats


In [None]:
# wild card - convert cygwin path to windows path
# The drive is a single ASCII letter that must be followed by "/", ";" or the
# end of the string, so only a real "/cygdrive/<letter>" prefix is rewritten
# (a name such as "/cygdrive/cfoo" is left untouched).
cygdrive_re = re.compile(r"/cygdrive/([A-Za-z])(?=[/;]|$)")


def cygwin_to_windows(path_list):
    """Return `path_list` with every "/cygdrive/<letter>" prefix rewritten as "<letter>:".

    `path_list` is a string that may hold several ";"-separated paths; the
    rest of each path (and the separators) is returned unchanged.
    """
    return cygdrive_re.sub(r"\1:", path_list)


s1 = "/cygdrive/c/Program Files;/cygdrive/c/Users;/cygdrive/d"
p = cygdrive_re
s2 = cygwin_to_windows(s1)
print(f"s1={s1}")
print(f"s2={s2}")
print()


# remove unwanted characters
# Everything except ASCII letters, digits, ".", ",", "_" and "-" is dropped.
unwanted_re = re.compile("[^a-zA-Z0-9.,_-]")


def remove_unwanted(text):
    """Return `text` without any character outside [a-zA-Z0-9.,_-]."""
    return unwanted_re.sub("", text)


s1 = "a/b/c;d e-f_g,"
p = unwanted_re
s2 = remove_unwanted(s1)
print(f"s1={s1}")
print(f"s2={s2}")
print()


s1=/cygdrive/c/Program Files;/cygdrive/c/Users;/cygdrive/d
s2=c:/Program Files;c:/Users;d:

s1=a/b/c;d e-f_g,
s2=abcde-f_g,

