# Regular expression misc

## Capturing vs. non-capturing groups

In [11]:
# Find every "e.g" or "e.g." token in a string (case-insensitive).

import re

# https://regex101.com/r/8MEb0b/1

# Use a raw string: in a normal string literal "\s" and "\." are invalid
# escape sequences and trigger a warning on recent Python versions.
pat = r"(?:^|\s)([e]\.[g](\.)?)(?=\s|$)"

# Non-capturing group: (?:^|\s)
## 1st alternative: ^ asserts position at the start of the string
##                  (start of each line only with re.MULTILINE)
## 2nd alternative: \s matches any whitespace character

# 1st capturing group: ([e]\.[g](\.)?)
## [e] matches the single character listed in the class
## \.  matches a literal '.'
## [g] matches the single character listed in the class

# 2nd capturing group: (\.)?
## ? matches the previous token zero or one time, so the trailing '.'
## is optional

# Positive lookahead: (?=\s|$)
## Asserts, without consuming anything, that what follows is either
## 1st alternative: \s, any whitespace character, or
## 2nd alternative: $, the end of the string

data = "e.g  E.g E.G. e.g."
regex = re.compile(pat, re.IGNORECASE)     # note the IGNORECASE

# With two capturing groups, findall returns one (group 1, group 2) tuple
# per match; a group that did not participate is reported as ''.
print(regex.findall(data))

# finditer gives access to the whole match as well: group(0) includes the
# whitespace consumed by the leading non-capturing group, and a group that
# did not participate is None.
matches = regex.finditer(data)
ans = [(m.group(0), m.group(1), m.group(2)) for m in matches]
print(ans)

[('e.g', ''), ('E.g', ''), ('E.G.', '.'), ('e.g.', '.')]
[('e.g', 'e.g', None), (' E.g', 'E.g', None), (' E.G.', 'E.G.', '.'), (' e.g.', 'e.g.', '.')]


## The list returned by re.findall

`r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*'`

+ `(Mr|Ms|Mrs)`: This part of the pattern is a group that matches one of the specified honorific titles. The | symbol acts as an OR operator, allowing it to match any of the three titles: “Mr”, “Ms”, or “Mrs”.

+ `\.?`: An unescaped dot (.) is a special character in regular expressions that matches any character except a newline. The backslash before the dot escapes it so that it is treated as a literal dot. The ? following it makes the dot optional, allowing both “Mr” and “Mr.” to match.
+ `\s`: This matches any whitespace character (such as a space or tab).

+ `[A-Z]`: This character class matches any uppercase letter.

+ `\w*`: The `\w` represents any word character (a letter, digit, or underscore), and the * quantifier means zero or more occurrences. So, this part matches any sequence of word characters (e.g., the rest of a name).

In [2]:
# What re.findall returns depends on how many capturing groups the regex has:

## 1. no capturing group     -> the full text of each match
## 2. exactly one group      -> the text of that group for each match
## 3. more than one group    -> a tuple of the groups' texts for each match

import re

# https://regex101.com/r/EEGVyC/1

text2 = '''
Mr. Schafer
Mr Smith
Ms davis
Mrs. Robinson
Mr. T
Mr z
'''

# one capture: only the title is returned
regex1 = r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*'
# no capture: the whole matched text is returned
regex2 = r'(?:Mr|Ms|Mrs)\.?\s[A-Z]\w*'
# two captures: (title, name) tuples are returned, not the matched text
regex3 = r'(?:(Mr|Ms|Mrs)\.?\s([A-Z]\w*))'

# Only the second search is case-insensitive; flags=0 is re.findall's default.
for pattern, flags in (
    (regex1, 0),
    (regex2, re.IGNORECASE),
    (regex3, 0),
):
    print(re.findall(pattern, text2, flags))

['Mr', 'Mr', 'Mrs', 'Mr']
['Mr. Schafer', 'Mr Smith', 'Ms davis', 'Mrs. Robinson', 'Mr. T', 'Mr z']
[('Mr', 'Schafer'), ('Mr', 'Smith'), ('Mrs', 'Robinson'), ('Mr', 'T')]


## Non-capturing group vs. positive lookahead

In [3]:
# Group and look-around prefixes:
#   ?:   non-capturing group
#   ?=   positive lookahead: match something only if it is followed by
#        something else, without including that something else
#   ?!   negative lookahead
#   ?<=  positive lookbehind
#   ?<!  negative lookbehind

# Example on "abc":
#   a(?:b)  matches "ab": the matched text must contain 'b', but 'b' is
#           not captured as a group
#   a(?=b)  matches only the "a" (the one that is followed by 'b')
#   a(b)    matches "ab" and creates a capture containing "b"

import re

s = 'abcefgab'

# re.finditer() returns an iterator of match objects found in the string,
# while re.findall() returns a list of the matched (or captured) strings.

for m in re.finditer(r'a(b)', s):
    print(m, m.group(0), m.group(1))

# One capturing group, so findall lists the captured 'b' of each match.
matches = re.findall(r'a(b)', s)
print(matches)


<re.Match object; span=(0, 2), match='ab'> ab b
<re.Match object; span=(6, 8), match='ab'> ab b
['b', 'b']


In [4]:
# (?:b) is a non-capturing group: 'b' is part of the matched text, but no
# group is created for it.
for m in re.finditer(r'a(?:b)', s):
    print(m, m.group(0))

# No capturing group, so findall returns the full matched text.
matches = re.findall(r'a(?:b)', s)
print(matches)

<re.Match object; span=(0, 2), match='ab'> ab
<re.Match object; span=(6, 8), match='ab'> ab
['ab', 'ab']


In [5]:
# Find every 'a' that is followed by 'b'. The lookahead (?=b) only checks
# for 'b'; it is not included in the matched text.
for m in re.finditer(r'a(?=b)', s):
    print(m, m.group(0))

# No capturing group, so findall returns the matched text ('a') itself.
matches = re.findall(r'a(?=b)', s)
print(matches)

<re.Match object; span=(0, 1), match='a'> a
<re.Match object; span=(6, 7), match='a'> a
['a', 'a']


## Regex findall repeated characters

In [9]:
import re

# https://regex101.com/r/k0MVvE/1

target_string = "Jessa Erriika aaa"

# (\w) captures one word character (letter, digit or underscore);
# \1* then matches zero or more further copies of that same character.
regex = re.compile(r"(\w)\1*")
print("".join(f"{found.group()}, " for found in regex.finditer(target_string)))
# output J, e, ss, a, E, rr, ii, k, a, aaa,

# (\w) captures one word character; \1{n} then requires exactly n further
# copies of it, so every match is n + 1 identical characters.
n = 2
regex = re.compile(fr"(\w)\1{{{n}}}")
print("".join(f"{found.group()}, " for found in regex.finditer(target_string)))
# output n=1: ss, rr, ii, aa; n=2: aaa

# Regex findall multiple characters
# Here the greedy quantifier {1,} repeats the group itself, so any run of
# one or more word characters matches (whole words), not only repeats of
# the same character.
regex = re.compile(r"(\w){1,}")
print("".join(f"{found.group()}, " for found in regex.finditer(target_string)))
# output Jessa, Erriika, aaa,

J, e, ss, a, E, rr, ii, k, a, aaa, 
aaa, 
Jessa, Erriika, aaa, 


In [10]:
# Regex findall repeated characters n+1 times
# https://regex101.com/r/C5F4iK/1

numbers = '555512346666'

for pattern in (
    # (\d) is the first captured group; \1{3} requires three more copies
    # of the captured digit, i.e. the same digit four times in a row.
    r"(\d)\1{3}",
    # any three consecutive digits (they need not be equal)
    r"(\d){3}",
    # the digit 5 three times in a row
    r"(5){3}",
):
    regex = re.compile(pattern)
    print("".join(f"{found.group()}, " for found in regex.finditer(numbers)))

5555, 6666, 
555, 512, 346, 666, 
555, 
