In [14]:
# The standard `re` module only supports fixed-width lookbehind and fails with
# "look-behind requires fixed-width pattern" otherwise, so `regex` is used instead.
import regex as re
import string

# Regular expression lookaround

* [Lookahead and Lookbehind](https://www.regular-expressions.info/lookaround.html) (MUST)

* [Mastering Lookahead and Lookbehind](https://www.rexegg.com/regex-lookarounds.html)

> `(?<!foo)` — **Negative Lookbehind**: asserts that what immediately precedes the current position in the string is not `foo`.

* [5.4. Find All Except a Specific Word](https://www.oreilly.com/library/view/regular-expressions-cookbook/9781449327453/ch05s04.html)

> Match any complete word except `cat`; words that merely contain it, e.g. Catwoman or vindicate, should still match.
```\b(?!cat\b)\w+```

# Positive Lookahead
Extract Target followed by Pattern ```target(?=pattern)```.

In [8]:
# Extract "Class" only when it is immediately followed by a digit and a letter.
# The lookahead (?=...) is zero-width: the digit and letter are checked but are
# not part of the match. With re.IGNORECASE, [A-Z] also accepts lower case.
match = re.search(
    # Raw string: "\d" inside a normal string literal is an invalid escape sequence.
    pattern=r"Class(?=\d[A-Z])",
    string="Statistics Class3A in the 3rd grace.",
    flags=re.IGNORECASE
)
match.group(0)

'Class'

# Positive Lookbehind

Extract Target preceded by Pattern ```(?<=pattern)target```.

In [13]:
# Extract "Class" only when "Business " or "Statistics " comes right before it.
# The two alternatives differ in length, so this lookbehind is variable-width.
match = re.search(
    "(?<=(Business|Statistics) )Class",
    "Statistics Class3A in the 3rd grace.",
    flags=re.IGNORECASE,
)
match.group(0)

'Class'

## word boundary ```\b```

* [Word Boundaries](https://www.regular-expressions.info/wordboundaries.html)

> The metacharacter \b is an anchor like the caret and the dollar sign. It matches at a position that is called a “word boundary”. This match is zero-length.

In [None]:
# Show the punctuation characters defined by the standard `string` module.
string.punctuation

In [None]:
# Whitespace separators.
SPACE = " "
DELIMITER = " "

# Special tokens, kept in lower case because lower() is applied to the text.
UNK, NIL = "<unk>", "<nil>"

# CONTEXT_WINDOW_SIZE is derived from STRIDE: twice the stride plus one.
STRIDE = 2
CONTEXT_WINDOW_SIZE = 2 * STRIDE + 1


In [None]:
# Sample text containing the NIL token, "<unk>" and several look-alikes
# ("<unk|", "<unk", "<<unk>") surrounded by punctuation and extra whitespace.
text = f"""

the asbestos fiber {NIL} <unk> <unk| is < unusually <unk once it enters the <<unk>$% with   hong-kong \

"""
print(text)

# Character class content: every punctuation character (escaped) plus whitespace.
punctuation_chars = re.escape(string.punctuation)
whitespace_chars = r"\s"
# The lookbehind rejects a run that starts right after "<unk"; the lookahead
# rejects a run that ends right before "unk>".
pattern: str = r'(?<!<unk)[%s%s]+(?!unk>)' % (punctuation_chars, whitespace_chars)
replacement = " "

# Each matched run becomes a single space; the result is lower-cased and trimmed.
substituted = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
substituted.lower().strip()

* [Python regexp - remove punctuation but leave <unk> as is](https://stackoverflow.com/a/67165082/4281353)

```
(?: - start of a non-capturing group:
(?!<unk>) - the next char should not be the starting char of a <unk> char sequence
[\W_] - any non-alphanumeric char
(?<!<unk>) - the previously matched char (with [\W_]) cannot be the ending char of a <unk> char sequence
)+ - one or more times.
```

* [Multiple Regex Matches Using Negative Lookbehind](https://stackoverflow.com/a/35580141/4281353)

In [None]:
# Replace every run of non-word characters (and "_") with a single space while
# leaving the UNK and NIL tokens untouched, then lower-case and trim the result.
# The tokens go through re.escape() so that a token containing regex
# metacharacters (e.g. "[unk]") is still matched literally.
unk_token: str = re.escape(UNK.lower())
nil_token: str = re.escape(NIL.lower())
pattern: str = (
    r'(?:'
    rf'(?!{unk_token})(?!{nil_token})'    # the next char must not start a token
    r'[\W_]'                              # consume one non-alphanumeric char
    rf'(?<!{unk_token})(?<!{nil_token})'  # the consumed char must not end a token
    r')+'
)
replacement = " "
standardized: str = re.sub(
    pattern=pattern,
    repl=replacement,
    string=text,
    flags=re.IGNORECASE
).lower().strip()
standardized

In [None]:
EMPTY = ''
EVENT_UNK = '<unk>'
EVENT_NIL = '<nil>'

# Replace every run of non-word characters (and "_") with SPACE while leaving
# the EVENT_UNK and EVENT_NIL tokens untouched, then lower-case and trim.
# The tokens go through re.escape() so that a token containing regex
# metacharacters (e.g. "[unk]") is still matched literally.
event_unk_token: str = re.escape(EVENT_UNK.lower())
event_nil_token: str = re.escape(EVENT_NIL.lower())
pattern: str = (
    r'(?:'
    rf'(?!{event_unk_token})(?!{event_nil_token})'    # the next char must not start a token
    r'[\W_]'                                          # consume one non-alphanumeric char
    rf'(?<!{event_unk_token})(?<!{event_nil_token})'  # the consumed char must not end a token
    r')+'
)
replacement = SPACE
standardized: str = re.sub(
    pattern=pattern,
    repl=replacement,
    string=text,
    flags=re.IGNORECASE
).lower().strip()
standardized

In [None]:
# Bare string literal, shown as the cell output.
# NOTE(review): reads like a reminder for a test of `event_indexing`, which is
# not defined in the visible cells — confirm where that check lives.
'event_indexing must fail with corpus including EVENT_NIL <nil> words.'