In [None]:
import re

## BASIC SUBSTRING MATCHING

In [None]:
# Locate the first occurrence of a literal substring. search() yields a
# Match object on success and None when the pattern is absent.
text = "My name is Tarun Jain and I work as a AI engineer."
pattern = r"Tarun"
result = re.compile(pattern).search(text)

In [None]:
# Bare last expression: Jupyter displays the Match object's repr (span and matched text).
result

<re.Match object; span=(11, 16), match='Tarun'>

In [None]:
# group() with no arguments returns the whole matched substring.
result.group()

'Tarun'

In [None]:
# "software" does not occur in text, so search() yields None and the
# trailing bare expression displays nothing.
pattern = r"software"
result = re.compile(pattern).search(text)
result

In [None]:
# print() makes the None visible; a bare None expression produces no cell output.
print(result)

None


## Case Sensitivity and Case Insensitivity

In [None]:
# Sample sentence for the case-sensitivity examples; note the capital "P" in "Python".
text2 = "Python is popular programming language"

In [None]:
# Exact-case search: the pattern is spelled exactly as the word in text2.
pattern = "Python"
result = re.compile(pattern).search(text2)
print(result.group(0))

Python


In [None]:
# re.search is case-sensitive by default, so lowercase "python" does not match
# "Python" and search() returns None. Calling .group() on None raises
# AttributeError; catch it and print the message so the demonstration is still
# visible but the notebook survives "Restart Kernel -> Run All".
pattern = "python"
result = re.search(pattern, text2)
try:
    print(result.group())
except AttributeError as exc:
    # Prints: AttributeError: 'NoneType' object has no attribute 'group'
    print(f"{type(exc).__name__}: {exc}")

AttributeError: 'NoneType' object has no attribute 'group'

In [None]:
# Case-insensitive matching: the IGNORECASE flag lets the lowercase pattern
# match the capitalised word in text2.
pattern = r"python"
result = re.compile(pattern, re.IGNORECASE).search(text2)
print(result.group(0))

Python


In [None]:
# A character class in the first position accepts either "p" or "P".
pattern = r"[pP]ython"
result = [m.group() for m in re.finditer(pattern, text2)]
print(result)

['Python']


## Basic Regular Expression Pattern

### Range

In [None]:
# Sample text mixing uppercase letters, lowercase letters and digits for the character-range examples.
text3 = "The pin is A123 and zip code is 90210."

In [None]:
# Collect every single uppercase ASCII letter found in text3.
pattern = r"[A-Z]"
results = [m.group() for m in re.finditer(pattern, text3)]
print(f"Uppercase letters '[A-Z]': {results}")

Uppercase letters '[A-Z]': ['T', 'A']


In [None]:
# Collect lowercase ASCII letters; only the first ten are printed to keep the output short.
pattern = r"[a-z]"
results = re.compile(pattern).findall(text3)
print(f"Lowercase letters '[a-z]': {results[:10]}...")

Lowercase letters '[a-z]': ['h', 'e', 'p', 'i', 'n', 'i', 's', 'a', 'n', 'd']...


In [None]:
# Collect each digit character individually (no quantifier, so one digit per match).
pattern = r"[0-9]"
results = [m.group() for m in re.finditer(pattern, text3)]
print(f"Digits '[0-9]': {results}")

Digits '[0-9]': ['1', '2', '3', '9', '0', '2', '1', '0']


In [None]:
# An explicit set of characters: matches any one lowercase vowel.
pattern = r"[aeiou]"
results = re.compile(pattern).findall(text3)
print(f"Vowels '[aeiou]': {results}")

Vowels '[aeiou]': ['e', 'i', 'i', 'a', 'i', 'o', 'e', 'i']


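> `\w` is equivalent to the character class `[a-zA-Z0-9_]` for ASCII text (in Python 3, `\w` in a `str` pattern also matches Unicode letters and digits unless the `re.ASCII` flag is set).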

## Kleene

- `*` Zero or more occurrences
- `+` One or more occurrences

In [None]:
# Sample with one full e-mail address and one handle that has nothing before the "@".
sentence = "Contact us at info@example.com or @twitter.com"

In [None]:
# "*" allows the part before "@" to be empty, so a bare "@domain.tld" also matches.
star = r"\w*@\w+\.\w+"
results_star = re.compile(star).findall(sentence)
print(f"With *: {results_star}")

With *: ['info@example.com', '@twitter.com']


In [None]:
# "+" requires at least one word character before "@".
plus = r"\w+@\w+\.\w+"
results_plus = [m.group() for m in re.finditer(plus, sentence)]
print(f"With +: {results_plus}")

With +: ['info@example.com']


In [None]:
# Sample containing "t" followed by zero, one and several "o" characters.
text = "I went to the store. I went tooo far. I need to go."

In [None]:
# "to*" = a "t" followed by zero or more "o", so a lone "t" also counts as a match.
pattern = r"to*"
results = [m.group() for m in re.finditer(pattern, text)]
print(results)

['t', 'to', 't', 'to', 't', 'tooo', 'to']


In [None]:
# "a*" between "h" and "t" accepts any number of "a" characters, including none.
text = "ht hat haat haaaaat"
pattern = r"ha*t"
results = re.compile(pattern).findall(text)
print(f"'ha*t': {results}")

'ha*t': ['ht', 'hat', 'haat', 'haaaaat']


In [None]:
# "to+" = a "t" followed by at least one "o"; a lone "t" no longer matches.
text = "I went to the store. I went tooo far. I need to go."

pattern = r"to+"
results = [m.group() for m in re.finditer(pattern, text)]
print(results)

['to', 'to', 'tooo', 'to']


In [None]:
# "or+" = an "o" followed by one or more "r".
text = "I went to the store. I went tooo far. I need to go."

pattern = r"or+"
results = re.compile(pattern).findall(text)
print(results)

['or']


# Caret

## As Negation

In [None]:
# A leading "^" inside brackets negates the class: keep every character
# that is NOT an uppercase ASCII letter, then stitch them back together.
text = "Budapest, Hungary’s capital, is 19th-century Chain Bridge connects the hilly Buda district with flat Pest."

pattern = r"[^A-Z]"
results = [m.group() for m in re.finditer(pattern, text)]
print("".join(results))

udapest, ungary’s capital, is 19th-century hain ridge connects the hilly uda district with flat est.


In [None]:
# Negated digit class: keep everything except the digits 0-9.
pattern = r"[^0-9]"
results = re.compile(pattern).findall(text)
print("".join(results))

Budapest, Hungary’s capital, is th-century Chain Bridge connects the hilly Buda district with flat Pest.


## As Anchor

In [None]:
# Four-line sample for the anchor examples.
text = """Hello world
World hello
123 Main Street
Today is sunny"""

In [None]:
# Outside brackets "^" is an anchor; with MULTILINE it matches at the start of every line.
pattern = r"^Hello"
results = re.compile(pattern, re.MULTILINE).findall(text)
print(results)

['Hello']


In [None]:
# Exactly three digits sitting at the beginning of a line.
pattern = r"^\d{3}"
results = re.compile(pattern, re.MULTILINE).findall(text)
print(results)

['123']


In [None]:
# "Today" only counts when it begins a line.
pattern = r"^Today"
results = re.compile(pattern, re.MULTILINE).findall(text)
print(results)

['Today']


In [None]:
# "$" anchors at the end of a line when MULTILINE is set.
pattern = r"world$"
results = re.compile(pattern, re.MULTILINE).findall(text)
print(results)

['world']


In [None]:
# "Today" must be the last thing on a line for this to match.
pattern = r"Today$"
results = re.compile(pattern, re.MULTILINE).findall(text)
print(results)

[]


# Counters {}

In [None]:
# Candidate strings in several digit groupings; only one has the 3-digit, hyphen, 4-digit shape.
text = "Phone numbers: 123-4567, 1234-567, 12-34567, 1234567"
# {3} and {4} are exact repetition counts; the \b word boundaries stop a match
# from starting or ending in the middle of a longer run of digits.
pattern = r"\b\d{3}-\d{4}\b"

In [None]:
# Run the current pattern over the current text and report the hits.
results = re.compile(pattern).findall(text)
print(f"Phone numbers in format XXX-XXXX: {results}")

Phone numbers in format XXX-XXXX: ['123-4567']


In [None]:
# {4,6} is a bounded repetition: whole lowercase words of four to six letters.
text = "Passwords: a, ab, abc, abcd, abcde, abcdef"
pattern = r"\b[a-z]{4,6}\b"
results = [m.group() for m in re.finditer(pattern, text)]
print(f"Passwords with 4-6 characters: {results}")

Passwords with 4-6 characters: ['abcd', 'abcde', 'abcdef']


# Question Mark

In [None]:
# "?" makes the preceding "u" optional, covering both spellings.
text = "color colour flavor flavour"

pattern = r"colou?r"
results = re.compile(pattern).findall(text)
print(results)

['color', 'colour']


In [None]:
# Same optional-"u" idea applied to a different word.
pattern = r"flavou?r"
results = [m.group() for m in re.finditer(pattern, text)]
print(results)

['flavor', 'flavour']


## More operators - Shortcut

In [None]:
# Sample with digits, letters, punctuation and whitespace for the shorthand classes.
text = "Phone: 123-456-7890 Email: user@example.com"

In [None]:
# \d matches any single digit.
pattern = r"\d"
results = re.compile(pattern).findall(text)
print(f"'\\d' (digits): {results}")  # All digits

# \D is the complement of \d; only the first 20 characters are scanned here.
pattern = r"\D"
results = re.compile(pattern).findall(text[:20])
print(f"'\\D' (non-digits) first 20 chars: {results}")  # Non-digits in first 20 chars

'\d' (digits): ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
'\D' (non-digits) first 20 chars: ['P', 'h', 'o', 'n', 'e', ':', ' ', '-', '-', ' ']


In [None]:
# \w+ grabs maximal runs of word characters.
pattern = r"\w+"
results = [m.group() for m in re.finditer(pattern, text)]
print(f"'\\w+' (words): {results}")  # All words

# \W matches each single character that is not a word character.
pattern = r"\W"
results = [m.group() for m in re.finditer(pattern, text)]
print(f"'\\W' (non-word chars): {results}")  # Non-word characters

'\w+' (words): ['Phone', '123', '456', '7890', 'Email', 'user', 'example', 'com']
'\W' (non-word chars): [':', ' ', '-', '-', ' ', ':', ' ', '@', '.']


In [None]:
# \s matches whitespace; each hit is shown as the word "space" so it is visible in the output.
pattern = r"\s"
results = re.compile(pattern).findall(text)
print(f"'\\s' (whitespace): {' '.join(['space' for _ in results])}")  # Spaces

# \S matches every non-whitespace character; joining them prints the text with whitespace removed.
pattern = r"\S"
results = re.compile(pattern).findall(text)
print(''.join(results))  # Non Spaces

'\s' (whitespace): space space space
Phone:123-456-7890Email:user@example.com


# Grouping and Alternatives

Suppose we need to search for texts about pets; perhaps we are particularly interested in cats and dogs. In such a case, we might want to search for either the string cat or the string dog. Since we can’t use the square brackets to search for “cat or dog” (why can’t we say /[catdog]/?), we need a new operator, the `disjunction operator`, also called the `pipe symbol` `|`. The pattern /cat|dog/ matches either the string cat or the string dog.

In [None]:
# The parentheses group the alternatives so the trailing "s" applies to all of
# them. Because the pattern has one capturing group, findall returns the
# captured animal name rather than the whole plural word.
text = "I like cats and dogs, but not rats. Rat is small"

pattern = r"(cat|dog|rat)s"
results = re.compile(pattern).findall(text)
print(f"'(cat|dog|rat)s' (alternatives): {results}")

'(cat|dog|rat)s' (alternatives): ['cat', 'dog', 'rat']


> Tip: you can treat `|` as an OR operation in regular expressions.

In [None]:
# Two capturing groups: findall returns one (fruit, item) tuple per match.
text = "apple juice, orange juice, apple pie, orange cake"
pattern = r"(apple|orange) (juice|pie|cake)"
results = re.compile(pattern).findall(text)
print(results)

[('apple', 'juice'), ('orange', 'juice'), ('apple', 'pie'), ('orange', 'cake')]


## Substitution

In [None]:
# Mask every digit in the text with the replacement character.
text = "Phone: 123-456-7890. Feel free to reach out"
pattern = r"\d"  # any single digit
replacement = "X"

result = re.compile(pattern).sub(replacement, text)
print(result)

Phone: XXX-XXX-XXXX. Feel free to reach out


In [None]:
# Reorder a date using backreferences to the captured groups.
text = "Today is 12/25/2023"
pattern = r"(\d{2})/(\d{2})/(\d{4})"  # group 1 = month, group 2 = day, group 3 = year
replacement = r"\3-\1-\2"  # emit year-month-day, joined with hyphens

result = re.compile(pattern).sub(replacement, text)
print(result)

Today is 2023-12-25


# Lookahead Assertions

Lookaheads let you check if a pattern exists (or doesn’t exist) ahead in the text, without "moving" your current position.

#### Positive Lookahead (?=pattern)

"Match only if the pattern is ahead."

Example:

- apple(?= pie) → Matches "apple" only if "pie" follows it.
- Matches "apple" in "apple pie" ✅ but not in "apple juice" ❌.

#### Negative Lookahead (?!pattern)

"Match only if the pattern is NOT ahead."

- apple(?! pie) → Matches "apple" only if "pie" doesn’t follow.
- Matches "apple" in "apple juice" ✅ but not in "apple pie" ❌.




In [None]:
# Words ending in "ing" that are immediately followed by a non-word character.
# The lookahead checks that character without including it in the match.
text = "Singing loudly, dancing in the rain."
pattern = r"\b\w+ing(?=\W)"  # \W = non-word character (space, punctuation, etc.)

matches = [m.group() for m in re.finditer(pattern, text)]
print(matches)

['Singing', 'dancing']


\b ensures we match whole words (not parts of words).

\w+ing matches words like "singing", "dancing".

(?=\W) checks that "ing" is followed by a non-word character (like space or comma).



In [None]:
# Whole numbers that are not followed (after optional whitespace) by a percent sign.
text = "20% discount, save 30 dollars, 100% free"
pattern = r"\b\d+\b(?!\s*%)"  # the \b after \d+ stops backtracking into a shorter number

matches = re.compile(pattern).findall(text)
print(matches)

['30']


(?!\s*%): Negative lookahead that allows for optional whitespace before checking for %



## When to use it:

- Validate patterns ahead (e.g., "word must end with ing").
- Exclude special cases (e.g., "numbers not followed by %").

## Complex and Real-World Expressions

In [None]:
# A word that is directly followed by ": $"; the lookahead text is not part of the match.
text = "prices: $50, €100, $75"

pattern = r"\w+(?=: \$)"
results = re.compile(pattern).findall(text)
print(f"'\\w+(?=: \\$)' (words before dollar sign): {results}")

'\w+(?=: \$)' (words before dollar sign): ['prices']


In [None]:
# Dollar amounts: a literal "$" followed by digits, with no further digit right after the match.
pattern = r"\$\d+(?![0-9])"
results = [m.group() for m in re.finditer(pattern, text)]
print(results)

['$50', '$75']


In [None]:
text = "Sites: https://example.com, http://test.org, https://demo.net"
# Keep URLs whose top-level domain is not "org". The negative lookahead must
# sit immediately after the dot, before the TLD is consumed. Placed after
# \w+\.\w+ it only inspects the text that follows the already-matched TLD,
# so "http://test.org" would still be returned.
# \b limits the exclusion to the exact TLD "org" (not e.g. "organic").
pattern = r"https?://\w+\.(?!org\b)\w+"
results = re.findall(pattern, text)
print(results)  # ['https://example.com', 'https://demo.net']

['https://example.com', 'http://test.org', 'https://demo.net']


In [None]:
# E-mail address at the start of a line (intended for use with re.MULTILINE).
# Inside a character class "|" is a literal pipe, not alternation, so the
# top-level-domain class is written [A-Za-z] to accept letters only.
pattern = r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'
text = "Some text here.\nsupport@example.com is our email."

In [None]:
# MULTILINE lets a "^" in the pattern anchor at the start of each line of text.
response = re.compile(pattern, re.MULTILINE).findall(text)

In [None]:
# Bare last expression: Jupyter displays the list of matches.
response

['support@example.com']