# Text Processing For NLP with the help of Regex

# Regex allows us to perform various operations on text, such as searching for patterns, replacing text, and splitting strings.

# Import the 're' Module

In [1]:
import re

# Sample Text

In [19]:
# Sample paragraph that the regex examples in this file search, substitute,
# and split. The triple-quoted literal keeps its embedded line breaks, so the
# string contains newline characters in addition to ordinary spaces.
text = '''Jack quickly realized that the big, lazy dog was blocking his path. The sun was setting, casting a golden hue over the quiet village.
Suddenly, a quick brown fox darted across the road, making Jack jump back in surprise. Nearby, a wizardly figure muttered strange words,
causing an old box to glow and hover. Vexed by the unexpected magic, Jack decided to take a different route home, thinking about the odd
encounter all evening.'''

# Basic Regex Operations

# Searching for a Pattern

In [20]:
# Literal pattern: collect every non-overlapping occurrence of "Jack".
pattern = r"Jack"
matches = re.compile(pattern).findall(text)
print("Matches:", matches)

Matches: ['Jack', 'Jack', 'Jack']


# Replacing Text

In [21]:
# Substitute every occurrence of "Jack" in the sample text with "Pranay".
# The original `text` is left untouched; the result is a new string.
pattern = r"Jack"
replacement = "Pranay"
new_text = re.compile(pattern).sub(replacement, text)
print("New Text:", new_text)

New Text: Pranay quickly realized that the big, lazy dog was blocking his path. The sun was setting, casting a golden hue over the quiet village. 
Suddenly, a quick brown fox darted across the road, making Pranay jump back in surprise. Nearby, a wizardly figure muttered strange words, 
causing an old box to glow and hover. Vexed by the unexpected magic, Pranay decided to take a different route home, thinking about the odd 
encounter all evening.


# Splitting Strings

In [22]:
# `\s` matches ONE whitespace character (space, tab, newline, ...), not just
# a space. The text is cut at each individual whitespace character, so two
# adjacent whitespace characters leave an empty string between them.
pattern = r"\s"
split_text = re.compile(pattern).split(text)
print("Split Text:", split_text)

Split Text: ['Jack', 'quickly', 'realized', 'that', 'the', 'big,', 'lazy', 'dog', 'was', 'blocking', 'his', 'path.', 'The', 'sun', 'was', 'setting,', 'casting', 'a', 'golden', 'hue', 'over', 'the', 'quiet', 'village.', '', 'Suddenly,', 'a', 'quick', 'brown', 'fox', 'darted', 'across', 'the', 'road,', 'making', 'Jack', 'jump', 'back', 'in', 'surprise.', 'Nearby,', 'a', 'wizardly', 'figure', 'muttered', 'strange', 'words,', '', 'causing', 'an', 'old', 'box', 'to', 'glow', 'and', 'hover.', 'Vexed', 'by', 'the', 'unexpected', 'magic,', 'Jack', 'decided', 'to', 'take', 'a', 'different', 'route', 'home,', 'thinking', 'about', 'the', 'odd', '', 'encounter', 'all', 'evening.']


# Character Classes and Quantifiers

### Character Classes

In [23]:
# The character class [aeiou] matches a single lowercase vowel per hit;
# uppercase vowels are not part of the class and are therefore skipped.
pattern = r"[aeiou]"
vowels = [hit.group() for hit in re.finditer(pattern, text)]
print("Vowels:", vowels)

Vowels: ['a', 'u', 'i', 'e', 'a', 'i', 'e', 'a', 'e', 'i', 'a', 'o', 'a', 'o', 'i', 'i', 'a', 'e', 'u', 'a', 'e', 'i', 'a', 'i', 'a', 'o', 'e', 'u', 'e', 'o', 'e', 'e', 'u', 'i', 'e', 'i', 'a', 'e', 'u', 'e', 'a', 'u', 'i', 'o', 'o', 'a', 'e', 'a', 'o', 'e', 'o', 'a', 'a', 'i', 'a', 'u', 'a', 'i', 'u', 'i', 'e', 'e', 'a', 'a', 'i', 'a', 'i', 'u', 'e', 'u', 'e', 'e', 'a', 'e', 'o', 'a', 'u', 'i', 'a', 'o', 'o', 'o', 'o', 'a', 'o', 'e', 'e', 'e', 'e', 'u', 'e', 'e', 'e', 'a', 'i', 'a', 'e', 'i', 'e', 'o', 'a', 'e', 'a', 'i', 'e', 'e', 'o', 'u', 'e', 'o', 'e', 'i', 'i', 'a', 'o', 'u', 'e', 'o', 'e', 'o', 'u', 'e', 'a', 'e', 'e', 'i']


### Quantifiers

In [27]:
# The quantifier {10} demands a run of ten consecutive 'a' characters.
# The sample text contains no such run, so the result is an empty list.
pattern = r"a{10}"
matches = re.compile(pattern).findall(text)
print("Matches:", matches)

Matches: []


# Metacharacters and Anchors

### Metacharacters

In [30]:
# \b is a word boundary and \w{5} is exactly five word characters (letters,
# digits, or underscore), so this finds whole words of length five.
pattern = r"\b\w{5}\b"
matches = [hit.group() for hit in re.finditer(pattern, text)]
print("Matches:", matches)

Matches: ['quiet', 'quick', 'brown', 'words', 'hover', 'Vexed', 'magic', 'route', 'about']


### Anchors

In [32]:
# ^ anchors the match to the beginning of the string (no MULTILINE flag is
# used), so only a "Jack" that opens the text can match.
pattern = r"^Jack"
matches = re.compile(pattern).findall(text)
print("Matches:", matches)

Matches: ['Jack']


# Grouping and Capturing

In [33]:
# Two capture groups: each hit is a (first_word, second_word) tuple for two
# words separated by exactly one whitespace character. Hits do not overlap,
# so a word consumed as the second of a pair never starts the next pair.
pattern = r"(\w+)\s(\w+)"
matches = [hit.groups() for hit in re.finditer(pattern, text)]
print("Matches:", matches)

Matches: [('Jack', 'quickly'), ('realized', 'that'), ('the', 'big'), ('lazy', 'dog'), ('was', 'blocking'), ('his', 'path'), ('The', 'sun'), ('was', 'setting'), ('casting', 'a'), ('golden', 'hue'), ('over', 'the'), ('quiet', 'village'), ('a', 'quick'), ('brown', 'fox'), ('darted', 'across'), ('the', 'road'), ('making', 'Jack'), ('jump', 'back'), ('in', 'surprise'), ('a', 'wizardly'), ('figure', 'muttered'), ('strange', 'words'), ('causing', 'an'), ('old', 'box'), ('to', 'glow'), ('and', 'hover'), ('Vexed', 'by'), ('the', 'unexpected'), ('Jack', 'decided'), ('to', 'take'), ('a', 'different'), ('route', 'home'), ('thinking', 'about'), ('the', 'odd'), ('encounter', 'all')]


# Using Regex for Tokenization
Regex is also useful for tokenizing text into words or sentences.

# Tokenizing into Words

In [34]:
# Naive whitespace tokenization: the text is cut at every single whitespace
# character. Punctuation stays attached to its word, and back-to-back
# whitespace characters produce empty-string tokens.
pattern = r"\s"
words = re.compile(pattern).split(text)
print("Words:", words)

Words: ['Jack', 'quickly', 'realized', 'that', 'the', 'big,', 'lazy', 'dog', 'was', 'blocking', 'his', 'path.', 'The', 'sun', 'was', 'setting,', 'casting', 'a', 'golden', 'hue', 'over', 'the', 'quiet', 'village.', '', 'Suddenly,', 'a', 'quick', 'brown', 'fox', 'darted', 'across', 'the', 'road,', 'making', 'Jack', 'jump', 'back', 'in', 'surprise.', 'Nearby,', 'a', 'wizardly', 'figure', 'muttered', 'strange', 'words,', '', 'causing', 'an', 'old', 'box', 'to', 'glow', 'and', 'hover.', 'Vexed', 'by', 'the', 'unexpected', 'magic,', 'Jack', 'decided', 'to', 'take', 'a', 'different', 'route', 'home,', 'thinking', 'about', 'the', 'odd', '', 'encounter', 'all', 'evening.']


# Tokenizing into Sentences

In [35]:
# Cut wherever a period is immediately followed by one whitespace character.
# The matched period and whitespace are consumed by the split, so the pieces
# lose their closing period; a trailing period with nothing after it is kept.
pattern = r"\.\s"
sentences = re.compile(pattern).split(text)
print("Sentences:", sentences)

Sentences: ['Jack quickly realized that the big, lazy dog was blocking his path', 'The sun was setting, casting a golden hue over the quiet village', '\nSuddenly, a quick brown fox darted across the road, making Jack jump back in surprise', 'Nearby, a wizardly figure muttered strange words, \ncausing an old box to glow and hover', 'Vexed by the unexpected magic, Jack decided to take a different route home, thinking about the odd \nencounter all evening.']
