In [4]:
# https://regex101.com/

In [2]:
# Backslash-escaped double quotes (\") let a double-quoted literal contain
# double quotes; the echoed repr below uses single quotes, so it shows no escapes.
"This is a \"special\" string"

'This is a "special" string'

In [6]:
# special characters/meta characters

In [8]:
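# .   any single character except a newline
# \.  a literal dot only
# d   the literal letter d
# \d  any digit
# \d\d\d\d\d  exactly 5 digits in a row
# \D  any character that is not a digit
# \w  word characters: letters, digits, and underscore
# \s  whitespace (space, tab, newline)
# \S  any character that is not whitespace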

In [16]:
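# \bHa -- Ha at the start of a word (a word boundary comes before the H)
# \BHa -- Ha that is NOT at the start of a word (e.g. the later Ha's in HaHaHa)
# ^ab -- ab only at the very beginning of the string
# \bab -- ab at the start of any word in the string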


In [4]:
# abcdefghijklmnopqrstuvwxyz.
# ABCDEFGHIJKLMNOPQRSTUVWXYZ abc
# 1234567890.
# abcdefe_ghijklHa
# Ha HaHaHaHaHaHa
# [$v\()}]*
# website.com
# 321-555-4321
# 123.555.1234.
# Mr.Pavan
# Mr.Sai
# Ms.Kumar
# Mrs. Singh.
# Mrs.Himanshi Varma
# Ms.Kalpana Roy

In [18]:
# \d\d\d.\d\d\d.\d\d\d\d -- phone numbers (the unescaped . matches ANY separator character)
# character sets
# \d\d\d[.-]\d\d\d[.-]\d\d\d\d
# \d{3}[.-]\d{3}[.-]\d{4}
# [a-f] - from a to f
# [^a-f] - not from a to f
# [2-6]
# [6adc]
# a+ -- a appearing 1 or more times
# a* -- a appearing 0 or more times
# a? -- a appearing 0 or 1 time

# M(r|s|rs)\.?\s*[A-Z]{1}[a-z]+[ ]?[A-Z]?[a-z]*


In [6]:
import re
import pandas as pd
import numpy as np

In [10]:
import re

# Simplified e-mail shape: a local part, "@", a domain, a dot, and an
# alphabetic extension of two or more letters. This is a teaching pattern,
# not a complete RFC-compliant validator.
email_regex = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'

def is_valid_email(email):
    """Return True when ``email`` matches ``email_regex`` from start to end.

    Parameters
    ----------
    email : str
        Candidate address. Any non-string value (e.g. ``None``) is treated
        as invalid rather than raising ``TypeError``.

    Returns
    -------
    bool
        True if the whole string has the expected e-mail shape, else False.
    """
    if not isinstance(email, str):
        return False
    # fullmatch is required here: with re.match, the trailing "$" also matches
    # just before a final newline, so "user@example.com\n" would be accepted.
    return re.fullmatch(email_regex, email) is not None


In [12]:
# Try one well-formed address and one that is missing its domain.
for candidate in ("test@example.com", "invalid-email@"):
    print(is_valid_email(candidate))

True
False


In [None]:
# ^ → Start of the string.
# [a-zA-Z0-9._%+-]+ → Matches the local part (before @).
# @ → Matches the @ symbol.
# [a-zA-Z0-9.-]+ → Matches the domain name.
# \. → Matches the dot before the domain extension.
# [a-zA-Z]{2,} → Matches the domain extension (at least 2 letters).
# $ → End of the string.

In [28]:
# re.match(pattern, string)
# Matches only at the beginning of the string.
# Returns a match object if successful, else None.

In [26]:
import re

text = "  Today is a good day, Hello World!"
# Alternative input to experiment with:
# text = " Hello World! Today is a good day, "
pattern = r"Hello"

# re.match only tries the pattern at index 0 of the string.
match = re.match(pattern, text)
if match is None:
    print("No match")
else:
    print("Match found:", match.group())

No match


In [None]:
# re.search(pattern, string)
# Searches the entire string for a match.
# Returns the first occurrence of the match.

In [22]:
pattern = r"Python"
text = "Hi, Python is amazing! Learn Python today."

# re.search scans the string and stops at the first occurrence.
match = re.search(pattern, text)
if match is not None:
    print("Match found:", match.group())


Match found: Python


In [38]:
# re.findall(pattern, string)
# Finds all occurrences of the pattern in the string.
# Returns a list of matches.

In [28]:
# One or more word characters, "@", word characters, a literal dot, word characters.
pattern = r"\w+@\w+\.\w+"
text = "Email me at test@example.com or contact@example.com or example@com"

# "example@com" has no dot after the "@", so it is not part of the result.
emails = re.findall(pattern, string=text)
print(emails)


['test@example.com', 'contact@example.com']


In [42]:
# re.finditer(pattern, string)
# Like findall(), but returns an iterator of match objects instead of a list.

In [30]:
pattern = r"\d+"
text = "There are 3 apples and 5 bananas."

# finditer yields match objects lazily; the iterator is used up by the loop.
matches = re.finditer(pattern, text)
for number_match in matches:
    print("Number found:", number_match.group())

Number found: 3
Number found: 5


In [46]:
# re.sub(pattern, replacement, string)
# Replaces occurrences of the pattern with a replacement string.

In [32]:
pattern = r"color"
replacement = "colour"
text = "The color is red"

# Every occurrence of the pattern in text is replaced.
new_text = re.sub(pattern, repl=replacement, string=text)
print(new_text)

The colour is red


In [54]:
# re.split(pattern, string)
# Splits the string by occurrences of the pattern.

In [36]:
# Character set: split on a comma, a semicolon, or a pipe.
pattern = r"[,;|]"
text = "apple, banana; grape|orange"

# Only the delimiter itself is consumed, so a space after it stays in the piece.
fruits = re.split(pattern, string=text)
print(fruits)


['apple', ' banana', ' grape', 'orange']


In [None]:
# .	Matches any character except newline	
# h.t → matches hat, hit, hot
# ^	Matches the start of a string	
# ^hello → matches hello world but not say hello
# $	Matches the end of a string	
# world$ → matches hello world but not world hello
# \d	Matches any digit (0-9)	
# \d+ → matches 123 in abc123
# \D	Matches any non-digit	
# \D+ → matches abc in abc123
# \w	Matches any word character (letters, digits & underscore)	
# \w+ → matches Hello123 in Hello123!
# \W	Matches any non-word character	
# \W+ → matches ! in Hello123!
# \s	Matches any whitespace (space, tab, newline)	
# \s+ → matches spaces in Hello World
# \S	Matches any non-whitespace	
# \S+ → matches Hello in Hello World
# +	Matches one or more occurrences	
# \d+ → matches 123 in abc123
# *	Matches zero or more occurrences	
# a* → matches aaaa in aaaabc and also the empty string in bcd
# ?	Matches zero or one occurrence	
# colou?r → matches color and colour
# {n,m}	Matches between n and m repetitions	
# \d{2,4} → matches 12, 123, or 1234
# |	Acts as OR operator	
# cat|dog → matches cat or dog
# ( )	Groups expressions	
# (ab)+ → matches ab, abab, ababab

In [58]:
# If you plan to use a regex multiple times, it's more efficient to compile it first.

In [60]:
text = "Order 123 is ready. Order 456 is pending."

# Compile once; the resulting pattern object exposes findall and friends.
pattern = re.compile(r"\d+")  # one or more digits
matches = pattern.findall(text)
print(matches)


['123', '456']


In [62]:
# re.IGNORECASE (re.I)	Case-insensitive matching
# re.MULTILINE (re.M)	Allows ^ and $ to match start and end of each line
# re.DOTALL (re.S)	Allows . to match newlines

In [44]:
# re.IGNORECASE is the long name of re.I: letter case is ignored while matching.
pattern = re.compile(r"python", flags=re.IGNORECASE)
text = "Python is fun. PYTHON is powerful."

matches = pattern.findall(text)
print(matches)

['Python', 'PYTHON']


In [56]:
# Escaped parentheses match literal "(" and ")"; the unescaped pairs capture
# the three digit runs as groups 1, 2 and 3.
pattern = r"\((\d{3})\) (\d{3})-(\d{4})"
text = "My phone number is (123) 456-7890."

match = re.search(pattern, text)
if match is None:
    print('No match')
else:
    labels = ("Area code:", "First 3 digits:", "Last 4 digits:")
    # groups() returns the captured groups in order, i.e. group(1)..group(3).
    for label, digits in zip(labels, match.groups()):
        print(label, digits)

Area code: 123
First 3 digits: 456
Last 4 digits: 7890
