# Regular Expressions (RegEx) Cheat Sheet with Examples

In [2]:
# re is a module in Python that provides support for regular expressions. It is a powerful tool for pattern matching and text manipulation.
import re

# Basic Patterns

In [None]:
# \d - Matches any single decimal digit (0-9; for str patterns Python also
# accepts other Unicode decimal digits).
# re.findall returns a list of every non-overlapping match, so each digit in
# the string comes back as its own list item.
# The r prefix in front of the pattern makes it a raw string, meaning that
# backslashes are passed to the regex engine as-is instead of being
# interpreted by Python as escape characters.
digit_pattern = r'\d'
sentence = 'I have 3 apples and 5 oranges'
print(re.findall(digit_pattern, sentence))  # ['3', '5']

In [None]:
# \w - Matches any single "word" character: a letter, a digit, or the underscore.
# Punctuation such as , ! + @ ( ) - and whitespace is NOT matched.
greeting = 'Hello, World!+@'
print(re.findall(r'\w', greeting))
print(re.findall(r'\w', 'This is one (1) underscore (_) and one (1) dash (-)'))
# re.search stops at the first match and returns a Match object
# (or None when nothing matches); .group() gives the matched text.
match = re.search(r'\w', greeting)
print(match.group())  # H

In [None]:
# \s - Matches any single whitespace character (space, tab, newline, ...).
# The sample contains exactly one space, one tab and one newline.
whitespace_found = re.findall(r'\s', 'Hello World\tTab\nNewline')
print(whitespace_found)  # [' ', '\t', '\n']

In [None]:
# . - Matches any single character except a newline.
# The pattern h.t therefore means: 'h', then exactly one character, then 't'.
# findall returns every substring that fits:
#   - 'hit', 'hat', 'hot', 'h+t' and 'h\tt' match (a tab is a valid character)
#   - 'HAT' and 'Hat' do not match (matching is case-sensitive)
#   - 'h\nt' does not match (. never matches a newline by default)
#   - 'heat' does not match (two characters between h and t)
#   - 'hotel' contributes its first three letters, 'hot'
dot_sample = 'hit hat HAT Hat hot but h+t h\tt h\nt heat height hotel'
print(re.findall(r'h.t', dot_sample))

# Quantifiers

In [None]:
# * - The asterisk is a quantifier that matches 0 or more occurrences of the
# character or group immediately before it.
# Read it as "repeat the thing just before me as many times as you like,
# including not at all" - i.e. zero, one, or many.
# In the pattern ab*c:
#   - the letter a must appear once,
#   - the letter b may appear 0, 1, 2, ... times (it is followed by *),
#   - the letter c must appear once, right after the last b.
# So the pattern matches ac, abc, abbc, abbbc, and so on.
star_pattern = re.compile(r'ab*c')
star_samples = [
    'ac',      # matches: 'ac'
    'abc',     # matches: 'abc'
    'abbc',    # matches: 'abbc'
    'abbbc',   # matches: 'abbbc'
    'abbbbc',  # matches: 'abbbbc'
    'fgt',     # no match
    'cab',     # no match (nothing after 'ab' to supply the c)
    'atc',     # no match (t is not b)
    'bca',     # no match
    'cba',     # no match
]
# Each line prints the sample's number followed by the list of matches.
for number, candidate in enumerate(star_samples, start=1):
    print(number, star_pattern.findall(candidate))

In [None]:
# + - The plus quantifier applies to the character or group immediately before it
# and requires AT LEAST ONE occurrence of that element.
# Compare with *: * also accepts zero occurrences, so 'ac' matches ab*c
# but does not match ab+c.
plus_pattern = re.compile(r'ab+c')
plus_samples = [
    'ac',      # no match: at least one b is required
    'abc',     # matches: 'abc'
    'abbc',    # matches: 'abbc'
    'abbbc',   # matches: 'abbbc'
    'abbbbc',  # matches: 'abbbbc'
    'fgt',     # no match
    'cab',     # no match
    'atc',     # no match
    'bca',     # no match
    'cba',     # no match
]
# Each line prints the sample's number followed by the list of matches.
for number, candidate in enumerate(plus_samples, start=1):
    print(number, plus_pattern.findall(candidate))

In [None]:
# ? - Makes the preceding character optional (it may appear 0 or 1 times).
# Handy for spelling variants such as American/British English.
# Each entry is (pattern, text to search).
optional_cases = [
    (r'honou?r', 'honor honour'),
    (r'labou?r', 'labor labour'),
    # 'neighbouring' also contributes its leading 'neighbour'
    (r'neighbou?r', 'neighbor neighbour neighbouring'),
    # 'colourising' does not contain 'colo(u)ring', so it is skipped
    (r'colou?ring', 'coloring colouring colourising'),
    (r'modell?ing', 'modeling modelling model'),
    # 'evaluate' contains 'valua', not 'val(u)e', so it is skipped
    (r'valu?e', 'vale value evaluate'),
]
for number, (pattern, sample) in enumerate(optional_cases, start=1):
    print(number, re.findall(pattern, sample))

In [None]:
# {n} - matches exactly n occurrences of the preceding pattern

# Matches exactly three digits in a row. Note that '9012' still yields '901':
# the pattern does not require the run of digits to END after three.
print(1, re.findall(r'\d{3}', '123 456 78 9012'))
# Matches 2 digits, one space, then 2 digits. re.findall never returns
# overlapping matches, so '34 56' is not reported: '34' was already consumed
# by the first match '12 34'.
print(2, re.findall(r'\d{2}\s{1}\d{2}', '12 34 56 78'))
# Matches "digit, space, word character" repeated exactly 2 times.
# The repeated unit is wrapped in a NON-capturing group (?:...): with a
# capturing group (...) re.findall would return only the text of the group's
# last repetition ('2 b') instead of the whole match ('1 a2 b').
print(3, re.findall(r'(?:\d{1}\s{1}\w{1}){2}', '1 a2 b 3c'))
# Matches any run of exactly four digits
print(4, re.findall(r'\d{4}', 'Year 2023, code 1234, pin 9876'))
# Matches "word character, digit, space" repeated exactly 2 times.
# Non-capturing group for the same reason as example 3; the final 'c3' is not
# part of a match because it is not followed by a space.
print(5, re.findall(r'(?:\w{1}\d{1}\s{1}){2}', 'a1 b2 c3'))
# Matches 2 word characters followed by exactly 3 digits
print(6, re.findall(r'\w{2}\d{3}', 'ab123 cd456 ef789'))
# Matches 3 digits followed by 1 word character
print(7, re.findall(r'\d{3}\w{1}', '123a 456b 789c'))
# Matches any sequence of 5 digits
print(8, re.findall(r'\d{5}', 'Zip: 90210 Code: 12345'))
# Matches 1 digit, 1 word character, and 1 space ('4d' has no trailing space)
print(9, re.findall(r'\d{1}\w{1}\s{1}', '1a 2b 3c 4d'))

In [None]:
# {n,m} - matches between n and m occurrences (inclusive) of the preceding pattern.
# The quantifier is greedy: it takes as many digits as it can (up to 4), and
# longer runs are split into several matches, e.g. '101112' -> '1011', '12'.
# The single digit '0' is too short and is skipped.
digit_runs = '0 12 345 6789 101112 13141516'
print(re.findall(r'\d{2,4}', digit_runs))  # runs of 2 to 4 digits

# Character Classes and Anchors

In [None]:
# [abc] - matches any ONE character that appears inside the brackets.
# [aeiou] therefore picks out every (lowercase) vowel, one at a time.
vowels = re.findall(r'[aeiou]', 'hello world')
print(vowels)  # ['e', 'o', 'o']

In [None]:
# [^abc] - a leading ^ inside the brackets negates the set: it matches any ONE
# character that is NOT listed. [^aeiou] picks out everything except vowels.
non_vowels = re.findall(r'[^aeiou]', 'hello')
print(non_vowels)  # ['h', 'l', 'l']

In [None]:
# [a-z] - a hyphen between two characters denotes a range; the class matches
# any ONE character within it. Ranges are case-sensitive, so [a-c] matches
# a, b or c but not the uppercase A and B at the end of the sample.
in_range = re.findall(r'[a-c]', 'abcdefgAB')
print(in_range)  # ['a', 'b', 'c']

In [None]:
# ^ - anchors the match to the start of the string.
# Without the re.MULTILINE flag, ^ does NOT match after a newline, so only the
# first 'hello' is found even though the second line also begins with 'hello'.
two_greetings = 'hello world\nhello there'
print(re.findall(r'^hello', two_greetings))  # ['hello']

In [None]:
# $ - anchors the match to the end of the string (or just before a trailing
# newline at the very end).
# Without the re.MULTILINE flag, $ does NOT match at the end of every line, so
# only the final 'world' is found, not the one at the end of the first line.
two_worlds = 'hello world\nbye world'
print(re.findall(r'world$', two_worlds))  # ['world']

# Special Patterns

In [None]:
# Parentheses () create a capturing group that extracts the part of the match
# they enclose. Several groups, as in (abc)-(def), capture separate parts.
# \d+ matches one or more digits, so (\d+)-(\d+) captures the two digit runs
# on either side of a hyphen.
# When a pattern has more than one group, re.findall returns a list of tuples,
# one tuple per match, holding the text captured by each group.
my_str = 'Order: 123-456 has been processed but 789-012 is pending'
order_pairs = re.findall(r'(\d+)-(\d+)', my_str)
print(order_pairs)  # [('123', '456'), ('789', '012')]

In [None]:
# re.search returns a Match object for the FIRST occurrence only;
# .groups() gives a tuple with the text captured by each group.
# (Use re.findall instead when every occurrence in the string is needed.)
match = re.search(r'(\d+)-(\d+)', my_str)
captured = match.groups()
print(captured)

In [None]:
# re.findall returns every occurrence of the pattern in the string.
# With two capturing groups each result is a (first, second) tuple,
# printed here one per line.
matches = re.findall(r'(\d+)-(\d+)', my_str)
for left_digits, right_digits in matches:
    print((left_digits, right_digits))

In [None]:
# | - alternation: A|B matches either pattern A or pattern B
my_str = 'I have a cat and a dog, and another cat'
animal_pattern = re.compile(r'cat|dog')

# search() reports only the first occurrence of "cat" or "dog"
match = animal_pattern.search(my_str)
print(match.group())

# findall() reports every occurrence, in the order they appear
matches = animal_pattern.findall(my_str)
print(matches)

# Converting to a set removes duplicates (sets are unordered, so the printed
# order of the elements is not guaranteed)
unique_matches = set(matches)
print(unique_matches)

# Common Use Cases

In [None]:
import re

# Email pattern (a pragmatic approximation, not a full RFC 5322 validator):
# r"..." makes it a raw string, so Python doesn't interpret backslashes (\) as escape characters.

# [a-zA-Z0-9_.+-]+       # Matches one or more of:
#                        #   letters (a-z, A-Z)
#                        #   digits (0-9)
#                        #   underscore (_), dot (.), plus (+), or hyphen (-)
#                        # This is the local part of the email before the "@" symbol.

# @                      # Matches the "@" symbol that separates local and domain parts.

# [a-zA-Z0-9-]+          # Matches one or more of:
#                        #   letters, digits, or hyphens (-)
#                        # This is the first domain label (e.g., "example", "mail-server").

# (?:\.[a-zA-Z0-9-]+)+   # One or more repetitions of:
#                        #   a literal dot "." followed by one or more letters, digits or hyphens
#                        # Covers the TLD and multi-part suffixes (e.g., ".com", ".co.uk").
#                        # Every dot must be FOLLOWED by a label, so a sentence-ending
#                        # period right after the address is not swallowed into the match.
#                        # (?:...) is non-capturing, so re.findall still returns whole addresses.

email_pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+"

text = "Contact us at support@example.com, sales@company.co.uk, or info@mail-server.net."

emails = re.findall(email_pattern, text)
print(emails)  # ['support@example.com', 'sales@company.co.uk', 'info@mail-server.net']

In [None]:
# URL pattern:
# Finds URLs that start with a protocol (http, https or ftp) inside a larger text.
# The pattern is assembled from adjacent raw-string pieces, one per component;
# Python joins them into a single pattern string.
url_pattern = (
    # Protocol: 'http' with an optional 's' (so 'https' too), or 'ftp'.
    # (?:...) is a NON-capturing group: with a capturing group re.findall
    # would return only the protocol instead of the whole URL.
    r"(?:https?|ftp)"
    # Literal '://'. A forward slash has no special meaning in Python
    # regular expressions, so it does not need to be escaped.
    r"://"
    # The first character after '://' must not be whitespace (\s) or one of
    # the delimiters / $ . ? #  - this rejects things like 'http://.com'.
    r"[^\s/$.?#]"
    # Any single character except a newline, so at least two characters
    # must follow '://'.
    r"."
    # The rest of the URL: zero or more non-whitespace characters
    # (domain, path, query string, ...). The match ends at the first whitespace.
    r"[^\s]*"
)

sample_sentence = 'Visit https://example.com or http://test.org'
print(re.findall(url_pattern, sample_sentence))  # matches URLs

In [None]:
# re.sub(pattern, replacement, string) returns a copy of the string in which
# every occurrence of the pattern is replaced by the replacement text.
# Here each URL is replaced with the literal word 'URL'.
sentence_with_urls = 'Visit https://example.com or http://test.org'
print(re.sub(url_pattern, 'URL', sentence_with_urls))

In [None]:
# Phone number pattern (North American style: area code + prefix + line number).
# The pattern is built from adjacent raw-string pieces, one per component.
phone_pattern = (
    # Left boundary: the number must not be glued to a preceding word character.
    # A lookbehind is used instead of \b because \b needs a word character on
    # one side; it can never match between a space and '+' or '(', which would
    # drop the '+' of '+1 ...' and reject '(555) ...' entirely.
    r"(?<!\w)"
    r"(?:\+?1[-.\s]?)?"   # optional country code: +1, 1-, 1 , …
    r"(?:\([2-9]\d{2}\)|[2-9]\d{2})"  # area code: (555) or 555; first digit 2-9
    r"[-.\s]?"            # optional separator (dash, dot, space, or nothing)
    r"[2-9]\d{2}"         # prefix; first digit 2-9
    r"[-.\s]?"            # optional separator
    r"\d{4}"              # line number
    r"\b"                 # closing word boundary (the match always ends in a digit)
)

# '(123) 456-7890' is rejected on purpose by the pattern's rules: its area
# code starts with 1, and area codes must start with a digit from 2 to 9.
text = "Call (123) 456-7890, 987.654.3210 or +1 800 555 1212 now!"
print(re.findall(phone_pattern, text))  # ['987.654.3210', '+1 800 555 1212']

In [None]:
# Decimal number pattern, piece by piece:
#   \d+  one or more digits before the decimal point
#   \.   a literal dot (escaped, because an unescaped . matches any character)
#   \d+  one or more digits after the decimal point
# Finds numbers such as 10.99, 3.14 or 0.5 in text; whole numbers without a
# decimal point are not matched.
decimal_pattern = r'\d+\.\d+'
price_text = 'Prices: from 10.99 to 14.99'
print(re.findall(decimal_pattern, price_text))  # ['10.99', '14.99']