In [1]:
import re

In [2]:
# Patterns

# .       - Any Character Except New Line  
# \d      - Digit (0-9)  
# \D      - Not a Digit (0-9)  
# \w      - Word Character (a-z, A-Z, 0-9, _)  
# \W      - Not a Word Character  
# \s      - Whitespace (space, tab, newline)  
# \S      - Not Whitespace (space, tab, newline)  
  
# \b      - Word Boundary  
# \B      - Not a Word Boundary  
# ^       - Beginning of a String  
# $       - End of a String  
  
# []      - Matches Characters in brackets - character set
# [^ ]    - Matches Characters NOT in brackets  
# |       - Either Or  
# ( )     - Group  

# Quantifiers:  
# *       - 0 or More  
# +       - 1 or More  
# ?       - 0 or One  
# {3}     - Exact Number  
# {3,4}   - Range of Numbers (Minimum, Maximum)  

# https://www.youtube.com/watch?v=K8L6KVGG-7o

# https://www.machinelearningplus.com/python/python-regex-tutorial-examples/

# Example questions and answers:
# https://www.sanfoundry.com/python-questions-answers-regular-expressions/

In [3]:
# The r prefix creates a raw string: backslashes are kept as literal characters.
# Without the prefix Python interprets escape sequences, so \t becomes a tab.
for sample in (r'\tT', '\tT'):
    print(sample)

\tT
	T


In [4]:
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

# Pattern layout:
#   https?://   - scheme, with an optional "s"
#   (www\.)?    - group 1: optional "www." prefix
#   (\w+)       - group 2: domain name
#   (\.\w+)     - group 3: top-level domain, including the leading dot
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')

# Replace every matched URL with just groups 2 and 3 (domain name + TLD);
# re.sub() accepts an already-compiled pattern as its first argument
subbed_urls = re.sub(pattern, r'\2\3', urls)

print(subbed_urls)


google.com
coreyms.com
youtube.com
nasa.gov



In [5]:
# finditer() yields one re.Match object per non-overlapping match
matches = pattern.finditer(urls)
for url_match in matches:
    print(url_match)

print()

# The iterator above is exhausted after one pass, so request a fresh one.
# Indexing a match with [0] is shorthand for .group(0): the full matched text
matches = pattern.finditer(urls)
for url_match in matches:
    print(url_match[0])

<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 42), match='http://coreyms.com'>
<re.Match object; span=(43, 62), match='https://youtube.com'>
<re.Match object; span=(63, 83), match='https://www.nasa.gov'>

https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov


In [6]:
# Sample text searched by the regex examples in this notebook.
# The literal is a raw string (r prefix) so the lone backslash in the
# metacharacter line stays a literal backslash: in a regular string the
# sequence backslash-space is an invalid escape, which Python warns about
# (DeprecationWarning since 3.6, SyntaxWarning since 3.12).
# The resulting text has the same length as before, so match spans are unchanged.
text_to_search = r'''
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890
Ha HaHa
MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )
coreyms.com
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

In [7]:
# Pattern layout:
#   M(rs|r|s)  - "M" followed by one of "rs", "r" or "s" (the only capture group)
#   \.?        - optional period after the title
#   \s         - exactly one whitespace character
#   [A-Z]\w*   - a capital letter followed by zero or more word characters
pattern = re.compile(r'M(rs|r|s)\.?\s[A-Z]\w*')

# re.finditer() also accepts a compiled pattern; it yields re.Match objects
matches = re.finditer(pattern, text_to_search)
for title_match in matches:
    print(title_match)

<re.Match object; span=(216, 227), match='Mr. Schafer'>
<re.Match object; span=(228, 236), match='Mr Smith'>
<re.Match object; span=(237, 245), match='Ms Davis'>
<re.Match object; span=(246, 259), match='Mrs. Robinson'>
<re.Match object; span=(260, 265), match='Mr. T'>


In [8]:
# The pattern has exactly one capture group, so findall() returns a list of
# that group's text for each match (plain strings, not the full match)
matches = pattern.findall(text_to_search)

for group_text in matches:
    print(group_text)

r
r
s
rs
r


In [9]:
# search() scans the text and returns only the first match, as a single
# re.Match object (or None when nothing matches)
matches = re.search(pattern, text_to_search)

print(matches)

<re.Match object; span=(216, 227), match='Mr. Schafer'>


In [10]:
sentence = 'Start a sentence and then bring it to an end'

# re.I is the short alias of re.IGNORECASE: letters match regardless of case,
# so the lowercase pattern 'start' still matches the capitalised 'Start'
pattern = re.compile(r'start', flags=re.I)
matches = pattern.search(sentence)
print(matches)

<re.Match object; span=(0, 5), match='Start'>
