# Regular Expressions


\.      - Any Character Except New Line

\d      - Digit (0-9)

\D      - Not a Digit (0-9)

\w      - Word Character (a-z, A-Z, 0-9, _)

\W      - Not a Word Character

\s      - Whitespace (space, tab, newline)

\S      - Not Whitespace (space, tab, newline)


\b      - Word Boundary

\B      - Not a Word Boundary

^       - Beginning of a String

$       - End of a String

[]      - Matches Characters in brackets

[^ ]    - Matches Characters NOT in brackets

|       - Either Or

( )     - Group

**Quantifiers:**

\*       - 0 or More

\+       - 1 or More

?       - 0 or One

{3}     - Exact Number

{3,4}   - Range of Numbers (Minimum, Maximum)


#### Sample Regexes ####

[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+

In [1]:
import re

## Raw strings

In [2]:
# Multi-line sample text searched by most of the examples in this notebook.
# It is a raw string (r'''...''') so the lone backslash in the metacharacter
# list is kept literally and Python does not warn about an invalid "\ " escape
# sequence. The text itself is unchanged, so the spans shown in the recorded
# outputs still apply.
text_to_search = r'''
abcdefghijklmnopqurtuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890

Ha HaHa

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )

coreyms.com

321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234

Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

# Single-line sample used by the ^ (start) and $ (end) anchor examples.
sentence = 'Start a sentence and then bring it to an end'

## Search for patterns basics

### Search for abc (case sensitive)

In [3]:
## Specify pattern
pattern = re.compile(r'abc')
type(pattern)

re.Pattern

In [4]:
matches = pattern.finditer(text_to_search)

In [5]:
for match in matches:
    print(match)

<re.Match object; span=(1, 4), match='abc'>


span is where **abc** was found: it starts at index 1 and ends just before index 4 (the end index is exclusive).

In [6]:
text_to_search[1:4]

'abc'

### Search for cba (case sensitive)

In [7]:
pattern = re.compile(r'cba')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

### Search for .

. is a special character so use a \ to find the period.

Characters that need to be escaped:

. ^ $ * + ? { } [ ] \ | ( )

In [8]:
pattern = re.compile(r'\.')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(113, 114), match='.'>
<re.Match object; span=(149, 150), match='.'>
<re.Match object; span=(171, 172), match='.'>
<re.Match object; span=(175, 176), match='.'>
<re.Match object; span=(223, 224), match='.'>
<re.Match object; span=(254, 255), match='.'>
<re.Match object; span=(267, 268), match='.'>


### Search for coreyms.com

In [9]:
pattern = re.compile(r'coreyms\.com')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(142, 153), match='coreyms.com'>


### Search for any digit 0-9

In [10]:
pattern = re.compile(r'\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(55, 56), match='1'>
<re.Match object; span=(56, 57), match='2'>
<re.Match object; span=(57, 58), match='3'>
<re.Match object; span=(58, 59), match='4'>
<re.Match object; span=(59, 60), match='5'>
<re.Match object; span=(60, 61), match='6'>
<re.Match object; span=(61, 62), match='7'>
<re.Match object; span=(62, 63), match='8'>
<re.Match object; span=(63, 64), match='9'>
<re.Match object; span=(64, 65), match='0'>
<re.Match object; span=(155, 156), match='3'>
<re.Match object; span=(156, 157), match='2'>
<re.Match object; span=(157, 158), match='1'>
<re.Match object; span=(159, 160), match='5'>
<re.Match object; span=(160, 161), match='5'>
<re.Match object; span=(161, 162), match='5'>
<re.Match object; span=(163, 164), match='4'>
<re.Match object; span=(164, 165), match='3'>
<re.Match object; span=(165, 166), match='2'>
<re.Match object; span=(166, 167), match='1'>
<re.Match object; span=(168, 169), match='1'>
<re.Match object; span=(169, 170), match='2'>
<re.Matc

### Search for any non digit

In [11]:
pattern = re.compile(r'\D')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='\n'>
<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(3, 4), match='c'>
<re.Match object; span=(4, 5), match='d'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='f'>
<re.Match object; span=(7, 8), match='g'>
<re.Match object; span=(8, 9), match='h'>
<re.Match object; span=(9, 10), match='i'>
<re.Match object; span=(10, 11), match='j'>
<re.Match object; span=(11, 12), match='k'>
<re.Match object; span=(12, 13), match='l'>
<re.Match object; span=(13, 14), match='m'>
<re.Match object; span=(14, 15), match='n'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(16, 17), match='p'>
<re.Match object; span=(17, 18), match='q'>
<re.Match object; span=(18, 19), match='u'>
<re.Match object; span=(19, 20), match='r'>
<re.Match object; span=(20, 21), match='t'>
<re.Match object; span=(21, 22), match='u'>
<re.Match object; span=(22, 23), match='v'>
<re.Ma

### Search for word character (a-z, A-Z, 0-9, _)

In [12]:
pattern = re.compile(r'\w')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(3, 4), match='c'>
<re.Match object; span=(4, 5), match='d'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='f'>
<re.Match object; span=(7, 8), match='g'>
<re.Match object; span=(8, 9), match='h'>
<re.Match object; span=(9, 10), match='i'>
<re.Match object; span=(10, 11), match='j'>
<re.Match object; span=(11, 12), match='k'>
<re.Match object; span=(12, 13), match='l'>
<re.Match object; span=(13, 14), match='m'>
<re.Match object; span=(14, 15), match='n'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(16, 17), match='p'>
<re.Match object; span=(17, 18), match='q'>
<re.Match object; span=(18, 19), match='u'>
<re.Match object; span=(19, 20), match='r'>
<re.Match object; span=(20, 21), match='t'>
<re.Match object; span=(21, 22), match='u'>
<re.Match object; span=(22, 23), match='v'>
<re.Match object; span=(23, 24), match='w'>
<re.M

### Search for not a word character

In [13]:
pattern = re.compile(r'\W')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='\n'>
<re.Match object; span=(27, 28), match='\n'>
<re.Match object; span=(54, 55), match='\n'>
<re.Match object; span=(65, 66), match='\n'>
<re.Match object; span=(66, 67), match='\n'>
<re.Match object; span=(69, 70), match=' '>
<re.Match object; span=(74, 75), match='\n'>
<re.Match object; span=(75, 76), match='\n'>
<re.Match object; span=(90, 91), match=' '>
<re.Match object; span=(91, 92), match='('>
<re.Match object; span=(96, 97), match=' '>
<re.Match object; span=(99, 100), match=' '>
<re.Match object; span=(102, 103), match=' '>
<re.Match object; span=(110, 111), match=')'>
<re.Match object; span=(111, 112), match=':'>
<re.Match object; span=(112, 113), match='\n'>
<re.Match object; span=(113, 114), match='.'>
<re.Match object; span=(114, 115), match=' '>
<re.Match object; span=(115, 116), match='^'>
<re.Match object; span=(116, 117), match=' '>
<re.Match object; span=(117, 118), match='$'>
<re.Match object; span=(118, 119), match=' '>
<re.M

### Search for whitespace (space, tab, newline)

In [14]:
pattern = re.compile(r'\s')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='\n'>
<re.Match object; span=(27, 28), match='\n'>
<re.Match object; span=(54, 55), match='\n'>
<re.Match object; span=(65, 66), match='\n'>
<re.Match object; span=(66, 67), match='\n'>
<re.Match object; span=(69, 70), match=' '>
<re.Match object; span=(74, 75), match='\n'>
<re.Match object; span=(75, 76), match='\n'>
<re.Match object; span=(90, 91), match=' '>
<re.Match object; span=(96, 97), match=' '>
<re.Match object; span=(99, 100), match=' '>
<re.Match object; span=(102, 103), match=' '>
<re.Match object; span=(112, 113), match='\n'>
<re.Match object; span=(114, 115), match=' '>
<re.Match object; span=(116, 117), match=' '>
<re.Match object; span=(118, 119), match=' '>
<re.Match object; span=(120, 121), match=' '>
<re.Match object; span=(122, 123), match=' '>
<re.Match object; span=(124, 125), match=' '>
<re.Match object; span=(126, 127), match=' '>
<re.Match object; span=(128, 129), match=' '>
<re.Match object; span=(130, 131), match=' '>
<re

### Search for not whitespace (~space, ~tab, ~newline)

In [15]:
pattern = re.compile(r'\S')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(3, 4), match='c'>
<re.Match object; span=(4, 5), match='d'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='f'>
<re.Match object; span=(7, 8), match='g'>
<re.Match object; span=(8, 9), match='h'>
<re.Match object; span=(9, 10), match='i'>
<re.Match object; span=(10, 11), match='j'>
<re.Match object; span=(11, 12), match='k'>
<re.Match object; span=(12, 13), match='l'>
<re.Match object; span=(13, 14), match='m'>
<re.Match object; span=(14, 15), match='n'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(16, 17), match='p'>
<re.Match object; span=(17, 18), match='q'>
<re.Match object; span=(18, 19), match='u'>
<re.Match object; span=(19, 20), match='r'>
<re.Match object; span=(20, 21), match='t'>
<re.Match object; span=(21, 22), match='u'>
<re.Match object; span=(22, 23), match='v'>
<re.Match object; span=(23, 24), match='w'>
<re.M

### Search for Ha using word boundaries
Matches the standalone word **Ha** and the first **Ha** in HaHa, because each one starts at a word boundary.

In [16]:
pattern = re.compile(r'\bHa')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(67, 69), match='Ha'>
<re.Match object; span=(70, 72), match='Ha'>


Finds **Ha** and **Ha**Ha

In [17]:
text_to_search[67:74]

'Ha HaHa'

### Search for Ha that is not at a word boundary (`\B`)

In [18]:
pattern = re.compile(r'\BHa')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(72, 74), match='Ha'>


Finds Ha**Ha**

In [19]:
text_to_search[72:76]

'Ha\n\n'

### Find pattern at the beginning of a string (Start)

In [20]:
sentence

'Start a sentence and then bring it to an end'

In [21]:
pattern = re.compile(r'^Start')
matches = pattern.finditer(sentence)
for match in matches:
    print(match)

<re.Match object; span=(0, 5), match='Start'>


In [22]:
pattern = re.compile(r'^sentence')
matches = pattern.finditer(sentence)
for match in matches:
    print(match)

### Find pattern at the end of a string (end)

In [23]:
pattern = re.compile(r'end$')
matches = pattern.finditer(sentence)
for match in matches:
    print(match)

<re.Match object; span=(41, 44), match='end'>


In [24]:
pattern = re.compile(r'an$')
matches = pattern.finditer(sentence)
for match in matches:
    print(match)

## Practical examples

### Search for phone numbers
- xxx.xxx.xxxx

In [25]:
pattern = re.compile(r'\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(55, 58), match='123'>
<re.Match object; span=(58, 61), match='456'>
<re.Match object; span=(61, 64), match='789'>
<re.Match object; span=(155, 158), match='321'>
<re.Match object; span=(159, 162), match='555'>
<re.Match object; span=(163, 166), match='432'>
<re.Match object; span=(168, 171), match='123'>
<re.Match object; span=(172, 175), match='555'>
<re.Match object; span=(176, 179), match='123'>
<re.Match object; span=(181, 184), match='123'>
<re.Match object; span=(185, 188), match='555'>
<re.Match object; span=(189, 192), match='123'>
<re.Match object; span=(194, 197), match='800'>
<re.Match object; span=(198, 201), match='555'>
<re.Match object; span=(202, 205), match='123'>
<re.Match object; span=(207, 210), match='900'>
<re.Match object; span=(211, 214), match='555'>
<re.Match object; span=(215, 218), match='123'>


In [26]:
## 3 numbers and a dot
pattern = re.compile(r'\d\d\d\.')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(168, 172), match='123.'>
<re.Match object; span=(172, 176), match='555.'>


In [27]:
## Phone number with dots
pattern = re.compile(r'\d\d\d\.\d\d\d\.\d\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(168, 180), match='123.555.1234'>


An unescaped `.` matches any character, so any separator between the digit groups is accepted.

### Phone number with any separator

In [28]:
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(155, 167), match='321-555-4321'>
<re.Match object; span=(168, 180), match='123.555.1234'>
<re.Match object; span=(181, 193), match='123*555*1234'>
<re.Match object; span=(194, 206), match='800-555-1234'>
<re.Match object; span=(207, 219), match='900-555-1234'>


### Phone number with character set

In [29]:
pattern = re.compile(r'\d\d\d[-.]\d\d\d[-.]\d\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(155, 167), match='321-555-4321'>
<re.Match object; span=(168, 180), match='123.555.1234'>
<re.Match object; span=(194, 206), match='800-555-1234'>
<re.Match object; span=(207, 219), match='900-555-1234'>


### Find phone numbers in text file

In [30]:
with open('data.txt') as f:
    contents = f.read()
    print(contents[:175])

Dave Martin
615-555-7164
173 Main St., Springfield RI 55924
davemartin@bogusemail.com

Charles Harris
800-555-5669
969 High St., Atlantis VA 34075
charlesharris@bogusemail.com


### Search for a - or . separator between digit groups

Will match either the - or .

In [31]:
with open('data.txt', 'r') as f:
    contents = f.read()
    
    pattern = re.compile(r'\d\d\d[-.]\d\d\d[-.]\d\d\d\d')
    matches = pattern.finditer(contents)
    
    for match in matches:
        print(match)

<re.Match object; span=(12, 24), match='615-555-7164'>
<re.Match object; span=(102, 114), match='800-555-5669'>
<re.Match object; span=(191, 203), match='560-555-5153'>
<re.Match object; span=(281, 293), match='900-555-9340'>
<re.Match object; span=(378, 390), match='714-555-7405'>
<re.Match object; span=(467, 479), match='800-555-6771'>
<re.Match object; span=(557, 569), match='783-555-4799'>
<re.Match object; span=(647, 659), match='516-555-4615'>
<re.Match object; span=(740, 752), match='127-555-1867'>
<re.Match object; span=(831, 843), match='608-555-4938'>
<re.Match object; span=(917, 929), match='568-555-6051'>
<re.Match object; span=(1005, 1017), match='292-555-1875'>
<re.Match object; span=(1093, 1105), match='900-555-3205'>
<re.Match object; span=(1182, 1194), match='614-555-1166'>
<re.Match object; span=(1273, 1285), match='530-555-2676'>
<re.Match object; span=(1359, 1371), match='470-555-2750'>
<re.Match object; span=(1443, 1455), match='800-555-6089'>
<re.Match object; spa

### Match 800 or 900 number

In [32]:
pattern = re.compile(r'[89]00[-.]\d\d\d[-.]\d\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(194, 206), match='800-555-1234'>
<re.Match object; span=(207, 219), match='900-555-1234'>


In [33]:
with open('data.txt', 'r') as f:
    contents = f.read()
    
    pattern = re.compile(r'[89]00[-.]\d\d\d[-.]\d\d\d\d')
    matches = pattern.finditer(contents)
    
    for match in matches:
        print(match)

<re.Match object; span=(102, 114), match='800-555-5669'>
<re.Match object; span=(281, 293), match='900-555-9340'>
<re.Match object; span=(467, 479), match='800-555-6771'>
<re.Match object; span=(1093, 1105), match='900-555-3205'>
<re.Match object; span=(1443, 1455), match='800-555-6089'>
<re.Match object; span=(1794, 1806), match='800-555-7100'>
<re.Match object; span=(2055, 2067), match='900-555-5118'>
<re.Match object; span=(2830, 2842), match='900-555-5428'>
<re.Match object; span=(3290, 3302), match='800-555-8810'>
<re.Match object; span=(3977, 3989), match='900-555-9598'>
<re.Match object; span=(4951, 4963), match='800-555-2420'>
<re.Match object; span=(5572, 5584), match='900-555-3567'>
<re.Match object; span=(6195, 6207), match='800-555-3216'>
<re.Match object; span=(6897, 6909), match='900-555-7755'>
<re.Match object; span=(7872, 7884), match='800-555-1372'>
<re.Match object; span=(8751, 8763), match='900-555-6426'>


### Specify range of values between 1-5

In [34]:
pattern = re.compile(r'[1-5]')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(55, 56), match='1'>
<re.Match object; span=(56, 57), match='2'>
<re.Match object; span=(57, 58), match='3'>
<re.Match object; span=(58, 59), match='4'>
<re.Match object; span=(59, 60), match='5'>
<re.Match object; span=(155, 156), match='3'>
<re.Match object; span=(156, 157), match='2'>
<re.Match object; span=(157, 158), match='1'>
<re.Match object; span=(159, 160), match='5'>
<re.Match object; span=(160, 161), match='5'>
<re.Match object; span=(161, 162), match='5'>
<re.Match object; span=(163, 164), match='4'>
<re.Match object; span=(164, 165), match='3'>
<re.Match object; span=(165, 166), match='2'>
<re.Match object; span=(166, 167), match='1'>
<re.Match object; span=(168, 169), match='1'>
<re.Match object; span=(169, 170), match='2'>
<re.Match object; span=(170, 171), match='3'>
<re.Match object; span=(172, 173), match='5'>
<re.Match object; span=(173, 174), match='5'>
<re.Match object; span=(174, 175), match='5'>
<re.Match object; span=(176, 177), match='1'

### Specify range of values between a-z

In [35]:
pattern = re.compile(r'[a-z]')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(3, 4), match='c'>
<re.Match object; span=(4, 5), match='d'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='f'>
<re.Match object; span=(7, 8), match='g'>
<re.Match object; span=(8, 9), match='h'>
<re.Match object; span=(9, 10), match='i'>
<re.Match object; span=(10, 11), match='j'>
<re.Match object; span=(11, 12), match='k'>
<re.Match object; span=(12, 13), match='l'>
<re.Match object; span=(13, 14), match='m'>
<re.Match object; span=(14, 15), match='n'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(16, 17), match='p'>
<re.Match object; span=(17, 18), match='q'>
<re.Match object; span=(18, 19), match='u'>
<re.Match object; span=(19, 20), match='r'>
<re.Match object; span=(20, 21), match='t'>
<re.Match object; span=(21, 22), match='u'>
<re.Match object; span=(22, 23), match='v'>
<re.Match object; span=(23, 24), match='w'>
<re.M

### Specify range of values between a-z and A-Z

In [36]:
pattern = re.compile(r'[a-zA-Z]')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(1, 2), match='a'>
<re.Match object; span=(2, 3), match='b'>
<re.Match object; span=(3, 4), match='c'>
<re.Match object; span=(4, 5), match='d'>
<re.Match object; span=(5, 6), match='e'>
<re.Match object; span=(6, 7), match='f'>
<re.Match object; span=(7, 8), match='g'>
<re.Match object; span=(8, 9), match='h'>
<re.Match object; span=(9, 10), match='i'>
<re.Match object; span=(10, 11), match='j'>
<re.Match object; span=(11, 12), match='k'>
<re.Match object; span=(12, 13), match='l'>
<re.Match object; span=(13, 14), match='m'>
<re.Match object; span=(14, 15), match='n'>
<re.Match object; span=(15, 16), match='o'>
<re.Match object; span=(16, 17), match='p'>
<re.Match object; span=(17, 18), match='q'>
<re.Match object; span=(18, 19), match='u'>
<re.Match object; span=(19, 20), match='r'>
<re.Match object; span=(20, 21), match='t'>
<re.Match object; span=(21, 22), match='u'>
<re.Match object; span=(22, 23), match='v'>
<re.Match object; span=(23, 24), match='w'>
<re.M

### Specify opposite of character set

In [37]:
pattern = re.compile(r'[^a-zA-Z]')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(0, 1), match='\n'>
<re.Match object; span=(27, 28), match='\n'>
<re.Match object; span=(54, 55), match='\n'>
<re.Match object; span=(55, 56), match='1'>
<re.Match object; span=(56, 57), match='2'>
<re.Match object; span=(57, 58), match='3'>
<re.Match object; span=(58, 59), match='4'>
<re.Match object; span=(59, 60), match='5'>
<re.Match object; span=(60, 61), match='6'>
<re.Match object; span=(61, 62), match='7'>
<re.Match object; span=(62, 63), match='8'>
<re.Match object; span=(63, 64), match='9'>
<re.Match object; span=(64, 65), match='0'>
<re.Match object; span=(65, 66), match='\n'>
<re.Match object; span=(66, 67), match='\n'>
<re.Match object; span=(69, 70), match=' '>
<re.Match object; span=(74, 75), match='\n'>
<re.Match object; span=(75, 76), match='\n'>
<re.Match object; span=(90, 91), match=' '>
<re.Match object; span=(91, 92), match='('>
<re.Match object; span=(96, 97), match=' '>
<re.Match object; span=(99, 100), match=' '>
<re.Match object; span=(10

### Search for words ending in "at" that do not start with b

In [38]:
new_string = '''
cat
mat
pat
bat
'''

pattern = re.compile(r'[^b]at')
matches = re.findall(pattern, new_string)
matches

['cat', 'mat', 'pat']

In [39]:
matches = pattern.finditer(new_string)
for match in matches:
    print(match)

<re.Match object; span=(1, 4), match='cat'>
<re.Match object; span=(5, 8), match='mat'>
<re.Match object; span=(9, 12), match='pat'>


### Use exact number quantifiers when searching for phone numbers

In [40]:
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(155, 167), match='321-555-4321'>
<re.Match object; span=(168, 180), match='123.555.1234'>
<re.Match object; span=(181, 193), match='123*555*1234'>
<re.Match object; span=(194, 206), match='800-555-1234'>
<re.Match object; span=(207, 219), match='900-555-1234'>


In [41]:
pattern = re.compile(r'\d{3}.\d{3}.\d{4}')
matches = pattern.finditer(text_to_search)
for match in matches:
    print(match)

<re.Match object; span=(155, 167), match='321-555-4321'>
<re.Match object; span=(168, 180), match='123.555.1234'>
<re.Match object; span=(181, 193), match='123*555*1234'>
<re.Match object; span=(194, 206), match='800-555-1234'>
<re.Match object; span=(207, 219), match='900-555-1234'>


### Search for all Mr (last name)

In [42]:
new_string = '''
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

In [43]:
pattern = re.compile(r'Mr\.')
matches = re.finditer(pattern, new_string)
for match in matches:
    print(match)

<re.Match object; span=(1, 4), match='Mr.'>
<re.Match object; span=(45, 48), match='Mr.'>


In [44]:
pattern = re.compile(r'Mr\.?')
matches = re.finditer(pattern, new_string)
for match in matches:
    print(match)

<re.Match object; span=(1, 4), match='Mr.'>
<re.Match object; span=(13, 15), match='Mr'>
<re.Match object; span=(31, 33), match='Mr'>
<re.Match object; span=(45, 48), match='Mr.'>


In [45]:
pattern = re.compile(r'Mr\.?\s[A-Z]')
matches = re.finditer(pattern, new_string)
for match in matches:
    print(match)

<re.Match object; span=(1, 6), match='Mr. S'>
<re.Match object; span=(13, 17), match='Mr S'>
<re.Match object; span=(45, 50), match='Mr. T'>


In [46]:
pattern = re.compile(r'Mr\.?\s[A-Z]\w*')
matches = re.finditer(pattern, new_string)
for match in matches:
    print(match)

<re.Match object; span=(1, 12), match='Mr. Schafer'>
<re.Match object; span=(13, 21), match='Mr Smith'>
<re.Match object; span=(45, 50), match='Mr. T'>


- `Mr\.?` matches Mr. or Mr (the period is optional)
- `\s` matches a single whitespace character
- `[A-Z]` matches one capital letter
- `\w*` matches zero or more word characters (the rest of the name, if any)

### Search for a group of words or letters - Mr/Mrs/Ms

In [47]:
new_string = '''
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

In [48]:
# Title is "M" followed by r, s or rs, then an optional period.
# The period is escaped (\.) so that only a literal '.' is accepted after the
# title; an unescaped '.' would accept any character there.
pattern = re.compile(r'M(r|s|rs)\.?\s[A-Z]\w*')
matches = re.finditer(pattern, new_string)
for match in matches:
    print(match)

<re.Match object; span=(1, 12), match='Mr. Schafer'>
<re.Match object; span=(13, 21), match='Mr Smith'>
<re.Match object; span=(22, 30), match='Ms Davis'>
<re.Match object; span=(31, 44), match='Mrs. Robinson'>
<re.Match object; span=(45, 50), match='Mr. T'>


In [49]:
# Same search with the titles spelled out in full inside the group.
# The period is escaped (\.) so that only a literal '.' is accepted after the
# title; an unescaped '.' would accept any character there.
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*')
matches = re.finditer(pattern, new_string)
for match in matches:
    print(match)

<re.Match object; span=(1, 12), match='Mr. Schafer'>
<re.Match object; span=(13, 21), match='Mr Smith'>
<re.Match object; span=(22, 30), match='Ms Davis'>
<re.Match object; span=(31, 44), match='Mrs. Robinson'>
<re.Match object; span=(45, 50), match='Mr. T'>


### Search for emails

In [50]:
emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
'''

In [51]:
# One or more ASCII letters immediately followed by '@'.
# The upper-case range is written A-Z: the range A-z would also admit the
# punctuation that sits between 'Z' and 'a' ([ \ ] ^ _ and the backtick).
pattern = re.compile(r'[a-zA-Z]+@')
matches = re.finditer(pattern, emails)
for match in matches:
    print(match)

<re.Match object; span=(1, 15), match='CoreyMSchafer@'>
<re.Match object; span=(31, 39), match='schafer@'>
<re.Match object; span=(64, 72), match='schafer@'>


In [52]:
# Letters, '@', letters, then a literal ".com".
# The upper-case range is written A-Z: the range A-z would also admit the
# punctuation that sits between 'Z' and 'a' ([ \ ] ^ _ and the backtick).
pattern = re.compile(r'[a-zA-Z]+@[a-zA-Z]+\.com')
matches = re.finditer(pattern, emails)
for match in matches:
    print(match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>


In [53]:
# Local part: letters, digits, dots and hyphens. Domain: letters and hyphens.
# The top-level domain is limited to com, edu or net.
# The upper-case range is written A-Z: the range A-z would also admit the
# punctuation that sits between 'Z' and 'a' ([ \ ] ^ _ and the backtick).
pattern = re.compile(r'[a-zA-Z0-9.-]+@[a-zA-Z-]+\.(com|edu|net)')
matches = re.finditer(pattern, emails)
for match in matches:
    print(match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
<re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


In [54]:
pattern = re.compile(r'[\w.-]+@[\w-]+\.(com|edu|net)')
matches = re.finditer(pattern, emails)
for match in matches:
    print(match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
<re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


## Search for urls and use the match object

In [55]:
urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

In [56]:
pattern = re.compile(r'https')
matches = re.finditer(pattern, urls)
for match in matches:
    print(match)

<re.Match object; span=(1, 6), match='https'>
<re.Match object; span=(43, 48), match='https'>
<re.Match object; span=(63, 68), match='https'>


In [57]:
pattern = re.compile(r'https?')
matches = re.finditer(pattern, urls)
for match in matches:
    print(match)

<re.Match object; span=(1, 6), match='https'>
<re.Match object; span=(24, 28), match='http'>
<re.Match object; span=(43, 48), match='https'>
<re.Match object; span=(63, 68), match='https'>


In [58]:
pattern = re.compile(r'https?://')
matches = re.finditer(pattern, urls)
for match in matches:
    print(match)

<re.Match object; span=(1, 9), match='https://'>
<re.Match object; span=(24, 31), match='http://'>
<re.Match object; span=(43, 51), match='https://'>
<re.Match object; span=(63, 71), match='https://'>


In [59]:
pattern = re.compile(r'https?://(www\.)')
matches = re.finditer(pattern, urls)
for match in matches:
    print(match)

<re.Match object; span=(1, 13), match='https://www.'>
<re.Match object; span=(63, 75), match='https://www.'>


In [60]:
pattern = re.compile(r'https?://(www\.)?\w+\.\w+')
matches = re.finditer(pattern, urls)
for match in matches:
    print(match)

<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 42), match='http://coreyms.com'>
<re.Match object; span=(43, 62), match='https://youtube.com'>
<re.Match object; span=(63, 83), match='https://www.nasa.gov'>


### Use the match object

In [61]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = re.finditer(pattern, urls)
for match in matches:
    print(match)

<re.Match object; span=(1, 23), match='https://www.google.com'>
<re.Match object; span=(24, 42), match='http://coreyms.com'>
<re.Match object; span=(43, 62), match='https://youtube.com'>
<re.Match object; span=(63, 83), match='https://www.nasa.gov'>


In [62]:
## Specify everything in groups to use the group method
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = re.finditer(pattern, urls)
for match in matches:
    print(match.group(0))

https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov


In [63]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = re.finditer(pattern, urls)
for match in matches:
    print(match.group(1))

www.
None
None
www.


In [64]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = re.finditer(pattern, urls)
for match in matches:
    print(match.group(2))

google
coreyms
youtube
nasa


In [65]:
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)')
matches = re.finditer(pattern, urls)
for match in matches:
    print(match.group(3))

.com
.com
.com
.gov


### Substitute each match using group back-references

In [66]:
subbed_url = pattern.sub(r'\2', urls)
print(subbed_url)


google
coreyms
youtube
nasa



In [67]:
subbed_url = pattern.sub(r'\2\3', urls)
print(subbed_url)


google.com
coreyms.com
youtube.com
nasa.gov



## Use the findall / finditer / group methods
`finditer` returns an iterator of match objects, which carry extra information such as the span and the captured groups.

`findall` returns the matches as a list of strings.

In [73]:
new_string = '''
Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T
'''

### finditer

In [81]:
# finditer yields one match object per hit (full text plus span).
# The period is escaped (\.) so that only a literal '.' is accepted after the
# title; an unescaped '.' would accept any character there.
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*')
matches = re.finditer(pattern, new_string)
for match in matches:
    print(match)

<re.Match object; span=(1, 12), match='Mr. Schafer'>
<re.Match object; span=(13, 21), match='Mr Smith'>
<re.Match object; span=(22, 30), match='Ms Davis'>
<re.Match object; span=(31, 44), match='Mrs. Robinson'>
<re.Match object; span=(45, 50), match='Mr. T'>


### findall
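When the pattern contains capture groups, `findall` returns only the captured groups rather than the whole match: a string per match for a single group, a tuple per match for several groups.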

In [75]:
# With a single capture group, findall returns just that group's text for
# each match (the title), not the whole matched name.
# The period is escaped (\.) so that only a literal '.' is accepted after the
# title; an unescaped '.' would accept any character there.
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*')
matches = re.findall(pattern, new_string)
for match in matches:
    print(match)

Mr
Mr
Ms
Mrs
Mr


In [76]:
# With two capture groups, findall returns a (title, name) tuple per match.
# The name group includes the leading whitespace because \s is inside it.
# The period is escaped (\.) so that only a literal '.' is accepted after the
# title; an unescaped '.' would accept any character there.
pattern = re.compile(r'(Mr|Ms|Mrs)\.?(\s[A-Z]\w*)')
matches = re.findall(pattern, new_string)
for match in matches:
    print(match)

('Mr', ' Schafer')
('Mr', ' Smith')
('Ms', ' Davis')
('Mrs', ' Robinson')
('Mr', ' T')


In [72]:
phone_numbers = '''
321-555-4321
123.555.1234
123*555*1234
800-555-1234
900-555-1234
'''

pattern = re.compile(r'\d{3}.\d{3}.\d{4}')
matches = re.findall(pattern, phone_numbers)

print(matches)

['321-555-4321', '123.555.1234', '123*555*1234', '800-555-1234', '900-555-1234']
