# Pattern Matching With Regular Expressions

## Finding Patterns of Text Without Regular Expressions

In [1]:
# isPhoneNumber.py
def isPhoneNumber(text):
    if len(text) != 12:
        return False
    for i in range(0, 3):
        if not text[i].isdecimal():
            return False
    if text[3] != '-':
        return False
    for i in range(4, 7):
        if not text[i].isdecimal():
            return False
    if text[7] != '-':
        return False
    for i in range(8, 12):
        if not text[i].isdecimal():
            return False
    return True

# print('Is 415-555-4242 a phone number?')
# print(isPhoneNumber('415-555-4242'))
# print('Is Moshi moshi a phone number?')
# print(isPhoneNumber('Moshi moshi'))

message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'
for i in range(len(message)):
    chunk = message[i:i+12]
    if isPhoneNumber(chunk):
        print('Phone number found: ' + chunk)
print('Done')

Phone number found: 415-555-1011
Phone number found: 415-555-9999
Done


## Finding Patterns of Text With Regular Expressions

In [2]:
import re
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

### Matching Regex Objects

In [3]:
import re
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo = phoneNumRegex.search('My number is 415-555-4242.')
print('Phone number found: ' + mo.group())

Phone number found: 415-555-4242


### Review of Regular Expression Matching
1. Import the regex module with import re.
2. Create a Regex object with the re.compile() function. (Remember to use a raw string.)
3. Pass the stirng you want to search into the Regex object's search() method. This returns a Match object.
4. Call the Match object's group() method to return a string of the actual matched text.

## More Pattern Matching with Regular Expressions

### Grouping with Parentheses

In [4]:
import re
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My number is 415-555-4242.')
print(mo.group(1))
print(mo.group(2))
print(mo.group(0))
print(mo.group())

# retrieve all the groups
print(mo.groups())
# multiple-assignment trick
areaCode, mainNumber = mo.groups()
print(areaCode)
print(mainNumber)

415
555-4242
415-555-4242
415-555-4242
('415', '555-4242')
415
555-4242


In [5]:
import re
phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My phone number is (415) 555-4242.')
print(mo.group(1))
print(mo.group(2))

(415)
555-4242


In [6]:
import re
# re.compile(r'(\(Parentheses\)') error: missing ), unterminated subpattern at position 0

### Matching Multiple Groups with the Pipe

In [7]:
# the first occurrence of matching text will be returned as the Match object.
import re
heroRegex = re.compile(r'Batman|Tina Fey')
mo1 = heroRegex.search('Batman and Tina Fey')
print(mo1.group())

mo2 = heroRegex.search('Tina Fey and Batman')
print(mo2.group())

Batman
Tina Fey


In [8]:
import re
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')
print(mo.group())
print(mo.group(1))

Batmobile
mobile


### Optional Matching with the Question Mark

In [9]:
import re
batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('The Adventures of Batman')
print(mo1.group())

mo2 = batRegex.search('The Adventures of Batwoman')
print(mo2.group())

Batman
Batwoman


In [10]:
import re
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo1 = phoneRegex.search('My number is 415-555-4242')
print(mo1.group())

mo2 = phoneRegex.search('My number is 555-4242')
print(mo2.group())

415-555-4242
555-4242


### Matching Zero or More with the Star

In [11]:
import re
batRegex = re.compile(r'Bat(wo)*man')
mo1 = batRegex.search('The Adventures of Batman')
print(mo1.group())

mo2 = batRegex.search('The Adventures of Batwoman')
print(mo2.group())

mo3 = batRegex.search('The Adventures of Batwowowoman')
print(mo3.group())

Batman
Batwoman
Batwowowoman


### Matching One or More with the Plus

In [12]:
import re
batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('The Adventures of Batwoman')
print(mo1.group())

mo2 = batRegex.search('The Adventures of Batwowowowoman')
print(mo2.group())

mo3 = batRegex.search('The Adventures of Batman')
print(mo3 == None)

Batwoman
Batwowowowoman
True


### Matching Specific Repetitions with Braces

In [13]:
# (Ha){3} == (Ha)(Ha)(Ha)
# (Ha){3,5} == ((Ha)(Ha)(Ha))|((Ha)(Ha)(Ha)(Ha))|((Ha)(Ha)(Ha)(Ha)(Ha))
import re
haRegex = re.compile(r'(Ha){3}')
mo1 = haRegex.search('HaHaHa')
print(mo1.group())

mo2 = haRegex.search('Ha')
print(mo2 == None)

HaHaHa
True


## Greedy and Non-greedy Matching

In [14]:
import re
greedyHaRegex = re.compile(r'(Ha){3,5}')
mo1 = greedyHaRegex.search('HaHaHaHaHa')
print(mo1.group())

nongreedyHaRagex = re.compile(r'(Ha){3,5}?')
mo2 = nongreedyHaRagex.search('HaHaHaHaHa')
print(mo2.group())

"""
The question mark has two meanings in regular expressions:
1. declaring a non-greedy match
2. flagging an optional group
"""

HaHaHaHaHa
HaHaHa


'\nThe question mark has two meanings in regular expressions:\n1. declaring a non-greedy match\n2. flagging an optional group\n'

## The findall() Method

In [15]:
import re
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
# search() returns a Match object only on the first instance of matching text
mo = phoneNumRegex.search('Cell: 415-555-9999 Work: 212-555-0000')
mo.group()

'415-555-9999'

In [16]:
import re
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d') # has no groups
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

['415-555-9999', '212-555-0000']

In [17]:
import re
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)') # has groups
phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000')

[('415', '555', '9999'), ('212', '555', '0000')]

In [18]:
"""
To summarize what the findall() method returns.
1. When called on a regex with no groups, the findall() returns
    a list of string matches.
2. When called on a regex that has groups, the findall() returns
    a list of tuples of strings.
"""

'\nTo summarize what the findall() method returns.\n1. When called on a regex with no groups, the findall() returns\n    a list of string matches.\n2. When called on a regex that has groups, the findall() returns\n    a list of tuples of strings.\n'

## Character Classes

In [19]:
import re
xmasRegex = re.compile(r'\d+\s\w+')
xmasRegex.findall('12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 \
    swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge')

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

## Making Your Own Character Classes

In [20]:
import re
vowelRegex = re.compile(r'[aeiouAEIOU]')
print(vowelRegex.findall('RoboCop eats baby food. BABY FOOD.'))

consonantRegex = re.compile(r'[^aeiouAEIOU]')
consonantRegex.findall('RoboCop eats baby food. BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']


['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

## The Caret and Dollar Sign Characters

In [21]:
import re
beginsWithHello = re.compile(r'^Hello')
print(beginsWithHello.search('Hello, world!'))
print(beginsWithHello.search('He said Hello.'))

<re.Match object; span=(0, 5), match='Hello'>
None


In [22]:
import re
endsWithNumber = re.compile(r'\d$')
print(endsWithNumber.search('Your number is 42'))
print(endsWithNumber.search('Your number is forty two.') == None)

<re.Match object; span=(16, 17), match='2'>
True


In [23]:
import re
wholeStringIsNum = re.compile(r'^\d+$')
print(wholeStringIsNum.search('1234567890'))
print(wholeStringIsNum.search('123456xyz7890') == None)
print(wholeStringIsNum.search('12   34567890') == None)

<re.Match object; span=(0, 10), match='1234567890'>
True
True


## The Wildcard Character

In [24]:
import re
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

### Matching Everything with Dot-Star

In [25]:
import re
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Al Last Name: Sweigart')
print(mo.group(1))
print(mo.group(2))

Al
Sweigart


In [26]:
import re
nongreedyRegex = re.compile(r'<.*?>')
mo = nongreedyRegex.search('<To serve man> for dinner.>')
print(mo.group())

greedyRegex = re.compile(r'<.*>')
mo = greedyRegex.search('<To serve man> for dinner.>')
print(mo.group())

<To serve man>
<To serve man> for dinner.>


### Matching Newlines with the Dot Character

In [27]:
import re
noNewlineRegex = re.compile('.*')
print(noNewlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group())

print()

newlineRegex = re.compile('.*', re.DOTALL)
print(newlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group())

Serve the public trust.

Serve the public trust.
Protect the innocent.
Uphold the law.


## Review of Regex Symbols
1. The ? matches zero or one of the preceding group.
1. The * matches zero or more of the preceding group.
1. The + matches one or more of the preceding group.
1. The {n} matches exactly n of the preceding group.
1. The {n,} matches n or more of the preceding group.
1. The {,m} matches 0 to m of the preceding group.
1. The {n,m} matches at least n and at most m of the preceding group.
1. {n,m}? or *? or +? performs a non-greedy match of the preceding group.
1. ^spam means the string must begin with spam.
1. spam$ means the string must end with spam.
1. The . matches any character, except newline characters.
1. \d, \w, \s match a digit, word, or space character, respectively.
1. \D, \W, \S match anything except a digit, word, or space character, respectively.
1. [abc] matches any character between the brackets (such as a, b, or c).
1. [^abc] matches any character that isn't between the brackets.

## Case-Insensitive Matching

In [28]:
import re
regex1 = re.compile('RoboCop')
regex2 = re.compile('ROBOCOP')
regex3 = re.compile('robOcop')
regex4 = re.compile('RobocOp')

In [29]:
import re
robocop = re.compile(r'robocop', re.I)
print(robocop.search('RoboCop is part man, part machine, all cop.').group())
print(robocop.search('ROBOCOP protects the innocent.').group())
print(robocop.search('Al, why does your programming book talk about robocop so mush?').group())

RoboCop
ROBOCOP
robocop


## Substituting Strings with the sub() Method

In [30]:
import re
namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

In [31]:
import re
agentNamesRegex = re.compile(r'Agent (\w)\w*')
agentNamesRegex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'

## Managing Complex Regexes

In [32]:
import re
phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?           # area code
    (\s|-|\.)?                   # separator
    \d{3}                        # first 3 digits
    (\s|-|\.)?                   # separator
    \d{4}                        # last 4 digits
    (\s*(ext|x|ext.)\s*\d{2,5})? # extension
)''', re.VERBOSE)

## Combining re.IGNORECASE, re.DOTALL, and re.VERBOSE

In [33]:
import re
someRegexValue = re.compile('foo', re.IGNORECASE | re.DOTALL)
someRegexValue = re.compile('foo', re.IGNORECASE | re.DOTALL | re.VERBOSE)

## Project: Phone Number and Email Address Extractor
### Steps:
1. Get the text off the clipboard.
1. Find all phone numbers and email addresses in the text.
1. Paste them onto the clipboard.
### Coding steps:
1. Use the pyperclip module to copy and paste strings.
1. Create two regexes, one for matching phone numbers and the other for matching email address.
1. Find all matches, not just the first match, of both regexes.
1. Neatly format the matched strings into a sing string to paste.
1. Display some kind of message if no matches were found in the text.

In [34]:
# phoneAndEmail.py
import pyperclip, re

phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?              # area code
    (\s|-|\.)?                      # separator
    (\d{3})                         # first 3 digits
    (\s|-|\.)?                      # separator
    (\d{4})                         # first 4 digits
    (\s*(ext|x|ext.)\s*(\d{2,5}))?  # extension
)''', re.VERBOSE)

emailRegex = re.compile(r'''(
    [a-zA-Z0-9._%+-]+ # username
    @                 # @ symbol
    [a-zA-Z0-9._]+    # domain name
    (\.[a-zA-Z]{2,4}) # dot-something
)''', re.VERBOSE)

# Find matches in clipboard text.
text = pyperclip.paste()

matches = []
for groups in phoneRegex.findall(text):
    phoneNum = '-'.join([groups[1], groups[3], groups[5]])
    if groups[8] != '':
        phoneNum += ' x' + groups[8]
    matches.append(phoneNum)
for groups in emailRegex.findall(text):
    matches.append(groups[0])
                         
# Copy results to the clipboard.
if len(matches) > 0:
    pyperclip.copy('\n'.join(matches))
    print('Copied to clipboard:')
    print('\n'.join(matches))
else:
    print('No phone numbers or email addresses found.')

No phone numbers or email addresses found.
