In [1]:
# find an American phone number in a string
# three numbers, a hyphen, three numbers, a hyphen, and four numbers
# example: 415-555-4242
def isPhoneNumber(text):
    """Return True if text has the exact shape DDD-DDD-DDDD, else False.

    The string must be exactly 12 characters long: three digits, a hyphen,
    three digits, a hyphen and four digits (for example '415-555-4242').
    Each digit position is checked with str.isdecimal().
    """
    if len(text) != 12:
        return False

    def allDigits(start, stop):
        # True when every character in text[start:stop] is a decimal digit.
        return all(text[pos].isdecimal() for pos in range(start, stop))

    # Checks run left to right: digit run, hyphen, digit run, hyphen, digit run.
    return (
        allDigits(0, 3)
        and text[3] == '-'
        and allDigits(4, 7)
        and text[7] == '-'
        and allDigits(8, 12)
    )

# Each entry pairs the question to display with the string handed to isPhoneNumber().
checks = [
    ('Is 415-555-4242 a phone number?', '415-555-4242'),
    ('Is Moshi moshi a phone number?', 'Moshi moshi'),
]
for question, candidate in checks:
    print(question)
    print(isPhoneNumber(candidate))

Is 415-555-4242 a phone number?
True
Is Moshi moshi a phone number?
False


In [2]:
message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'
# Slide a 12-character window over the message and test every window.
# Windows near the end are shorter than 12 characters and are passed along unchanged.
windowStarts = range(len(message))
for i in windowStarts:
    chunk = message[i:i + 12]
    if not isPhoneNumber(chunk):
        continue
    print('Phone number found: ' + chunk)
print('Done')

Phone number found: 415-555-1011
Phone number found: 415-555-9999
Done


In [8]:
import re
# \d stands for one digit, so the pattern reads: 3 digits, hyphen, 3 digits, hyphen, 4 digits.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
text = 'My number is 415-555-4247.'
# search() returns a Match object for the first occurrence of the pattern in the text.
matchObjects = phoneNumRegex.search(text)
foundNumber = matchObjects.group()
print('Phone number found: ' + foundNumber)

Phone number found: 415-555-4247


In [9]:
# To test a regular expression interactively, visit:
# https://pythex.org/

In [13]:
# Grouping with Parentheses
import re
text = 'My number is 415-555-4247.'
# Two capture groups: group 1 is the area code, group 2 the rest of the number.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
matchObjects = phoneNumRegex.search(text)
# Individual groups first, then the whole match (group 0).
for groupIndex in (1, 2, 0):
    print(matchObjects.group(groupIndex))
# group() with no argument is the same as group(0).
print(matchObjects.group())
# groups() returns all captured groups as a tuple, which can be unpacked directly.
print(matchObjects.groups())
areaCode, mainNumber = matchObjects.groups()
print(areaCode, mainNumber, sep='\n')

415
555-4247
415-555-4247
415-555-4247
('415', '555-4247')
415
555-4247


In [14]:
text = 'My phone number is (415) 555-4242.'
# \( and \) match literal parentheses; the unescaped ( ) delimit the capture groups.
phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')
matchObjects = phoneNumRegex.search(text)
print(matchObjects.group(1), matchObjects.group(2), sep='\n')

(415)
555-4242


In [17]:
# Matching Multiple Groups with the Pipe |
# When both alternatives occur in the text, search() returns the one that appears first.
heroRegex = re.compile(r'Batman|Tina Fey')
mo1 = heroRegex.search('Batman and Tina Fey')
mo2 = heroRegex.search('Tina Fey and Batman')
print(mo1.group(), mo2.group(), sep='\n')

Batman
Tina Fey


In [19]:
# The shared prefix 'Bat' is written once; the group lists the possible endings.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
text = 'Batmobile lost a wheel'
mo = batRegex.search(text)
# group() is the full match, group(1) only the alternative captured by the parentheses.
print(mo.group(), mo.group(1), sep='\n')

Batmobile
mobile


In [20]:
# Optional Matching with the Question Mark ?
# (wo)? matches the group zero or one time, so both spellings are found.
batRegex = re.compile(r'Bat(wo)?man')
mo1 = batRegex.search('The Adventures of Batman')
mo2 = batRegex.search('The Adventures of Batwoman')
print(mo1.group(), mo2.group(), sep='\n')

Batman
Batwoman


In [21]:
# Matching Zero or More with the Star *
# (wo)* accepts any number of repetitions of the group, including none.
batRegex = re.compile(r'Bat(wo)*man')
mo1 = batRegex.search('The Adventures of Batman')
mo2 = batRegex.search('The Adventures of Batwoman')
mo3 = batRegex.search('The Adventures of Batwowowowoman')
for matchObject in (mo1, mo2, mo3):
    print(matchObject.group())

Batman
Batwoman
Batwowowowoman


In [24]:
# Matching One or More with the Plus +
# (wo)+ requires at least one 'wo', so plain 'Batman' is not matched.
batRegex = re.compile(r'Bat(wo)+man')
mo1 = batRegex.search('The Adventures of Batwoman')
print(mo1.group())
mo2 = batRegex.search('The Adventures of Batwowowowoman')
print(mo2.group())
mo3 = batRegex.search('The Adventures of Batman')
# search() returns None when nothing matches; None is a singleton, so test identity.
mo3 is None

Batwoman
Batwowowowoman


True

In [25]:
# Matching Specific Repetitions with Braces
# (Ha){3} matches exactly three consecutive repetitions of the group.
haRegex = re.compile(r'(Ha){3}')
mo1 = haRegex.search('HaHaHa')
print(mo1.group())
mo2 = haRegex.search('Ha')
# A single 'Ha' is too short, so search() returns None; compare by identity.
mo2 is None

HaHaHa


True

In [28]:
# Greedy and Non-greedy Matching
# {3,5} is greedy by default and takes as many repetitions as it can;
# a trailing ? makes it non-greedy, so it stops at the minimum.
greedyHaRegex = re.compile(r'(Ha){3,5}')
nongreedyHaRegex = re.compile(r'(Ha){3,5}?')

mo1 = greedyHaRegex.search('HaHaHaHaHa')
mo2 = nongreedyHaRegex.search('HaHaHaHaHa')
print(mo1.group())  # longest possible match
print(mo2.group())  # shortest possible match

HaHaHaHaHa
HaHaHa


In [31]:
# findall() with a pattern that has no groups returns a list of the matched strings
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
contactLine = 'Cell: 415-555-9999 Work: 212-555-0000'
phoneNumRegex.findall(contactLine)

['415-555-9999', '212-555-0000']

In [32]:
# findall() with a pattern that has groups returns a list of tuples, one string per group
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)')
contactLine = 'Cell: 415-555-9999 Work: 212-555-0000'
phoneNumRegex.findall(contactLine)

[('415', '555', '9999'), ('212', '555', '0000')]

In [35]:
# Pattern breakdown:
#   \d+  one or more digits
#   \s   a single whitespace character
#   \w+  one or more word characters (letters, digits or underscore)
xmasRegex = re.compile(r'\d+\s\w+')
xmasRegex.findall(
    '12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge'
)

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

In [36]:
# Making Your Own Character Classes
# Square brackets define a set; any single character from the set matches.
vowelRegex = re.compile(r'[aeiouyAEIOUY]')
robocopLine = 'RoboCop eats baby food. BABY FOOD.'
vowelRegex.findall(robocopLine)

['o', 'o', 'o', 'e', 'a', 'a', 'y', 'o', 'o', 'A', 'Y', 'O', 'O']

In [38]:
# Making Your Own Character Classes
# A hyphen inside the brackets denotes a range: lowercase letters, uppercase letters, digits.
# NOTE(review): the name vowelRegex is reused here although this class matches any
# ASCII letter or digit; the name is kept so the notebook's namespace stays unchanged.
vowelRegex = re.compile(r'[a-zA-Z0-9]')
robocopLine = 'RoboCop eats baby food. BABY FOOD.'
vowelRegex.findall(robocopLine)

['R',
 'o',
 'b',
 'o',
 'C',
 'o',
 'p',
 'e',
 'a',
 't',
 's',
 'b',
 'a',
 'b',
 'y',
 'f',
 'o',
 'o',
 'd',
 'B',
 'A',
 'B',
 'Y',
 'F',
 'O',
 'O',
 'D']

In [40]:
# Making Your Own Character Classes
# Placing a caret (^) right after the opening bracket creates a negative character
# class: it matches every character that is NOT listed (spaces and periods included).
consonantRegex = re.compile(r'[^aeiouyAEIOUY]')
robocopLine = 'RoboCop eats baby food. BABY FOOD.'
consonantRegex.findall(robocopLine)

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 ' ',
 'F',
 'D',
 '.']

In [41]:
# Caret and Dollar
# A leading ^ anchors the pattern to the start of the searched text.
beginsWithHello = re.compile(r'^Hello')
greeting = 'Hello, world!'
beginsWithHello.search(greeting)

<re.Match object; span=(0, 5), match='Hello'>

In [42]:
# search() returns None when there is no match; None is a singleton, so test identity.
beginsWithHello.search('He said hello.') is None

True

In [45]:
# A trailing $ anchors the pattern to the end of the text: the last character must be a digit.
endsWithNumber = re.compile(r'\d$')
answerLine = 'Your number is 42'
endsWithNumber.search(answerLine)

<re.Match object; span=(16, 17), match='2'>

In [46]:
# search() returns None when there is no match; None is a singleton, so test identity.
endsWithNumber.search('Your number is forty two.') is None

True

In [48]:
# ^ and $ together pin both ends, so the whole string must be one or more digits
wholeStringIsNum = re.compile(r'^\d+$')
digitsOnly = '1234567890'
wholeStringIsNum.search(digitsOnly)

<re.Match object; span=(0, 10), match='1234567890'>

In [52]:
wholeStringIsNum = re.compile(r'^\d+$')
# search() returns a Match object or None, never a bool, so comparing the result
# with `== True` is False even for a successful match. Test against None instead:
# the letters in the middle prevent a match, hence this evaluates to False.
wholeStringIsNum.search('12345xyz67890') is not None

False

In [53]:
wholeStringIsNum = re.compile(r'^\d+$')
# search() returns a Match object or None, never a bool, so comparing the result
# with `== True` is False even for a successful match. Test against None instead:
# the embedded spaces prevent a match, hence this evaluates to False.
wholeStringIsNum.search('12  34567890') is not None

False

In [57]:
# The dot matches exactly one character (any except a newline), so 'flat' yields only 'lat'.
atRegex = re.compile(r'.at')
rhyme = 'The cat in the hat sat on the flat mat.'
atRegex.findall(rhyme)

['cat', 'hat', 'sat', 'lat', 'mat']

In [58]:
# Matching Everything with Dot-Star .* (greedy mode = max text)
# Each (.*) captures whatever text sits between the fixed labels.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
mo = nameRegex.search('First Name: Al Last Name: Sweigart')
firstName, lastName = mo.groups()
print(firstName)
print(lastName)

Al
Sweigart


In [59]:
# Matching Everything with Dot-Star-question-mark .*? (non greedy mode = min text)
# .*? stops at the first '>' it can reach.
nongreedyRegex = re.compile(r'<.*?>')
dinnerText = '<To serve man> for dinner.>'
mo = nongreedyRegex.search(dinnerText)
print(mo.group())

<To serve man>


In [60]:
# Matching Everything with Dot-Star .* (greedy mode = max text)
# .* runs on to the last '>' in the text.
greedyRegex = re.compile(r'<.*>')
dinnerText = '<To serve man> for dinner.>'
mo = greedyRegex.search(dinnerText)
print(mo.group())

<To serve man> for dinner.>


In [61]:
# Matching No-Newlines with the Dot Character
# Without flags the dot does not match '\n', so .* stops at the end of the first line.
noNewlineRegex = re.compile('.*')
directives = 'Serve the public trust.\nProtect the innocent.\nUphold the law.'
noNewlineRegex.search(directives).group()

'Serve the public trust.'

In [62]:
# Matching Newlines with the Dot Character
# re.DOTALL lets the dot match '\n' as well, so .* spans all three lines.
newlineRegex = re.compile('.*', re.DOTALL)
directives = 'Serve the public trust.\nProtect the innocent.\nUphold the law.'
newlineRegex.search(directives).group()

'Serve the public trust.\nProtect the innocent.\nUphold the law.'

In [67]:
# Case-Insensitive Matching
# re.I is the short form of re.IGNORECASE, so this is equivalent:
# robocop = re.compile(r'robocop', re.I)
robocop = re.compile(r'robocop', re.IGNORECASE)
for sentence in (
    'RoboCop is part man, part machine, all cop.',
    'ROBOCOP protects the innocent.',
    'Al, why does your programming book talk about robocop so much?',
):
    # group() returns the text exactly as it is spelled in the sentence.
    print(robocop.search(sentence).group())

RoboCop
ROBOCOP
robocop


In [68]:
# Substituting Strings with sub()
# sub(replacement, text) returns a copy of text with every match replaced.
namesRegex = re.compile(r'Agent \w+')
report = 'Agent Alice gave the secret documents to Agent Bob.'
namesRegex.sub('CENSORED', report)

'CENSORED gave the secret documents to CENSORED.'

In [72]:
# Substituting Strings with sub()
# (\w) captures the first letter of the name; \1 in the replacement inserts it back.
agentNamesRegex = re.compile(r'Agent (\w)\w*')
report = 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.'
agentNamesRegex.sub(r'\1****', report)

'A**** told C**** that E**** knew B**** was a double agent.'

In [73]:
# Managing Complex Regexes
# re.VERBOSE lets the pattern span several lines: whitespace and '#' comments
# inside the pattern text are ignored.
# The extension keyword is written ext\.? so the dot is a literal period; an
# unescaped dot would match any character (e.g. accept 'extA123' as an extension).
phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?            # area code
    (\s|-|\.)?                    # separator
    \d{3}                         # first 3 digits
    (\s|-|\.)                     # separator
    \d{4}                         # last 4 digits
    (\s*(ext\.?|x)\s*\d{2,5})?    # extension: ext, ext. or x, then 2-5 digits
    )''', re.VERBOSE)

In [74]:
# Several flags are combined with the bitwise OR operator (|). This regex is
# case-insensitive (IGNORECASE), lets the dot match newlines (DOTALL) and
# allows whitespace and comments inside the pattern (VERBOSE).
regexFlags = re.IGNORECASE | re.DOTALL | re.VERBOSE
someRegexValue = re.compile('foo', regexFlags)