A regular expression (regex) is a sequence of characters that specifies a search pattern in text.

## Import packages

In [2]:
#All regex functions are incorporated into the "re" module.
import re

## Tutorial

In [4]:
# Compile the pattern into a reusable regex object.
# The r'' raw-string prefix stops Python from interpreting the backslashes itself.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')  # \d is short for digit (0-9)

In [5]:
# search() scans the string it is passed for the first location where the compiled
# pattern matches. It returns a match object, or None if the pattern is not found.
mo = phoneNumRegex.search('My phone number is 408-659-5969.')  # mo is short for 'match object'
# group() returns the matched text; it would raise AttributeError if mo were None.
print('Phone number found: ' + mo.group())

Phone number found: 408-659-5969


## Brackets [ ]

In [22]:
# By putting characters in brackets [], a string can be searched to match any of the characters in the brackets. 
# (Case-sensitive.)

In [23]:
# [aA] matches a single character that is either 'a' or 'A'.
# search() returns the first match in the string, which here is the capital 'A'.
bracketRegex = re.compile(r'[aA]')
mo = bracketRegex.search('This is A sentence.')
print('Matched object found: ' + mo.group())

Matched object found: A


In [24]:
# Without brackets, 'aA' is the literal two-character sequence "aA".
# That sequence does not occur in the string, so search() returns None and
# calling mo.group() raises the AttributeError shown below.
nonBracketRegex = re.compile(r'aA')
mo = nonBracketRegex.search('This is a sentence.')
print('Matched object found: ' + mo.group())

AttributeError: 'NoneType' object has no attribute 'group'

In [26]:
# Brackets cover one character position only, so [mM]atthew matches
# either 'matthew' or 'Matthew'.
fullWordBracket = re.compile(r'[mM]atthew')
mo = fullWordBracket.search('My name is matthew.')
print('Matched object found: ' + mo.group())

Matched object found: matthew


## Dashes (-) - Range of characters

In [31]:
# A dash (-) inside brackets specifies a range of characters or digits.
dashRegex = re.compile(r'[0-9]')  # any single digit
mo = dashRegex.search('This is a number: 5.')
print('Matched object found: ' + mo.group())

dashRegex = re.compile(r'[a-z]')  # any single lowercase ASCII letter
mo = dashRegex.search('THIS IS A CHARACTER: f.')
print('Matched object found: ' + mo.group())

# Combine two ranges to match any ASCII letter. (Writing [aA-zZ] instead would be
# parsed as 'a', the range A-z, and 'Z'; the A-z range also covers the punctuation
# characters [ \ ] ^ _ ` that sit between 'Z' and 'a' in ASCII.)
dashRegex = re.compile(r'[a-zA-Z]')
mo = dashRegex.search('This is a character: f.')
print('Matched object found: ' + mo.group())

Matched object found: 5
Matched object found: f
Matched object found: T


## Carets (^) - Exclusions

In [37]:
# Excluding characters is done with the caret (^). The caret must be the first
# character inside the brackets: [^A-Z] matches any single character that is
# NOT an uppercase letter A-Z.
caratRegex = re.compile(r'[^A-Z]')
mo = caratRegex.search('This is a number: 5.')
print('Matched object found: ' + mo.group())

Matched object found: h


## ( ? ) - Optional characters (One or none)

In [43]:
# To make a character optional, follow it with a (?).
# u? matches zero or one 'u', so the pattern matches both 'color' and 'colour'.
optionalRegex = re.compile(r'colou?r')
mo = optionalRegex.search('This is how to spell color.')
print('Matched object found: ' + mo.group())

Matched object found: color


## ( * ) - Optional characters (Zero or more)

In [51]:
# A (*) after a character (or bracket set) matches zero or more repetitions of it.
repeatingRegex = re.compile(r'Ba*')
mo = repeatingRegex.search('A sheep says: Baaaa!')
print('Matched object found: ' + mo.group())

# [ab]* matches any run of a's and b's (aaaaaa, ababab, bbbbbb, ...), including the
# empty run. search() returns the leftmost match, and at position 0 ('A') the pattern
# already matches zero characters, so the result is an empty string rather than
# 'ababab'. Requiring at least one character ([ab][ab]*) would find 'ababab'.
repeatingRegex = re.compile(r'[ab]*')
mo = repeatingRegex.search('A sheep says: ababab')
print('Matched object found: ' + mo.group())

# Repeating digits: one digit followed by zero or more further digits.
repeatingDigitsRegex = re.compile(r'[0-9][0-9]*')
mo = repeatingDigitsRegex.search('My phone number is: 4086595969')
print('Matched object found: ' + mo.group())

Matched object found: Baaaa
Matched object found: 
Matched object found: 4086595969


## (+) - Repeating characters (One or more)

In [54]:
# Alternatively, use the (+), which matches one or more repetitions:
# [0-9]+ is equivalent to [0-9][0-9]*.
repeatingDigitsRegex = re.compile(r'[0-9]+')
mo = repeatingDigitsRegex.search('My phone number is: 4086595969')
print('Matched object found: ' + mo.group())

Matched object found: 4086595969


Note: oo*h = o+h  matches oh, ooh, oooh...

## ( . ) - Wildcards 

In [55]:
# A period (.) matches any single character (except a newline by default).
# To match a literal period, a backslash must precede it. (\.)
wildcardRegex = re.compile(r'.atthew')
mo = wildcardRegex.search('My name is matthew.')
print('Matched object found: ' + mo.group())

Matched object found: matthew


## Anchors and boundaries

In [4]:
# Anchors check for matching text at the start (^) or end ($) of a string.
anchorRegex = re.compile(r'^My')
mo = anchorRegex.search('My name is matthew.')
print('Matched object found: ' + mo.group())

# The period is escaped (\.) so it matches a literal '.' just before the end of the string.
anchorRegex = re.compile(r'matthew\.$')
mo = anchorRegex.search('My name is matthew.')
print('Matched object found: ' + mo.group())

# \b matches at a word boundary; \B matches where there is no word boundary.
# (Note: digits and underscores count as word characters, so they do not create a boundary.)
boundaryRegex = re.compile(r'\bthe\b')
mo = boundaryRegex.search('the other one.')
print('Matched object found: ' + mo.group())

Matched object found: My
Matched object found: matthew.
Matched object found: the


## ( | ) - Disjunction  (OR)

In [2]:
# The pipe operator (OR) matches either phrase.
# search() returns only the first match in the string, which here is 'cat'.
pipeRegex = re.compile(r'cat|dog')
mo = pipeRegex.search('I own a cat and a dog.')
print('Matched object found: ' + mo.group())

Matched object found: cat


Note that a | b | c = [abc]

## ( ) - Disjunction / Precedence  (AND)

In [3]:
# Parentheses group the alternatives so the | applies only to the suffix:
# the pattern matches 'guppy' or 'guppies'.
andRegex = re.compile(r'gupp(y|ies)')
mo = andRegex.search('Baby fish are called guppies')
print('Matched object found: ' + mo.group())

Matched object found: guppies


## Greedy and non-greedy (+?)

In [13]:
# By default, quantifiers are greedy: they match as many characters as possible.
greedyRegex = re.compile(r'[a-z]*')
mo = greedyRegex.search('once upon a time')
print('Matched object found: ' + mo.group())

# Appending ? to a quantifier (here +?) makes it *non-greedy*, so it matches
# as few characters as possible.
nongreedyRegex = re.compile(r'[a-z]+?')
mo = nongreedyRegex.search('once upon a time')
print('Matched object found: ' + mo.group())

Matched object found: once
Matched object found: o


## { } - Ranges

In [30]:
# Rather than using the Kleene plus (+) to specify one or more, braces {m,n}
# specify the allowed number of repetitions (here 1 to 3).
# The match is greedy, so the first three digits of 9999 are returned.
braceRegex = re.compile(r'[0-9]{1,3}')
mo = braceRegex.search('There are 9999 bottles of beer on the wall.')
print('Matched object found: ' + mo.group())

Matched object found: 999


# Sandbox

In [25]:
# Matches a dollar amount: start of string or a non-word character, a literal '$',
# zero to three digits, an optional two-digit cents part, then a word boundary.
# The preceding non-word character (here a space) is part of the match.
myRegex = re.compile(r'(^|\W)\$[0-9]{0,3}(\.[0-9][0-9])?\b')
mo = myRegex.search('A computer costs $99.66')
print('Matched object found: ' + mo.group())

Matched object found:  $99.66


## List of regex character classes

\d  - digit  
\s  - whitespace (space, tab, newline)  
\w  - alphanumeric characters and underscore  
\\(  - escape characters (. ^ $ * + ? { } [ ] \\ | ( ) )  
(capitalized = non-*)

## Other notes

Reducing the error rate for an application often involves two antagonistic efforts:  
* Increasing accuracy or precision (minimizing false positives)  
* Increasing coverage or recall (minimizing false negatives)  