# Regular Expressions

In [14]:
import re
source = "To be or not to be, that is a question"
pattern = "To be"

result = re.match(pattern, source)
print(result)

<_sre.SRE_Match object; span=(0, 5), match='To be'>


In [2]:
compiled_pattern = re.compile("To be")

In [3]:
result_compiled = compiled_pattern.match(source)
print(result_compiled)

<_sre.SRE_Match object; span=(0, 5), match='To be'>


## Exact match with match()

In [4]:
import re
source = "To be or not to be, that is a question."

compiled_pattern = re.compile("To be")
m = compiled_pattern.match(source)

if m:
    print(m.group())

To be


#### .group() prints what matches

In [5]:
middle_pattern = re.compile("that is")
m = middle_pattern.match(source)

if m:
    print(m.group())

In [6]:
middle_pattern_with_wildcard = re.compile(".*that is")
m = middle_pattern_with_wildcard.match(source)

if m:
    print(m.group())

To be or not to be, that is


#### .* = any number of any characters, even 0 characters

.* = "To be or not to be,"

## First match with search

In [7]:
middle_pattern = re.compile("that is")
m = middle_pattern.search(source)

if m:
    print(m.group())

that is


## All matches with findall()

In [9]:
n_pattern = re.compile("n")
m = n_pattern.findall(source)

print("Found", len(m), "matches.")
print(m)

Found 2 matches.
['n', 'n']


In [15]:
n_and_character_pattern = re.compile("n.")
m = n_and_character_pattern.findall(source)

print("Found", len(m), "matches.")
print(m)

Found 1 matches.
['no']


In [17]:
n_and_character_optional_pattern = re.compile("n.?")
m = n_and_character_optional_pattern.findall(source)

print("Found", len(m), "matches.")
print(m)

Found 2 matches.
['no', 'n']


## Split at matches with split()

In [18]:
n_pattern = re.compile("n")
m = n_pattern.split(source)
print(m)

['To be or ', 'ot to be, that is a questio', '']


## Replace at matches with sub()

In [19]:
n_pattern = re.compile("n")
m = n_pattern.sub("?", source)
print(m)

To be or ?ot to be, that is a questio?


# Defining Patterns

In [20]:
import string
printable = string.printable
print(printable)

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	



In [21]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [23]:
re.findall("\d", printable)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [27]:
re.findall("\w", printable)

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '_']

In [29]:
re.findall("\s", printable)

[' ', '\t', '\n', '\r', '\x0b', '\x0c']

## Using Specifiers

"*" . ?

Look at the book!

In [31]:
large_source = """
Hi Bianca,

it was nice to talk to you blah blah it was nice to talk to you blah blah
it was nice to talk to you blah blah it was nice to talk to you blah blah
it was nice to talk to you blah blah it was nice to talk to you blah blah
My number is 650-555-3948. thanks!

-Mary
"""

In [34]:
phone_number_pattern = re.compile(r'[0123456789]{3}-[0123456789]{3}-[0123456789]{4}')
m = phone_number_pattern.findall(large_source)
print(m)

['650-555-3948']


In [43]:
phone_number_pattern = re.compile(r'\d{3}-\d{3}-\d{4}')
m = phone_number_pattern.findall(large_source)
print(m)

['650-555-3948']


In [50]:
phone_number_pattern = re.compile(r'(\d{3})(\d{3})(\d{4})')
m = phone_number_pattern.search(large_source)
print(m.groups())

AttributeError: 'NoneType' object has no attribute 'groups'

## Specifying match output

In [48]:
phone_number_pattern = re.compile(r'(\d{3})-(\d{3}-\d{4})')
m = phone_number_pattern.search(large_source)

if m:
    print(m.group())
    print(m.groups())

650-555-3948
('650', '555-3948')


In [46]:
phone_number_pattern = re.compile(r'(?P<areacode>\d{3})-(?P<number>\d{3}-\d{4})')
m = phone_number_pattern.search(large_source)

if m:
    print(m.group("areacode"))
    print(m.group("number"))

650
555-3948
