## Regular expressions

In [115]:
from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
np.random.seed(12345)
plt.rc('figure', figsize=(10, 6))
from pandas import Series, DataFrame
import pandas
import pandas as pd
np.set_printoptions(precision=4, threshold=500)
pd.options.display.max_rows = 100

In [116]:
%matplotlib inline

In [117]:
import re

### Metacharacters

In [118]:
# Metacharacters
# [ ]  - Specify a character class, which is a set of characters to match
#        [abc] will match any of the characters a, b, or c
#        [a-c] will match any of the characters a, b, or c
#        [a-z] will match only lowercase letters
#        Metacharacters (except \) are not active inside classes. [akm$] will match any of the characters 'a', 'k', 'm', or '$'
#        [^5] will match any character except '5'. '^' should be first character of the class. Not special outside the class
# \    - Used to escape other metacharacters and to signal special sequences when it's followed by other characters
#        \d matches any decimal digit. Equivalent to the class [0-9]
#        \D matches any non-digit character. Equivalent to the class [^0-9]
#        \s matches any whitespace character. Equivalent to the class [ \t\n\r\f\v]
#        \S matches any non-whitespace character. Equivalent to the class [^ \t\n\r\f\v]
#        \w matches any alphanumeric character. Equivalent to the class [a-zA-Z0-9_]
#        \W matches any non-alphanumeric character. Equivalent to the class [^a-zA-Z0-9_]
# .    - matches anything except a newline character. Used as a wildcard to match “any character”
# *    - for repeating things. It specifies that the previous character can be matched zero or more times
#        ca*t will match 'ct' (0 'a' characters), 'cat' (1 'a'), 'caaat' (3 'a' characters)
#        Expression a[bcd]*b matches the letter 'a', zero or more letters from the class [bcd], and finally ends with a 'b'
# +    - for repeating things. It specifies that the previous character must be matched one or more times
#        ca+t will match 'cat' (1 'a'), 'caaat' (3 'a's), but won’t match 'ct'
# ?    - for repeating things. It matches either once or zero times. Like marking something as being optional
#        home-?brew matches either 'homebrew' or 'home-brew'
# {m,n}- There must be at least m repetitions, and at most n
#        a/{1,3}b will match 'a/b', 'a//b', and 'a///b'. It won’t match 'ab', which has no slashes, or 'a////b', which has 4
#        {0,} is the same as *
#        {1,} is equivalent to +
#        {0,1} is the same as ?
# |    - Alternation, or the "or" operator. A|B will match any string that matches either A or B
# ^    - Matches at the beginning of lines. In MULTILINE mode it also matches immediately after each newline within the string
#        \A is equivalent to ^ without MULTILINE
# $    - Matches at the end of a line
#        \Z matches only at the end of the string
# \b   - Word boundary. This is a zero-width assertion that matches only at the beginning or end of a word 
# \B   - Opposite of \b, only matching when the current position is not at a word boundary
# ()   - Groups are marked by the '(', ')'. They group together the expressions contained inside them


### Compilation

In [119]:
# Compile a pattern once so the resulting object can be reused for many matches
p = re.compile(r'ab*')
# Optional flags change how matching works; with re.IGNORECASE the match is
# case-insensitive, so a class such as [A-Z] will match lowercase letters too
p = re.compile(r'ab*', flags=re.IGNORECASE)
p

re.compile(r'ab*', re.IGNORECASE|re.UNICODE)

### Performing matches - findall()

In [120]:
# findall() - find every substring where the RE matches and return them as a list

text = "foo    bar\t baz  \tqux"
# Use a raw string for the pattern: in a normal literal '\s' is an invalid escape
# sequence (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12)
regex = re.compile(r'\s+')  # compile regex

regex.findall(text)  # list of every run of one or more whitespace characters

['    ', '\t ', '  \t']

In [121]:
# Sample text: one "Name address" pair per line
text = (
    "Dave dave@google.com\n"
    "Steve steve@gmail.com\n"
    "Rob rob@gmail.com\n"
    "Ryan ryan@yahoo.com\n"
)

# Pattern capable of identifying most email addresses
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# The character classes only list uppercase letters, so compile case-insensitively
regex = re.compile(pattern, re.IGNORECASE)

regex.findall(text)  # list of every substring shaped like an email address

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [144]:
# Split each address into its 3 components by wrapping every part of the
# pattern in parentheses: (username)@(domain).(suffix)
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, re.IGNORECASE)

regex.findall(text)  # with capturing groups, findall() returns one tuple of segments per match

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [245]:
# Raw string so that '\d' reaches the regex engine untouched; in a normal literal
# it is an invalid escape sequence and triggers a warning on modern Python
p = re.compile(r'\d+')
p.findall('12 drummers drumming, 11 pipers piping, 10 lords a-leaping')

['12', '11', '10']

### Performing matches - search()

In [202]:
# search() - scan through a string, looking for any location where this RE matches

# Three capturing groups split the address into (username)@(domain).(suffix)
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, re.IGNORECASE)

# Only the first hit is returned, wrapped in a special match object
m = regex.search(text)
m

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>

In [203]:
text[m.start():m.end()] # extract email from special match object above

'dave@google.com'

In [242]:
# Compile the lowercase-letters pattern in this cell instead of relying on leftover
# kernel state: read top to bottom, `p` is still the digit pattern at this point,
# so the search would return None and the following m.group() cell would raise.
p = re.compile(r'[a-z]+')
m = p.search('::: message')  # first run of lowercase letters anywhere in the string
m

<_sre.SRE_Match object; span=(4, 11), match='message'>

In [243]:
m.group()

'message'

In [244]:
m.span()

(4, 11)

In [258]:
print(re.search('^From', 'From Here to Eternity'))   # matches at the beginning of line

<_sre.SRE_Match object; span=(0, 4), match='From'>


In [259]:
print(re.search('^From', 'Reciting From Memory'))

None


In [260]:
print(re.search('}$', '{block}'))   # matches at the end of line

<_sre.SRE_Match object; span=(6, 7), match='}'>


In [261]:
print(re.search('}$', '{block} '))

None


In [262]:
# '$' also matches just before a newline that ends the string
# (the stray pasted '>>>' REPL prompt was removed: it is not valid Python)
print(re.search('}$', '{block}\n'))

<_sre.SRE_Match object; span=(6, 7), match='}'>


### Performing matches - match()

In [204]:
# match() - determine if the RE matches at the beginning of the string

text = (
    "Dave dave@google.com\n"
    "Steve steve@gmail.com\n"
    "Rob rob@gmail.com\n"
    "Ryan ryan@yahoo.com\n"
)

# Parenthesised parts of the pattern segment the address into its 3 components
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, re.IGNORECASE)

# Prints None: match() only looks at the start of the string, and the text
# begins with "Dave ", not with an address
print(regex.match(text))

None


In [205]:
m = regex.match('wesm@bright.net') # create special match object
m.groups()

('wesm', 'bright', 'net')

In [206]:
m = re.match(r"(\d+)\.(\d+)", "24.1632")
m.groups()

('24', '1632')

In [207]:
m = re.match(r"(\d+)\.?(\d+)?", "24")
m.groups()      # Second group defaults to None.

('24', None)

In [208]:
m.groups('0')   # Now, the second group defaults to '0'.

('24', '0')

In [153]:
# Named groups, written (?P<name>...), allow the match to be read back as a dict
# instead of a tuple. re.VERBOSE lets the pattern be laid out one part per line.
regex = re.compile(
    r"""
    (?P<username>[A-Z0-9._%+-]+)
    @
    (?P<domain>[A-Z0-9.-]+)
    \.
    (?P<suffix>[A-Z]{2,4})""",
    flags=re.IGNORECASE | re.VERBOSE,
)

m = regex.match('wesm@bright.net')
m.groupdict()  # groupdict() instead of groups(): {group name: matched text}

{'domain': 'bright', 'suffix': 'net', 'username': 'wesm'}

In [209]:
m = re.match(r"(?P<first_name>\w+) (?P<last_name>\w+)", "Malcolm Reynolds")
m.groupdict()

{'first_name': 'Malcolm', 'last_name': 'Reynolds'}

In [217]:
text = ""
p = re.compile('[a-z]+') # match any lowercase letters at least once
print(p.match(text))

None


In [218]:
# Check whether there is a match: a failed search gives None, which is falsy

match = p.search(text)  # p.match(text) can be used as well
print("matches" if match else "does not match")

does not match


In [222]:
text = 'tempo'
m = p.match(text)
m

<_sre.SRE_Match object; span=(0, 5), match='tempo'>

In [223]:
m.start(), m.end()

(0, 5)

In [224]:
m.span()

(0, 5)

In [225]:
text[m.start():m.end()]

'tempo'

In [226]:
m.group()

'tempo'

In [227]:
m.pos               # Index into the string at which the RE engine started looking for a match

0

In [234]:
m.endpos            # Index into the string beyond which the RE engine will not go

5

In [237]:
print(m.lastindex)  # Integer index of the last matched capturing group, or None if no group was matched at all

None


In [238]:
print(m.lastgroup)# Name of the last matched capturing group, or None if group didn’t have a name, or if no group was matched 

None


In [239]:
m.re           # The regular expression object whose match() or search() method produced this match instance

re.compile(r'[a-z]+', re.UNICODE)

In [240]:
m.string       # The string passed to match() or search()

'tempo'

In [229]:
m.groups()

()

In [189]:
m = re.match(r"(\w+) (\w+)", "Isaac Newton, physicist")
m

<_sre.SRE_Match object; span=(0, 12), match='Isaac Newton'>

In [190]:
m.group(0)       # The entire match

'Isaac Newton'

In [191]:
m.__getitem__(0)       # The entire match

'Isaac Newton'

In [192]:
m[0]       # The entire match

'Isaac Newton'

In [193]:
m.group(1)       # The first parenthesized subgroup

'Isaac'

In [194]:
m.__getitem__(1)       # The first parenthesized subgroup

'Isaac'

In [195]:
m[1]       # The first parenthesized subgroup

'Isaac'

In [196]:
m.group(2)       # The second parenthesized subgroup

'Newton'

In [197]:
m.__getitem__(2)       # The second parenthesized subgroup

'Newton'

In [198]:
m[2]       # The second parenthesized subgroup.

'Newton'

In [178]:
m.group(1, 2)    # Multiple arguments give us a tuple

('Isaac', 'Newton')

In [182]:
t = re.match(r"(?P<first_name>\w+) (?P<last_name>\w+)", "Malcolm Reynolds")

In [183]:
t.group('first_name')

'Malcolm'

In [184]:
t.group('last_name')

'Reynolds'

In [185]:
t.group(1)

'Malcolm'

In [186]:
t.group(2)

'Reynolds'

In [188]:
# If a group matches multiple times, only the last match is accessible
n = re.match(r"(..)+", "a1b2c3")  # Matches 3 times.
n.group(1)                        # Returns only the last match.

'c3'

In [271]:
p = re.compile('(ab)*')
print(p.match('ababababab').span())

(0, 10)


In [272]:
p = re.compile('(a)b')
m = p.match('ab')
m.group()

'ab'

In [273]:
m.group(0)

'ab'

In [275]:
p = re.compile('(a(b)c)d')
m = p.match('abcd')
m.group(0)

'abcd'

In [276]:
m.group(1)

'abc'

In [277]:
m.group(2)

'b'

In [278]:
m.group(2,1,2)

('b', 'abc', 'b')

In [279]:
m.groups()

('abc', 'b')

In [210]:
# Cut the matched marker out of the address by joining the text on either side of it
email = "tony@tiremove_thisger.net"
m = re.search("remove_this", email)
"".join((email[:m.start()], email[m.end():]))

'tony@tiger.net'

In [256]:
print(re.match(r'From\s+', 'Fromage amk'))

None


In [257]:
re.match(r'From\s+', 'From amk Thu May 14 19:12:10 1998')  

<_sre.SRE_Match object; span=(0, 5), match='From '>

In [266]:
p = re.compile(r'\bclass\b')            # matches class only when it’s a complete word
print(p.search('no class at all'))

<_sre.SRE_Match object; span=(3, 8), match='class'>


In [267]:
print(p.search('the declassified algorithm'))

None


In [268]:
print(p.search('one subclass is'))

None


In [269]:
p = re.compile('\bclass\b')    # without the r' it's the backspace character
print(p.search('no class at all'))

None


In [270]:
print(p.search('\b' + 'class' + '\b'))

<_sre.SRE_Match object; span=(0, 7), match='\x08class\x08'>


In [280]:
p = re.compile(r'\b(\w+)\s+\1\b')       # detects doubled words in a string
p.search('Paris in the the spring').group()

'the the'

In [286]:
p = re.compile(r'\b(?P<word>\w+)\s+(?P=word)\b')  # detects doubled words in a string
p.search('Paris in the the spring').group()

'the the'

In [281]:
p = re.compile(r'(?P<word>\b\w+\b)')     # using named groups
m = p.search( '(((( Lots of punctuation )))' )
m.group('word')

'Lots'

In [282]:
m.group(1)

'Lots'

### Performing matches - finditer()

In [253]:
# finditer() - find all substrings where the RE matches and return them as an iterator

text = (
    "Dave dave@google.com\n"
    "Steve steve@gmail.com\n"
    "Rob rob@gmail.com\n"
    "Ryan ryan@yahoo.com\n"
)

# Parenthesised parts of the pattern segment the address into its 3 components
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, re.IGNORECASE)

# The matches are handed back one at a time through an iterator of match objects
iterator = regex.finditer(text)

for match in iterator:
    print(match)

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>
<_sre.SRE_Match object; span=(27, 42), match='steve@gmail.com'>
<_sre.SRE_Match object; span=(47, 60), match='rob@gmail.com'>
<_sre.SRE_Match object; span=(66, 80), match='ryan@yahoo.com'>


In [254]:
# Compile the digit pattern here rather than relying on whatever `p` was last bound to:
# read top to bottom, `p` is the named-group word pattern at this point, which would
# yield the spans of every word instead of the spans of the numbers.
p = re.compile(r'\d+')
iterator = p.finditer('12 drummers drumming, 11 ... 10 ...')
iterator  # finditer() returns an iterator of match objects, not a list

<callable_iterator at 0x1d98402b630>

In [255]:
for match in iterator:
    print(match.span())

(0, 2)
(22, 24)
(29, 31)


### Splitting

In [129]:
# split() - split the string into a list, splitting it wherever the RE matches

text = "foo    bar\t baz  \tqux"
# Raw string: '\s' in a normal literal is an invalid escape sequence
re.split(r'\s+', text)  # split on one or more whitespace characters

['foo', 'bar', 'baz', 'qux']

In [130]:
# Raw string: '\s' in a normal literal is an invalid escape sequence
regex = re.compile(r'\s+')  # compile regex
regex.split(text)  # same split, using the compiled pattern's own split() method

['foo', 'bar', 'baz', 'qux']

In [288]:
p = re.compile(r'\W+')             # delimiter is any sequence of non-alphanumeric characters
p.split('This is a test, short and sweet, of split().')

['This', 'is', 'a', 'test', 'short', 'and', 'sweet', 'of', 'split', '']

In [289]:
p.split('This is a test, short and sweet, of split().', 3)     # max splits = 3, remainder is returned as final element

['This', 'is', 'a', 'test, short and sweet, of split().']

In [292]:
p = re.compile(r'\W+')
p2 = re.compile(r'(\W+)')    # show what the delimiter were using capturing parentheses
p.split('This... is a test.')

['This', 'is', 'a', 'test', '']

In [293]:
p2.split('This... is a test.')

['This', '... ', 'is', ' ', 'a', ' ', 'test', '.', '']

In [294]:
# Raw string: '\W' in a normal literal is an invalid escape sequence
re.split(r'[\W]+', 'Words, words, words.')

['Words', 'words', 'words', '']

In [295]:
# Raw string: '\W' in a normal literal is an invalid escape sequence.
# The capturing parentheses make split() return the delimiters as well.
re.split(r'([\W]+)', 'Words, words, words.')

['Words', ', ', 'words', ', ', 'words', '.', '']

In [296]:
# Raw string: '\W' in a normal literal is an invalid escape sequence.
# maxsplit is passed by keyword: passing it positionally to re.split() is
# deprecated since Python 3.13. At most 1 split; the remainder is the final element.
re.split(r'[\W]+', 'Words, words, words.', maxsplit=1)

['Words', 'words, words.']

### Substrings

In [287]:
# sub()  - find all substrings where the RE matches and replace them with a different string
# subn() - does the same thing as sub(), but returns the new string and the number of replacements

text = (
    "Dave dave@google.com\n"
    "Steve steve@gmail.com\n"
    "Rob rob@gmail.com\n"
    "Ryan ryan@yahoo.com\n"
)

# Pattern capable of identifying most email addresses
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# The character classes only list uppercase letters, so compile case-insensitively
regex = re.compile(pattern, re.IGNORECASE)

print(regex.sub('REDACTED', text))  # the text with every matching address replaced

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [132]:
# Capture (username)@(domain).(suffix) so the replacement can refer back to each part
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, re.IGNORECASE)

# \1, \2 and \3 in the replacement insert the text captured by groups 1, 2 and 3
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



In [297]:
p = re.compile('(blue|white|red)')
p.sub('colour', 'blue socks and red shoes')

'colour socks and colour shoes'

In [298]:
p.sub('colour', 'blue socks and red shoes', count=1)

'colour socks and red shoes'

In [299]:
p.subn('colour', 'blue socks and red shoes')

('colour socks and colour shoes', 2)

In [300]:
p.subn('colour', 'no colours at all')

('no colours at all', 0)

In [302]:
# Python < 3.7: empty matches are replaced only when they're not adjacent to a previous match.
# NOTE: Python 3.7+ also replaces an empty match that is adjacent to a previous non-empty
# match, so on those versions this call returns '-a-b--d-' instead.
p = re.compile('x*')
p.sub('-', 'abxd')

'-a-b-d-'

In [303]:
# matches the word section followed by a string enclosed in {, }, and changes section to subsection
p = re.compile('section{ ( [^}]* ) }', re.VERBOSE)
p.sub(r'subsection{\1}','section{First} section{second}')

'subsection{First} subsection{second}'

In [304]:
p = re.compile('section{ (?P<name> [^}]* ) }', re.VERBOSE)
p.sub(r'subsection{\1}','section{First}')

'subsection{First}'

In [305]:
p.sub(r'subsection{\g<1>}','section{First}')

'subsection{First}'

In [306]:
p.sub(r'subsection{\g<name>}','section{First}')

'subsection{First}'

In [307]:
def hexrepl(match):
    """Return the hex string for the decimal number held by a match object.

    Intended as the replacement callable for ``sub()``: it is called once per
    match and its return value replaces the matched text.

    Parameters:
        match: match object whose whole match (``match.group()``) is a decimal integer.

    Returns:
        The ``hex()`` representation of that integer, e.g. ``'0xffd2'``.
    """
    return hex(int(match.group()))

# Every run of digits is passed through hexrepl and swapped for its hex form
p = re.compile(r'\d+')
p.sub(hexrepl, 'Call 65490 for printing, 49152 for user code.')

'Call 0xffd2 for printing, 0xc000 for user code.'