# Regular Expressions

### 14.1 The re Module

In [1]:
import re

In [2]:
# Sample sentence holding one unsigned (14.67) and one negative (-2.34) decimal number.
text = "This text contains 14.67 floating point -2.34 numbers"

In [3]:
# Unsigned decimal number: integer digits, a mandatory dot, fraction digits.
# The dot must not be optional: with `\.?` a plain integer such as "1234"
# would also be reported as a floating point number.
pattern = r"\d+\.\d+"

In [4]:
# search() scans the text and reports only the first location where the pattern matches.
re.search(pattern, text)

<re.Match object; span=(19, 24), match='14.67'>

In [5]:
# findall() collects every non-overlapping match of the pattern into a list.
re.findall(pattern, text)

['14.67', '2.34']

In [6]:
# Signed decimal number: optional sign, integer digits, a mandatory dot,
# fraction digits. Keeping the dot mandatory stops plain integers such as
# "2024" from being reported as floating point numbers.
pattern = r"[-+]?\d+\.\d+"

In [7]:
# Same call as before, now with the sign-aware pattern assigned just above.
re.findall(pattern, text)

['14.67', '-2.34']

In [8]:
# finditer() returns an iterator of match objects instead of a list of strings.
re.finditer(pattern, text)

<callable_iterator at 0x1799bf2ddb0>

In [11]:
# For each match, show the matched text and then its (start, end) span.
for m in re.finditer(pattern, text):
    print(m.group(), m.span(), sep="\n")

14.67
(19, 24)
-2.34
(40, 45)


In [13]:
# Slicing the text with the spans printed above gives back the matched substrings.
text[19:24], text[40:45]

('14.67', '-2.34')

In [18]:
# Signed decimal number with named groups: BeforeDot captures the optional
# sign plus the integer digits, AfterDot captures the fraction digits.
# The dot is mandatory so that an integer such as "1234" is not split
# into BeforeDot='123' / AfterDot='4'.
pattern = r"(?P<BeforeDot>[-+]?\d+)\.(?P<AfterDot>\d+)"

In [21]:
# For each match, show the tuple of captured groups, then the span
# followed by its start and end positions.
for m in re.finditer(pattern, text):
    print(m.groups())
    print(f"{m.span()} {m.start()} {m.end()}")

('14', '67')
(19, 24) 19 24
('-2', '34')
(40, 45) 40 45


In [20]:
# For each match, show the named groups as a dict and then the match span.
for m in re.finditer(pattern, text):
    print(m.groupdict(), m.span(), sep="\n")

{'BeforeDot': '14', 'AfterDot': '67'}
(19, 24)
{'BeforeDot': '-2', 'AfterDot': '34'}
(40, 45)


In [23]:
# subn() replaces every match with "FPN" and returns (new_text, number_of_replacements).
re.subn(pattern, "FPN", text)

('This text contains FPN floating point FPN numbers', 2)

### 14.2 Metacharacters

In [25]:
import re

data = [
    'ab ', 'abc', 'a5e', 'a6f', '123 a6c anc', 'a5b', 'a55b', 'a555b',
    'a5555b', 'a55555b', 'a555555b', 'a5xb', '1/4', '3+2=5', 'def ghi',
    'abc ab',
]
# `.` stands for any single character (except a newline) between 'a' and 'c'.
for item in data:
    m = re.search(r'a.c', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

abc matched in 'abc'
a6c matched in '123 a6c anc'
abc matched in 'abc ab'


In [26]:
import re

data = [
    'ab ', 'abc', 'a5e', 'a6f', '123 a6c anc', 'a5b', 'a55b', 'a555b',
    'a5555b', 'a55555b', 'a555555b', 'a5xb', '1/4', '3+2=5', 'def ghi',
    'abc ab',
]
# `[abc]` is a character class: exactly one of 'a', 'b' or 'c'.
for item in data:
    m = re.search(r'a[abc]c', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

abc matched in 'abc'
abc matched in 'abc ab'


In [27]:
import re

data = [
    'ab ', 'abc', 'a5e', 'a6f', '123 a6c anc', 'a5b', 'a55b', 'a555b',
    'a5555b', 'a55555b', 'a555555b', 'a5xb', '1/4', '3+2=5', 'def ghi',
    'abc ab',
]
# `[0-9]` is a range inside a character class: exactly one digit.
for item in data:
    m = re.search(r'a[0-9]c', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

a6c matched in '123 a6c anc'


In [29]:
import re

data = [
    'ab ', 'abc', 'a5e', 'a6f', '123 a6c anc', 'a5b', 'a55b', 'a555b',
    'a5555b', 'a55555b', 'a555555b', 'a5xb', '1/4', '3+2=5', 'def ghi',
    'abc ab',
]
# The backslash turns `+` into a literal plus sign instead of a quantifier.
for item in data:
    m = re.search(r'3\+2', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

3+2 matched in '3+2=5'


In [30]:
import re

data = [
    'ab ', 'abc', 'a5e', 'a6f', '123 a6c anc', 'a5b', 'a55b', 'a555b',
    'a5555b', 'a55555b', 'a555555b', 'a5xb', '1/4', '3+2=5', 'def ghi',
    'abc ab',
]
# `|` is alternation: either `a.c` or the literal text "3+2" may match.
for item in data:
    m = re.search(r'a.c|3\+2', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

abc matched in 'abc'
a6c matched in '123 a6c anc'
3+2 matched in '3+2=5'
abc matched in 'abc ab'


In [31]:
import re

data = [
    'ab ', 'abc', 'a5e', 'a6f', '123 a6c anc', 'a5b', 'a55b', 'a555b',
    'a5555b', 'a55555b', 'a555555b', 'a5xb', '1/4', '3+2=5', 'def ghi',
    'abc ab',
]
# `\d` is the shorthand for a single digit.
for item in data:
    m = re.search(r'a\dc', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

a6c matched in '123 a6c anc'


In [32]:
import re

data = [
    'ab ', 'abc', 'a5e', 'a6f', '123 a6c anc', 'a5b', 'a55b', 'a555b',
    'a5555b', 'a55555b', 'a555555b', 'a5xb', '1/4', '3+2=5', 'def ghi',
    'abc ab',
]
# `\w` is the shorthand for a single word character (letter, digit or underscore).
for item in data:
    m = re.search(r'a\wc', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

abc matched in 'abc'
a6c matched in '123 a6c anc'
abc matched in 'abc ab'


In [33]:
import re

data = [
    'ab ', 'abc', 'a5e', 'a6f', '123 a6c anc', 'a5b', 'a55b', 'a555b',
    'a5555b', 'a55555b', 'a555555b', 'a5xb', '1/4', '3+2=5', 'def ghi',
    'abc ab',
]
# `\S` is the shorthand for any single non-whitespace character.
for item in data:
    m = re.search(r'a\Sc', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

abc matched in 'abc'
a6c matched in '123 a6c anc'
abc matched in 'abc ab'


In [34]:
import re

data = [
    'ab ', 'abc', 'a5e', 'a6f', '123 a6c anc', 'a5b', 'a55b', 'a555b',
    'a5555b', 'a55555b', 'a555555b', 'a5xb', '1/4', '3+2=5', 'def ghi',
    'abc ab',
]
# A plain literal: search() finds "a6" anywhere inside the string.
for item in data:
    m = re.search(r'a6', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

a6 matched in 'a6f'
a6 matched in '123 a6c anc'


In [35]:
import re

data = [
    'ab ', 'abc', 'a5e', 'a6f', '123 a6c anc', 'a5b', 'a55b', 'a555b',
    'a5555b', 'a55555b', 'a555555b', 'a5xb', '1/4', '3+2=5', 'def ghi',
    'abc ab',
]
# `^` anchors the match to the start of the string.
for item in data:
    m = re.search(r'^a6', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

a6 matched in 'a6f'


In [36]:
# Any two characters followed by 'c', anywhere in the string.
for item in data:
    m = re.search(r'..c', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

abc matched in 'abc'
a6c matched in '123 a6c anc'
abc matched in 'abc ab'


In [37]:
# `$` anchors the match to the end of the string.
for item in data:
    m = re.search(r'..c$', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

abc matched in 'abc'
anc matched in '123 a6c anc'


In [38]:
# `\b` is a word boundary: "ab" must be the end of a word.
for item in data:
    m = re.search(r'ab\b', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

ab matched in 'ab '
ab matched in 'abc ab'


In [39]:
# `\B` is the opposite of `\b`: "ab" must be followed by another word character.
for item in data:
    m = re.search(r'ab\B', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

ab matched in 'abc'
ab matched in 'abc ab'


In [40]:
# `*` applies only to the last '5': "a55", then zero or more further 5s, then 'b'.
for item in data:
    m = re.search(r'a555*b', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

a55b matched in 'a55b'
a555b matched in 'a555b'
a5555b matched in 'a5555b'
a55555b matched in 'a55555b'
a555555b matched in 'a555555b'


In [41]:
# `+` applies only to the last '5': "a55", then one or more further 5s, then 'b'.
for item in data:
    m = re.search(r'a555+b', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

a555b matched in 'a555b'
a5555b matched in 'a5555b'
a55555b matched in 'a55555b'
a555555b matched in 'a555555b'


In [42]:
# `?` makes the '5' optional; `\D` then requires a single non-digit character.
for item in data:
    m = re.search(r'a5?\D', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

ab matched in 'ab '
ab matched in 'abc'
a5e matched in 'a5e'
an matched in '123 a6c anc'
a5b matched in 'a5b'
a5x matched in 'a5xb'
ab matched in 'abc ab'


In [43]:
# `{2,4}` repeats the last '5' two to four times, i.e. three to five 5s in total.
for item in data:
    m = re.search(r'a55{2,4}\D', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

a555b matched in 'a555b'
a5555b matched in 'a5555b'
a55555b matched in 'a55555b'


In [44]:
# `{2,}` repeats the last '5' at least twice, with no upper limit.
for item in data:
    m = re.search(r'a55{2,}\D', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

a555b matched in 'a555b'
a5555b matched in 'a5555b'
a55555b matched in 'a55555b'
a555555b matched in 'a555555b'


In [45]:
# `{,4}` repeats the last '5' at most four times (zero repetitions allowed).
for item in data:
    m = re.search(r'a55{,4}\D', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

a5e matched in 'a5e'
a5b matched in 'a5b'
a55b matched in 'a55b'
a555b matched in 'a555b'
a5555b matched in 'a5555b'
a55555b matched in 'a55555b'
a5x matched in 'a5xb'


In [46]:
# `{3}` repeats the last '5' exactly three times, i.e. five 5s in total.
for item in data:
    m = re.search(r'a555{3}\D', item)
    if m is None:
        continue
    print(f"{m.group()} matched in '{item}'")

a55555b matched in 'a55555b'


### 14.3 Exercises

##### Regular expression to detect a complex number in a block of text

In [49]:
# Complex number of the form <real><sign><imag>j:
#   [-+]?\d*\.?\d+   real part: optional sign, optional integer digits, optional dot, digits
#   [-+]             the sign joining the real and imaginary parts (required)
#   \d*\.?\d+j       imaginary part, terminated by a literal 'j'
pattern = r"[-+]?\d*\.?\d+[-+]\d*\.?\d+j"
text = "This statement consists of a complex -4.5+5.1j number"

In [50]:
# search() reports the first place in the text where the complex-number pattern matches.
re.search(pattern, text)

<re.Match object; span=(37, 46), match='-4.5+5.1j'>

##### Quick Test:

Write a regular expression to detect if the text has an equation of the form a + b = c

##### Quick Test:

Write a regular expression to check if the text has an HTML bold tag "<b></b>"

##### Quick Test:

Write a regular expression to detect if a word is repeated multiple times in a text and, if so,
replace it with a single occurrence.

Example:
It was a dark dark dark night -> It was a dark night
