#### Outline

In [94]:
import re

#### match()

In [35]:
# Sample sentence used by the match() and search() examples.
mytext = 'learn to implement regex'

In [40]:
# A plain substring test only answers True/False, which is of little help
# when the string has to be manipulated, so regex is used instead.
'learn' in mytext

True

In [87]:
# re.match() only looks for the pattern at the very beginning of the text.
result = re.match(r'learn', mytext)
result.group()  # the substring that matched

'learn'

In [81]:
result.string  # the full string that was passed to re.match()

'learn to implement regex'

In [84]:
# pos/endpos: bounds of the part of the string the regex engine searched;
# here that is the whole string (0 to len(mytext)).
result.pos, result.endpos

(0, 24)

In [74]:
# print() converts its arguments to text itself, so no explicit str() is needed.
print('Start position of matching pattern', result.start())

Start position of matching pattern 0


In [33]:
# print() converts its arguments to text itself, so no explicit str() is needed.
print('End position of matching pattern', result.end())

End position of matching pattern 5


#### search()

In [90]:
# re.search() scans through the whole text and returns the first match,
# wherever in the string it occurs.
result = re.search(r'regex', mytext)
result.group()

'regex'

In [92]:
# print() converts its arguments to text itself, so no explicit str() is needed.
print('Start position of matching pattern', result.start())

Start position of matching pattern 19


In [93]:
# print() converts its arguments to text itself, so no explicit str() is needed.
print('End position of matching pattern', result.end())

End position of matching pattern 24


#### findall()

In [144]:
# Longer sample sentence; rebinding mytext replaces the earlier, shorter one.
mytext = 'learn to implement regex in fast way'

In [145]:
result = re.findall(r'learn', mytext)
result  # findall() returns a list of every non-overlapping match

['learn']

#### split()

In [146]:
# Split the text at every occurrence of 'rn'; the delimiter itself is dropped.
re.split(r'rn', mytext)

['lea', ' to implement regex in fast way']

#### sub()

In [152]:
# Replace every occurrence of the pattern with the replacement string.
result = re.sub(r' in fast way',' by myself', mytext)
result

'learn to implement regex by myself'

#### compile()

In [153]:
# This part uses pattern operators (metacharacters combined with literal
# characters) rather than fixed strings.

In [154]:
# Pattern: 'l', then one to four arbitrary characters, then a space.
pattern = re.compile(r'l.{1,4} ')  # compiled once into a reusable pattern object
result = pattern.findall(mytext)
result

['learn ']

#### Very Common Regex Pattern

<img src='./very-common-regex-pattern.png' width='750' height='1000' />

#### Fun with Regex!

In [155]:
# '.' matches any single character (except a newline), spaces included.
result = re.findall(r'.', mytext)
print(result)

['l', 'e', 'a', 'r', 'n', ' ', 't', 'o', ' ', 'i', 'm', 'p', 'l', 'e', 'm', 'e', 'n', 't', ' ', 'r', 'e', 'g', 'e', 'x', ' ', 'i', 'n', ' ', 'f', 'a', 's', 't', ' ', 'w', 'a', 'y']


In [156]:
# Remove white space: \w matches a single alphanumeric character (or
# underscore), whereas \W matches a non-alphanumeric character.
result = re.findall(r'\w', mytext)
print(result)

['l', 'e', 'a', 'r', 'n', 't', 'o', 'i', 'm', 'p', 'l', 'e', 'm', 'e', 'n', 't', 'r', 'e', 'g', 'e', 'x', 'i', 'n', 'f', 'a', 's', 't', 'w', 'a', 'y']


In [157]:
# Extract each word. Because * also allows zero characters, an empty match
# is reported wherever no word character is found (see the '' entries).
result = re.findall(r'\w*', mytext)
print(result)

['learn', '', 'to', '', 'implement', '', 'regex', '', 'in', '', 'fast', '', 'way', '']


In [158]:
# Extract each word without empty matches: + requires at least one character.
result = re.findall(r'\w+', mytext)
print(result)

['learn', 'to', 'implement', 'regex', 'in', 'fast', 'way']


In [159]:
# Extract only the first word: ^ anchors the match to the start of the string.
result = re.findall(r'^\w+', mytext)
print(result)

['learn']


In [160]:
# Extract only the last word: $ anchors the match to the end of the string.
result = re.findall(r'\w+$', mytext)
print(result)

['way']


In [163]:
# Extract consecutive, non-overlapping runs of three word characters
# (words shorter than three characters contribute nothing).
result = re.findall(r'\w\w\w', mytext)
print(result)

['lea', 'imp', 'lem', 'ent', 'reg', 'fas', 'way']


In [167]:
# Extract consecutive pairs made of a word character followed by any
# character, so a pair may end in a space.
result = re.findall(r'\w.', mytext)
print(result)

['le', 'ar', 'n ', 'to', 'im', 'pl', 'em', 'en', 't ', 're', 'ge', 'x ', 'in', 'fa', 'st', 'wa']


In [168]:
# Extract the first two characters of every word: \b pins each match to a
# word boundary.
result = re.findall(r'\b\w.', mytext)
print(result)

['le', 'to', 'im', 're', 'in', 'fa', 'wa']


In [173]:
# Comma-separated sample addresses for the e-mail examples.
email_list = 'abc.test@gmail.com, xyz@test.in, test.first@analyticsvidhya.com, first.test@rest.biz'

In [174]:
# Extract '@' together with the run of word characters that follows it
# (the match stops at the first '.').
result = re.findall(r'@\w+', email_list) 
print(result)

['@gmail', '@test', '@analyticsvidhya', '@rest']


In [179]:
# Also extract the domain suffix. The dot is escaped so that it matches only
# a literal '.'; an unescaped '.' would accept any character in that position.
result = re.findall(r'@\w+\.\w+', email_list)
print(result)

['@gmail.com', '@test.in', '@analyticsvidhya.com', '@rest.biz']


In [180]:
# Extract only the suffix that follows the domain name: with a capture group,
# findall() returns just the group. The dot is escaped so that it matches a
# literal '.' rather than any character.
result = re.findall(r'@\w+(\.\w+)', email_list)
print(result)

['.com', '.in', '.com', '.biz']


In [193]:
# Sample records, each holding a name, a number of the form NN-NNNN and a
# date of the form NN-NN-NNNN.
date_list = 'Amit 34-3456 12-05-2007, XYZ 56-4532 11-11-2011, ABC 67-8945 12-01-2009'

In [195]:
# Extract the dates: two digits, two digits and four digits joined by hyphens.
result = re.findall(r'\d{2}-\d{2}-\d{4}', date_list)
print(result) 

['12-05-2007', '11-11-2011', '12-01-2009']


In [197]:
# Extract only the years: the capture group makes findall() return just the
# four-digit part of each date.
result = re.findall(r'\d{2}-\d{2}-(\d{4})', date_list)
print(result)

['2007', '2011', '2009']


In [200]:
# Naive attempt at words starting with a vowel: without a boundary the match
# can begin at a vowel inside a word (e.g. 'earn' taken from 'learn').
result = re.findall(r'[aieuoAEIOU]\w+', mytext)
print(result)

['earn', 'implement', 'egex', 'in', 'ast', 'ay']


In [202]:
# Return only the words that start with a vowel: \b forces the vowel to sit
# at a word boundary.
result = re.findall(r'\b[aieuoAEIOU]\w+', mytext)
print(result)

['implement', 'in']


In [206]:
# Return words starting with a consonant by negating the vowel class with ^.
# The negated class also accepts a space, so matches can carry a leading space.
result = re.findall(r'\b[^aieuoAEIOU]\w+', mytext)
print(result)

['learn', ' to', ' implement', ' regex', ' in', ' fast', ' way']


In [207]:
# Return words starting with a consonant: the negated class excludes both the
# vowels and the space, so a match cannot begin with white space.
result = re.findall(r'\b[^aieuoAEIOU ]\w+', mytext)
print(result)

['learn', 'to', 'regex', 'fast', 'way']


In [210]:
# Validate phone numbers: exactly ten digits, the first one being 8 or 9.
phone_list = ['9999999999','999999-999','99999x9999']

# fullmatch() requires the whole string to fit the pattern, which replaces
# the separate length check; the pattern is compiled once outside the loop.
phone_pattern = re.compile(r'[89][0-9]{9}')

for phone in phone_list:
    if phone_pattern.fullmatch(phone):
        print('valid!')
    else:
        print('no')

valid!
no
no


In [211]:
# Split a string on several delimiters at once: ';', ',' or any white space.
line = 'asdf fjdk;afed,fjek,asdf,foo'
delimiters = re.compile(r'[;,\s]')
result = delimiters.split(line)
print(result)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']


In [None]:
# replace multiple delimiters with white space

In [216]:
# Replace every ';', ',' or white-space character with a single space.
result = re.sub(r'[;,\s]',' ', line)
print(result)

asdf fjdk afed fjek asdf foo


In [220]:
# Sample HTML to extract information from (in Python 3 such markup would be
# fetched with urllib.request, the successor of urllib2).
html_text = """
<tr align="center"><td>1</td> <td>Noah</td> <td>Emma</td></tr>
<tr align="center"><td>2</td> <td>Liam</td> <td>Olivia</td></tr>
<tr align="center"><td>3</td> <td>Mason</td> <td>Sophia</td></tr>
<tr align="center"><td>4</td> <td>Jacob</td> <td>Isabella</td></tr>
<tr align="center"><td>5</td> <td>William</td> <td>Ava</td></tr>
<tr align="center"><td>6</td> <td>Ethan</td> <td>Mia</td></tr>
<tr align="center"><td>7</td> <td HTML>Michael</td> <td>Emily</td></tr>
"""

In [221]:
# Capture the second and third cell of each row as a tuple. The pattern only
# accepts a bare <td> tag, so the row whose middle cell is written <td HTML>
# produces no match.
result = re.findall(r'<td>\w+</td>\s<td>(\w+)</td>\s<td>(\w+)</td>', html_text)
print(result)

[('Noah', 'Emma'), ('Liam', 'Olivia'), ('Mason', 'Sophia'), ('Jacob', 'Isabella'), ('William', 'Ava'), ('Ethan', 'Mia')]


#### That's all!

<hr/>