### Regular Expression

- `re.search` returns the first match found anywhere in the string.
- `re.match` matches only at the very start of the string.
- `re.findall` returns a list of all non-overlapping matches found anywhere in the string.

In [11]:
import re

# file.txt contains 'amayank'
with open('file.txt', 'r') as f:  # the context manager closes the file after reading
    content = f.read()
a = re.search('mayank', content)  # first match only, anywhere in the string
a.group()

'mayank'

In [8]:
import re

# file.txt contains 'amayank'
with open('file.txt', 'r') as f:  # the context manager closes the file after reading
    content = f.read()
a = re.match('mayank', content)  # only matches at the start of the string
print(a)  # None: the string starts with 'a', not 'mayank'

None


In [12]:
import re

with open('file.txt', 'r') as f:  # the context manager closes the file after reading
    content = f.read()
a = re.search('ayank', content)  # first match only, anywhere in the string
a.group()

'ayank'

In [13]:
# findall returns every non-overlapping match as a list of strings
a = re.findall('mayank',content) #all instances of match
print(a)

['mayank']


In [24]:
# Every occurrence is returned; the output shows that content held 'ayank'
# twice when this cell was run.
a = re.findall('ayank',content) #all instances of match
print(a)

['ayank', 'ayank']


In [72]:
with open('file.txt', 'r') as f:  # the context manager closes the file after reading
    content = f.readlines()  # list of strings, one per line
a = re.match('my', content[0])  # content[0] is the first line as a string
a.group()

'my'

In [75]:
with open('file.txt', 'r') as f:  # the context manager closes the file after reading
    content = f.read()  # entire file content in a single string
a = re.match('my', content)  # the string itself starts with 'my'
a.group()

'my'

In [76]:
# f.read() returned the whole file as one str (not a list of lines)
type(content)

str

### RE Patterns

In [18]:
# . matches any single character (except a newline by default)

content = 'cookies'
pattern = 'coo..es'
a = re.findall(pattern, content)  # 'k' and 'i' fill the two dots -> matched
a

['cookies']

In [77]:
# Display the raw text currently held in content; line breaks show up as \n
content

"my name is mayank\nI am an engineer\nI live in Delhi\nI'm married\nI like programming\nmayankk here!\nmayank here!!\na\naaa\nabab\naab\nababbb\nabcbca\nabracadabra\n\n\n"

In [84]:
# ^m -> match 'm' only at the very start of the string
# (without re.MULTILINE, ^ does not match after each newline)

pattern = '^m'
a = re.findall(pattern, content)  # matched
print(a)
b = re.findall(pattern, 'xyzmzm')  # 'm' is present but not at position 0 -> no match
print(b)

['m']
[]


In [83]:
# m$ -> match 'm' only at the end of the string
pattern_m, pattern_a = 'm$', 'a$'
a = re.findall(pattern_m, content)  # no match
print(a)
b = re.findall(pattern_a, 'bca')  # the string ends with 'a' -> matched
print(b)

[]
['a']


In [87]:
# m.. -> matches 'm' plus the next two characters, anywhere in the string.
# By default '.' does not match a newline.

pattern = 'm..'
a = re.findall(pattern, 'ma\n')  # the second '.' cannot match '\n', so no match
b = re.findall(pattern, 'mayank')  # matched
c = re.findall(pattern, 'xyzmayank')  # searched anywhere in the string, hence matched
for result in (a, b, c):
    print(result)

[]
['may']
['may']


In [89]:
# ab* matches 'a' followed by zero or more occurrences of 'b':
# a, ab, abb and so on are matched, anywhere in the string.

items = ['a','ab','abb','bab','xyz']
a_then_bs = re.compile('ab*')
for text in items:
    a = a_then_bs.findall(text)
    print(a)

['a']
['ab']
['abb']
['ab']
[]


In [91]:
# ab+ matches 'a' followed by one or more occurrences of 'b':
# ab, abb and so on are matched (a bare 'a' is not), anywhere in the string.

items = ['a','ab','abb','bab','xyz']
a_then_bs = re.compile('ab+')
for text in items:
    a = a_then_bs.findall(text)
    print(a)

[]
['ab']
['abb']
['ab']
[]


In [92]:
# ab? matches 'a' followed by zero or one occurrence of 'b':
# 'a' and 'ab' are matched; in 'abb' only the leading 'ab' is. Anywhere in the string.

items = ['a','ab','abb','bab','xyz']
a_optional_b = re.compile('ab?')
for text in items:
    a = a_optional_b.findall(text)
    print(a)

['a']
['ab']
['ab']
['ab']
[]


In [108]:
# \s -> matches any whitespace character (space, tab, newline, ...)
items = 'abambammbammmb\naccccccccccccccccccccb'
# Raw string: '\s' in a normal literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
a = re.findall(r'\s', items)
a

['\n']

In [110]:
%pprint
# \S -> matches every character that is not whitespace
items = 'abambammbammmb\naccccccccccccccccccccb'
# Raw string: '\S' in a normal literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
a = re.findall(r'\S', items)
a

Pretty printing has been turned OFF


['a', 'b', 'a', 'm', 'b', 'a', 'm', 'm', 'b', 'a', 'm', 'm', 'm', 'b', 'a', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'b']

In [111]:
# \W -> matches every character that is not a word character (0 to 9, a to z, A to Z, _)
items = 'abambammbammmb\naccccccccccccccccccccb'
# Raw string: '\W' in a normal literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
a = re.findall(r'\W', items)
a

['\n']

In [112]:
# \w -> matches any word character (0 to 9, a to z, A to Z, _)
items = 'ammmb\naccccb'
# Raw string: '\w' in a normal literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
a = re.findall(r'\w', items)
a

['a', 'm', 'm', 'm', 'b', 'a', 'c', 'c', 'c', 'c', 'b']

In [16]:
# [a-zA-Z0-9] matches any single letter (a to z, A to Z) or digit (0 to 9).
# A leading ^ inside the brackets complements the set: [^a-zA-Z0-9] matches
# every character that is NOT in those ranges.

item1 = '$$$$%^$^&&&&'
item2 = 'a%'
alnum, non_alnum = '[a-zA-Z0-9]', '[^a-zA-Z0-9]'
a = re.findall(alnum, item1)      # no letters or digits in item1
b = re.findall(alnum, item2)      # only 'a' qualifies
c = re.findall(non_alnum, item1)  # every symbol in item1
a, b, c

([], ['a'], ['$', '$', '$', '$', '%', '^', '$', '^', '&', '&', '&', '&'])

In [113]:
# [ab] -> matches a single character that is either 'a' or 'b'
items = 'abambammbammmb\naccccccccccccccccccccb'
a_or_b = re.compile('[ab]')
a = a_or_b.findall(items)
a

['a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b']

In [114]:
# [0-9] -> matches any single digit in this range
items = 'abambammbammmb\naccccccccccccccccccccb0i1'
digit = re.compile('[0-9]')
a = digit.findall(items)
a

['0', '1']

In [115]:
# [0-5][0-9] -> matches a two-character sequence from '00' to '59':
# a digit 0-5 immediately followed by a digit 0-9
items = 'abambammbammmb\naccccccccccccccccccccb0i1a55'
two_digits = re.compile('[0-5][0-9]')
a = two_digits.findall(items)  # the lone '0' and '1' are not followed by a digit
a

['55']

In [123]:
# A|B -> match either regex A or regex B. At each position A is tried first;
# if A matches there, B isn't tried.
items = 'abambammbammmb\naccccccccccccccccccccb0i1'
digit_or_b = re.compile('[0-9]|[b]')
a = digit_or_b.findall(items)
a
# The string is scanned left to right, so matches come out in the order they
# appear in the string: every 'b' first, then '0' and '1' near the end.

['b', 'b', 'b', 'b', 'b', '0', '1']