# Regular Expressions for Lexical Processing

In [1]:
import re

### Search for patterns

In [2]:
# Locate the first occurrence of the literal "Lohith" in the sentence.
re.search(pattern="Lohith", string="My best friend is Lohith")

<re.Match object; span=(18, 24), match='Lohith'>

In [3]:
# Keep the match object so the matched text can be pulled out of it;
# group 0 is the whole match.
match = re.search("Lohith", "My best friend is Lohith")
match.group(0)

'Lohith'

### Quantifiers

In [4]:
def pattern_search(pattern, text):
    """Print the first match of ``pattern`` found in ``text``.

    Prints the match object returned by ``re.search``, or ``None`` when the
    pattern does not occur anywhere in ``text``. Nothing is returned.
    """
    first_hit = re.search(pattern, text)
    print(first_hit)

#### `*` means zero or more repetitions of the preceding character: `ab*` matches {a} {ab} {abb} {abbb...}; since `*` comes right after b, only b is repeated

In [130]:
pattern_search("ab*", "ac")     # zero b's after the a -> matches 'a'
pattern_search("ab*", "abc")    # one b -> matches 'ab'
pattern_search("abb*", "abbc")  # one required b plus one repeated b -> 'abb'
pattern_search("ac*", "bd")     # no 'a' anywhere in the text -> None

<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 2), match='ab'>
<re.Match object; span=(0, 3), match='abb'>
None


#### `?` makes the preceding character optional (zero or one occurrence): e.g. `ab?` matches {a} {ab}, and `a?` on its own can also match the empty string {}

In [16]:
pattern_search("a?", "ac")      # optional a is present -> 'a'
pattern_search("a?", "bc")      # optional a is absent -> empty match at position 0
pattern_search("ab?", "abc")    # optional b is present -> 'ab'
pattern_search("abc?", "abcc")  # at most one c is consumed -> 'abc'
pattern_search("ab?", "ac")     # optional b is absent -> 'a'

<re.Match object; span=(0, 1), match='a'>
<re.Match object; span=(0, 0), match=''>
<re.Match object; span=(0, 2), match='ab'>
<re.Match object; span=(0, 3), match='abc'>
<re.Match object; span=(0, 1), match='a'>


#### `+` means one or more repetitions of the preceding character: `a+` matches {a} {aa} {aaa...}

In [171]:
pattern_search("a+", "abc")    # a single a -> 'a'
pattern_search("a+", "bcd")    # at least one a is required -> None
pattern_search("a+", "aab")    # both leading a's are consumed -> 'aa'
pattern_search("ab+", "abbb")  # a followed by every b -> 'abbb'
pattern_search("b+", "abbc")   # the match starts at the first b -> 'bb'

<re.Match object; span=(0, 1), match='a'>
None
<re.Match object; span=(0, 2), match='aa'>
<re.Match object; span=(0, 4), match='abbb'>
<re.Match object; span=(1, 3), match='bb'>


#### ab{2} means b repeating exactly 2 times

In [23]:
pattern_search("ab{2}", "ab")   # only one b available -> None
pattern_search("ab{2}", "abb")  # exactly two b's -> 'abb'

None
<re.Match object; span=(0, 3), match='abb'>


#### ab{2,5} means b occurring a minimum of 2 times and a maximum of 5 times

In [30]:
pattern_search("b{2,5}", "bbbb")       # four b's fall inside the 2-5 range -> 'bbbb'
pattern_search("ab{3,6}", "abababab")  # never three b's in a row -> None
pattern_search("ab{3,6}", "abbbb")     # a followed by four b's -> 'abbbb'

<re.Match object; span=(0, 4), match='bbbb'>
None
<re.Match object; span=(0, 5), match='abbbb'>


### Anchors

#### ^A checks whether the text starts with A
#### B$ checks whether the text ends with the character B

In [42]:
pattern_search("^J", "Johny")     # text starts with J -> 'J'
pattern_search("e$", "Google")    # text ends with e -> the final 'e'
pattern_search("^S", "Pratheek")  # text does not start with S -> None

<re.Match object; span=(0, 1), match='J'>
<re.Match object; span=(5, 6), match='e'>
None


### Wildcards

In [123]:
pattern_search(".", "abcd")  # "." stands for a single character -> first one, 'a'


<re.Match object; span=(0, 1), match='a'>


### Whitespaces

In [68]:
pattern_search(" +", "Loh ith")  # one or more spaces -> the single space at index 3

<re.Match object; span=(3, 4), match=' '>


### Other searches

In [43]:
pattern_search("(abc)", "abcabcabc")  # the group is matched once -> first 'abc'

<re.Match object; span=(0, 3), match='abc'>


In [44]:
pattern_search("(abc){1,3}", "abcabcabcabc")  # group repeated at most 3 times -> 'abcabcabc'

<re.Match object; span=(0, 9), match='abcabcabc'>


In [45]:
pattern_search("(001)+", "001001001")  # group repeated one or more times -> whole string

<re.Match object; span=(0, 9), match='001001001'>


In [54]:
pattern_search("(SBI|HDFC) bank", "SBI and HDFC bank are good")   # only HDFC is followed by " bank"
pattern_search("(SBI|HDFC) bank", "SBI bank HDFC bank are good")  # first hit in the text -> 'SBI bank'

<re.Match object; span=(8, 17), match='HDFC bank'>
<re.Match object; span=(0, 8), match='SBI bank'>


In [70]:
# Raw string: "\(" is not a valid Python string escape and triggers a
# SyntaxWarning on Python 3.12+ (DeprecationWarning before that). With r"..."
# the backslashes reach the regex engine untouched, where they escape the
# literal parentheses.
pattern_search(r"\(100\)", "101(100)")

<re.Match object; span=(3, 8), match='(100)'>


In [77]:
# Raw strings keep the regex escapes (\+ and \*) away from Python's own string
# escape handling, which warns about unknown sequences such as "\+" or "\=".
# "=" and "%" are not regex metacharacters, so escaping them is optional; the
# escapes are kept here so the patterns stay identical to the originals.
pattern_search(r".\+.\=.+", "7+7=14")
pattern_search(r".\%.\=.+", "3%2=1")
pattern_search(r".+\*.+", "3a*4b")
pattern_search(r".\*.\*.\=.+", "4*5*6=120")

<re.Match object; span=(0, 6), match='7+7=14'>
<re.Match object; span=(0, 5), match='3%2=1'>
<re.Match object; span=(0, 5), match='3a*4b'>
<re.Match object; span=(0, 9), match='4*5*6=120'>


### Flags

#### Ignore case sensitivity by using flags=re.I
#### re.M (multiline) makes ^ and $ match at the start and end of each line

In [79]:
# re.IGNORECASE is the long name of re.I: letter case is ignored while matching.
re.search("lohith", "LOHITH", flags=re.IGNORECASE)

<re.Match object; span=(0, 6), match='LOHITH'>

### Character Search

#### [a-c] matches any single character in the range a to c (inclusive)

In [86]:
pattern_search("[a-c]", "lohith c n")  # first character within a-c is the 'c' at index 7
# With the ignore-case flag the class also accepts upper-case letters, so 'C' is found.
re.search("[a-c]", "LOHITH C N", flags=re.IGNORECASE)

<re.Match object; span=(7, 8), match='c'>


<re.Match object; span=(7, 8), match='C'>

#### Want to select everything other than a-c

In [95]:
pattern_search("[^a-c]", "Lohith")  # ^ inside [] negates the class -> first char outside a-c is 'L'

<re.Match object; span=(0, 1), match='L'>


### Shortcuts

#### \s Whitespace characters
#### \S Anything other than whitespace
#### \d [0-9]
#### \D Other than [0-9]
#### \w [a-zA-Z0-9_] (letters, digits and underscore)
#### \W Other than [a-zA-Z0-9_]

In [100]:
pattern_search("[a-z]+ed", "ted bed red")  # lower-case letters followed by "ed" -> first hit 'ted'

<re.Match object; span=(0, 3), match='ted'>


In [107]:
# Raw string so that \d reaches the regex engine instead of being treated by
# Python as an (invalid) string escape. The character class is written without
# the inner space: "[a-z A-Z]" would also accept a literal space as one of the
# 1-10 leading characters.
pattern_search(r"[a-zA-Z]{1,10}\d{4}", "Lohtih1234")

<re.Match object; span=(0, 10), match='Lohtih1234'>


#### `\w{3,10}` matches 3 to 10 word characters (letters, digits or underscore)

In [127]:
# Raw string keeps \w intact for the regex engine (in a normal string "\w" and
# "\_" are invalid Python escapes). "_" is an ordinary character in a regex and
# needs no backslash.
pattern_search(r"\w{3,10}_.+", "lohith_204")

<re.Match object; span=(0, 10), match='lohith_204'>


### Non-Greedy Approach (adding `?` after a quantifier makes it match as few characters as possible)

In [167]:
pattern_search("bat*?", "batsman")  # lazy *? takes zero t's -> 'ba'

<re.Match object; span=(0, 2), match='ba'>


In [170]:
pattern_search("bat{1,3}?", "battsman")  # lazy {1,3}? stops at the minimum of one t -> 'bat'

<re.Match object; span=(0, 3), match='bat'>


### Other RE functions

#### re.match  this will match at the beginning of the string
#### re.search  this will search anywhere in the string
#### re.sub is find and replace (find pattern,replace text,data)
#### re.findall will find all patterns that are matched

In [172]:
# Sample address used as input for the re.sub example.
street = "21 Ramakrishna Road"

In [184]:
# Raw string: "\w" is not a valid Python string escape (SyntaxWarning on
# Python 3.12+). Every run of "Ra" followed by word characters is replaced
# with "rd".
re.sub(r"Ra\w+", "rd", street)

'21 rd Road'

### Finditer (to get start and end positions)

In [185]:
# Sample sentence and the literal pattern whose occurrences are located in it.
word = "Rose is red in color, Taj mahal is white in color"
pattern = "color"

In [195]:
# Walk through every occurrence of the pattern and report where it begins
# and ends; span() returns the (start, end) pair of the whole match.
for match in re.finditer(pattern, word):
    begin, finish = match.span()
    print("Start:", begin, end=" ")
    print("End:", finish)

Start: 15 End: 20
Start: 44 End: 49
