### Regex - 
A regex (regular expression) is a way to define a pattern for searching or manipulating strings. We can use regular expressions to match, search, replace, and otherwise manipulate textual data.


* Python's `re` module exports the following functions:
        1. match     Match a regular expression pattern to the beginning of a string.
        2. search    Search a string for the presence of a pattern.
        3. sub       Substitute occurrences of a pattern found in a string.
        4. subn      Same as sub, but also return the number of substitutions made.
        5. split     Split a string by the occurrences of a pattern.
        6. findall   Find all occurrences of a pattern in a string.
        7. compile   Compile a pattern into a Pattern object.
        
        
* Common use cases:
            Searching and replacing text in files
            Validating text input, such as passwords and email addresses
            Renaming a hundred files at a time. For example, you can change the extension of all files using a regex pattern
            


In [None]:
# \w – matches a word character
# \d – matches digit character
# \s – matches whitespace character (space, tab, newline, etc.)


<!-- 
            \w – matches a word character
            \d – matches digit character
            \s – matches whitespace character (space, tab, newline, etc.)
            \b – matches a word boundary (a zero-width position between a word character and a non-word character) -->
                                                                    

<h1>  Regex Metacharacters and Operators </h1>

In [2]:
import re

In [3]:


target_string = "Emma is a Python developer \n Emma also knows ML and AI"

# caret (^) anchors the match at the beginning of the string;
# \w{4} then takes exactly four word characters
result = re.search(r"^\w{4}", target_string)
# group() returns the text that was matched
print(result.group())



Emma


In [11]:


str1 = "Emma is a Python developer \nEmma also knows ML and AI"
# dollar sign ($) anchors the match at the end of the string;
# \w{2}$ therefore picks the last two word characters
result = re.search(r"\w{2}$", str1)
print(result.group())



AI


In [17]:


str1 = "Numbers are 8,23, 886, 4567, 78453"
# asterisk (*) matches 0 or more repetitions of the preceding item, so
# \d\d* is one digit followed by zero or more digits (equivalent to \d+)

result = re.findall(r"\d\d*", str1)
print(result)


['8', '23', '886', '4567', '78453']


In [18]:


str1 = "Emma is a Python developer. Emma also knows ML and AI."
# [edk] matches any single character that is e, d or k (case-sensitive)
res = re.findall(r"[edk]", str1)
print(res)



['d', 'e', 'e', 'e', 'k', 'd']


In [19]:
target_string = "Emma is a basketball player who was born on June 17"
# re.match only tries the pattern at the very start of the string
result = re.match(r"\w{4}", target_string) 

# printing the Match object (shows the span and the matched text)
print("Match object: ", result)


# Extract match value
print("Match value: ", result.group())



Match object:  <re.Match object; span=(0, 4), match='Emma'>
Match value:  Emma


In [20]:
s = "Welcome to   Regex    Programming   using   Python"

# Split twice: once on every single whitespace character (runs of spaces
# leave empty strings behind) and once on whole runs of whitespace.
lstVal, lstVal2 = (re.split(sep, s) for sep in (r'\s', r'\s+'))

print(f'the value of s               : {s}')
print(f'Regex Split value of s        :{lstVal}')
print(f'Regex Split value of s        :{lstVal2}')

the value of s               : Welcome to   Regex    Programming   using   Python
Regex Split value of s        :['Welcome', 'to', '', '', 'Regex', '', '', '', 'Programming', '', '', 'using', '', '', 'Python']
Regex Split value of s        :['Welcome', 'to', 'Regex', 'Programming', 'using', 'Python']


In [21]:
s = "Welcome to   Regex    Programming   using   Python"

print(f'the value of s               : {s}')

lstVal = re.split(r'[a-f]', s)
# [a-f] - any single character from a to f (a, b, c, d, e, f); each one found acts as a separator
print(f'Regex Split value of s        :{lstVal}')

the value of s               : Welcome to   Regex    Programming   using   Python
Regex Split value of s        :['W', 'l', 'om', ' to   R', 'g', 'x    Progr', 'mming   using   Python']


In [29]:
# Split on any ONE character that falls in a-f or in l-n.
# [a-f] -> a b c d e f
# [l-n] -> l m n
# Ranges are written back to back inside the brackets; a comma placed between
# them would be a literal ',' to match, not a separator between the ranges.
lstValues = re.split(r"[a-fl-n]", s)

In [30]:
lstValues
# display the split result; empty strings appear wherever two separator characters are adjacent

['W',
 '',
 '',
 'o',
 '',
 ' to   R',
 'g',
 'x    Progr',
 '',
 '',
 'i',
 'g   usi',
 'g   Pytho',
 '']

##  Extract the numbers

In [31]:
# sample address text mixing words with numbers of different lengths
address = "78 Hi 11    89 Main, 4th Cross, 123, Road, Marathalli, 5678 Bangalore, 560023 67893"

In [32]:
# \d+ grabs every run of one or more consecutive digits (the '4' comes from "4th")
add_nos = re.findall(r'\d+', address)

print(f'sorting the only nums from address {add_nos}')

sorting the only nums from address ['78', '11', '89', '4', '123', '5678', '560023', '67893']


In [33]:
## now i want nos with exactly 6 digits
# REAL LIFE APPLICATION - EXTRACTING THE PIN CODES
# \b on both sides keeps a longer number (7+ digits) from contributing its first six digits

add_dig = re.findall(r'\b\d{6}\b', address)

print(f'sorting the only nums from address {add_dig}')

sorting the only nums from address ['560023']


In [34]:
## now i want nos with 1-3 digit only
# NOTE(review): \d{1,3} has no boundaries, so longer numbers are not skipped;
# they are chopped into pieces of up to three digits (5678 -> '567', '8').
# Something like r'\b\d{1,3}\b' would be needed to keep only whole 1-3 digit numbers.

add_digs = re.findall(r'\d{1,3}', address)

print(f'sorting the only nums from address {add_digs}')

sorting the only nums from address ['78', '11', '89', '4', '123', '567', '8', '560', '023', '678', '93']


In [36]:

# sample HTML page text containing several dotted IPv4-style addresses
s = '''
<html>
<head>
<title>Current IP Address Allocations
</title>
</head>
<body>
IP Address are 172.45.78.109
LoopBack Address: 127.0.0.1
Computer 1: 10.67.89.101
Computer 2: 11.67.98.102
Computer 3: 12.68.98.102
</body>
</html>
'''

In [37]:
# Four groups of 1-3 digits separated by literal dots.
# Each dot is escaped: a bare '.' matches ANY character, so text such as
# "172a45b78c109" would also be accepted; '\.' matches only a real dot.
ip_s=re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}',s)

print(f'ip address are -: {ip_s}')

ip address are -: ['172.45.78.109', '127.0.0.1', '10.67.89.101', '11.67.98.102', '12.68.98.102']


In [38]:
### first octet is 10 or 11 - written as a range: 1 followed by one character in [0-1]
ip_s1=re.findall(r"1[0-1]\.\d{1,3}\.\d{1,3}\.\d{1,3}", s)
print(f'ip address are -: {ip_s1}')

ip address are -: ['10.67.89.101', '11.67.98.102']


In [39]:
### first octet is 10 or 11 - written as an alternation
# Inside [...] the '|' is a literal pipe character ([0|1] also accepts "1|"),
# so the either/or is spelled with a group instead. The group is non-capturing
# (?:...) so that findall still returns the whole match.
ip_s1=re.findall(r"1(?:0|1)\.\d{1,3}\.\d{1,3}\.\d{1,3}", s)
print(f'ip address are -: {ip_s1}')

ip address are -: ['10.67.89.101', '11.67.98.102']


In [40]:
### first octet is 10 or 11 - written as a set: 1 followed by one character that is 0 or 1
ip_s1=re.findall(r"1[01]\.\d{1,3}\.\d{1,3}\.\d{1,3}", s)
print(f'ip address are -: {ip_s1}')

ip address are -: ['10.67.89.101', '11.67.98.102']


In [41]:
### first octet is the literal text 10 only
ip_s0=re.findall(r"10\.\d{1,3}\.\d{1,3}\.\d{1,3}", s)
print(f'ip address are -: {ip_s0}')

ip address are -: ['10.67.89.101']


In [42]:
print("Find all matches for format Month day")

# The same sentence is scanned three times; only the capture groups differ.
dates_text = "These are the match dates June 24, August 9, Dec 12"

# No groups: findall returns each whole match.
matches = re.findall(r"[A-Z][a-z]+\s\d{1,2}", dates_text)
print(f'gives Month Date format - {matches}')

# One group: findall returns only the captured day number.
matches = re.findall(r"[A-Z][a-z]+\s(\d{1,2})", dates_text)
print(f'gives Date format - {matches}')

# Two groups: findall returns a (month, day) tuple per match.
matches = re.findall(r"([A-Z][a-z]+)\s(\d{1,2})", dates_text)
print(f'gives tuple of Month & Date format - {matches}')

Find all matches for format Month day
gives Month Date format - ['June 24', 'August 9', 'Dec 12']
gives Date format - ['24', '9', '12']
gives tuple of Month & Date format - [('June', '24'), ('August', '9'), ('Dec', '12')]


In [43]:
s = "purple alice@google.com abcde helloab@abc.com ---@gmail.com 23@gmail.com my23@gmail.com _@gmail.com"

# an empty pattern matches the empty string at every position,
# so findall returns len(s) + 1 empty strings - a real pattern is needed
emails = re.findall(r"", s)
print(emails)

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']


In [44]:
s = "purple alice@google.com abcde helloab@abc.com ---@gmail.com 23@gmail.com my23@gmail.com _@gmail.com"

# word characters, '@', word characters, a literal dot, word characters;
# '---@gmail.com' is not returned because '-' is not a word character
emails = re.findall(r"\w+@\w+\.\w+", s)
print(emails)

['alice@google.com', 'helloab@abc.com', '23@gmail.com', 'my23@gmail.com', '_@gmail.com']


In [45]:
s = "purple alice@google.com abcde helloab@abc.com ---@gmail.com 23@gmail.com my23@gmail.com _@gmail.com"

# [A-Za-z]+@ requires one or more letters immediately before the '@',
# so addresses whose '@' is preceded by a digit, '-' or '_' are not returned
emails = re.findall(r"[A-Za-z]+@\w+\.\w+", s)
print(f'starts with alphabets only {emails}')

# \w => A-Za-z0-9_

starts with alphabets only ['alice@google.com', 'helloab@abc.com']


In [46]:
s = "purple alice@google.com abcde helloab@abc.com ---@gmail.com 23@gmail.com my23@gmail.com _@gmail.com"

# \d+@ requires digits immediately before the '@'.
# NOTE(review): the printed label below still says "alphabets" although this
# pattern selects digits. No anchor or boundary is used either, so the '23'
# inside "my23@gmail.com" matches too, which is why '23@gmail.com' appears twice.
emails = re.findall(r"\d+@\w+\.\w+", s)
print(f'starts with alphabets only {emails}')

# \w => A-Za-z0-9_

starts with alphabets only ['23@gmail.com', '23@gmail.com']


## Different Functions in Regex

### findall

    * Returns list of all the matches

In [49]:
st = 'In a world where you can be anything, be kind'

# a plain letter is matched literally: one list entry per 'e' in the text
rslt = re.findall('e',st)
rslt

['e', 'e', 'e', 'e']

In [50]:
# findall returns an empty list when the pattern does not occur at all
rslt1 = re.findall('xyz',st)
rslt1

[]

In [51]:
# the literal two-character sequence 'be'
rslt2 = re.findall('be',st)
rslt2

['be', 'be']

In [52]:
# [be] - any single character that is either b or e
rslt21 = re.findall('[be]',st)
rslt21

['e', 'e', 'b', 'e', 'b', 'e']

In [53]:
# each . stands for any one character (except a newline): 'wo' plus the next 8 characters
rslt5 = re.findall('wo........',st)
rslt5

['world wher']

In [54]:
# any two characters followed by the literal text 'rld'
rslt6 = re.findall('..rld',st)
rslt6

['world']

In [55]:
# zero or more occurrences of w - findall reports an empty string at every position where there is no w
rslt6 = re.findall('w*',st)
print(rslt6)

['', '', '', '', '', 'w', '', '', '', '', '', 'w', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']


In [56]:
# zero or more occurrences of e - an empty string is reported wherever there is no e
new_st = 'Friend in need is friend in deed'

nr = re.findall('e*',new_st)
print(nr)

['', '', '', 'e', '', '', '', '', '', '', '', 'ee', '', '', '', '', '', '', '', '', 'e', '', '', '', '', '', '', '', 'ee', '', '']


In [57]:
# an e followed by zero or more further e's (i.e. one or more e's in a row);
# because at least one e is required, no empty strings are returned
new_st = 'Friend in need is friend in deed'

nr1 = re.findall('ee*',new_st)
print(nr1)

['e', 'ee', 'e', 'ee']


In [58]:
# an e followed by one or more further e's (i.e. at least two e's in a row)
new_st = 'Friend in need is friend in deed'

nr2 = re.findall('ee+',new_st)
print(nr2)

['ee', 'ee']


In [59]:
# exact number of occurrences - {n}: e{2} is exactly two e's in a row

nr3 = re.findall('e{2}',new_st)
nr3

['ee', 'ee']

In [61]:
# exact number of occurrences - {n}: e{3} needs three e's in a row, which the text does not contain

nr3 = re.findall('e{3}',new_st)
nr3

[]

In [62]:
# findall - whitespace characters, one list entry per character.
# The pattern is a raw string so that \s reaches the regex engine unchanged;
# in a plain string '\s' is an invalid escape sequence and triggers a warning.
nr4 = re.findall(r'\s',new_st)
nr4

[' ', ' ', ' ', ' ', ' ', ' ']

In [63]:
# findall - every character that is NOT whitespace, one list entry per character.
# Raw string: a plain '\S' is an invalid string escape and triggers a warning.
nr5 = re.findall(r'\S',new_st)
print(nr5)

['F', 'r', 'i', 'e', 'n', 'd', 'i', 'n', 'n', 'e', 'e', 'd', 'i', 's', 'f', 'r', 'i', 'e', 'n', 'd', 'i', 'n', 'd', 'e', 'e', 'd']


In [64]:
# findall - every single digit, one list entry per digit
new_st2 = 'Friend in need is 23 friend in 453214 deed'
# Raw string: a plain '\d' is an invalid string escape and triggers a warning.
nr6 = re.findall(r'\d',new_st2)
print(nr6)

['2', '3', '4', '5', '3', '2', '1', '4']


In [65]:
# findall - one or more digits: each run of consecutive digits becomes one entry
new_st2 = 'Friend in need is 23 friend in 453214 deed'
# Raw string: a plain '\d+' contains an invalid string escape and triggers a warning.
nr6 = re.findall(r'\d+',new_st2)
print(nr6)

['23', '453214']


In [66]:
# findall - every character that is NOT a digit (letters and spaces here)
new_st2 = 'Friend in need is 23 friend in 453214 deed'
# Raw string: a plain '\D' is an invalid string escape and triggers a warning.
nr6 = re.findall(r'\D',new_st2)
print(nr6)

['F', 'r', 'i', 'e', 'n', 'd', ' ', 'i', 'n', ' ', 'n', 'e', 'e', 'd', ' ', 'i', 's', ' ', ' ', 'f', 'r', 'i', 'e', 'n', 'd', ' ', 'i', 'n', ' ', ' ', 'd', 'e', 'e', 'd']


In [67]:
# findall - word characters (letters, digits, underscore), one entry per character;
# spaces are not word characters, so they are left out
new_st2 = 'Friend in need is 23 friend in 453214 deed'
# Raw string: a plain '\w' is an invalid string escape and triggers a warning.
nr6 = re.findall(r'\w',new_st2)
print(nr6)

['F', 'r', 'i', 'e', 'n', 'd', 'i', 'n', 'n', 'e', 'e', 'd', 'i', 's', '2', '3', 'f', 'r', 'i', 'e', 'n', 'd', 'i', 'n', '4', '5', '3', '2', '1', '4', 'd', 'e', 'e', 'd']


In [68]:
# findall - every character that is NOT a word character (only the spaces here)
new_st2 = 'Friend in need is 23 friend in 453214 deed'
# Raw string: a plain '\W' is an invalid string escape and triggers a warning.
nr6 = re.findall(r'\W',new_st2)
print(nr6)

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


In [69]:
# findall - sets - [23] matches one character that is either 2 or 3
new_st2 = 'Friend in need is 23 friend in 453214 deed'
nr6 = re.findall('[23]',new_st2) # 2,3
print(nr6)

['2', '3', '3', '2']


In [70]:
# findall - sets - [23][45] matches two characters: a 2 or 3 followed by a 4 or 5
# (24, 25, 34 or 35); the text contains none of these pairs, so the result is empty
new_st2 = 'Friend in need is 23 friend in 453214 deed'
nr6 = re.findall(r'[23][45]',new_st2)
print(nr6)

[]


In [71]:
# findall - sets - [3-9] matches any single digit from 3 to 9
new_st2 = 'Friend in need is 23 friend in 453214 deed'
nr6 = re.findall('[3-9]',new_st2)
print(nr6)

['3', '4', '5', '3', '4']


### search

In [72]:
st = 'In a world where you can be anything, be kind'

# re.search returns a Match object for the first place the pattern occurs
match_object = re.search('In',st)
print(f'type is object {match_object}')

type is object <re.Match object; span=(0, 2), match='In'>


In [73]:
# 'be' occurs twice in the text; search reports only the first occurrence
match_object1 = re.search('be',st)
print(f'type is object {match_object1}')

type is object <re.Match object; span=(25, 27), match='be'>


In [74]:
# start() - index at which the match begins
match_object.start()

0

In [75]:
# start() - index at which the match begins
match_object1.start()

25

In [76]:
# span() - (start, end) indices of the match
match_object.span()

(0, 2)

In [77]:
# span() - (start, end) indices of the match
match_object1.span()

(25, 27)

In [78]:
source_str  = 'we need to inform him with the latest information'

# 'inform' appears twice (also inside 'information'); search stops at the first one
info = re.search('inform', source_str)
info

<re.Match object; span=(11, 17), match='inform'>

In [79]:
# search returns None when nothing matches, so the result can be used directly as a condition
if re.search('inform', source_str):
    print('inform is there')

inform is there


In [None]:
# '\\' in a normal string literal is a single backslash, so the text is: here is \kane
randomstr = 'here is \\kane'

print(randomstr)

# in the raw-string pattern, \\ matches one literal backslash
re.search(r'\\kane',randomstr)

### split


In [80]:
# split the sentence at every single space
r = re.split(' ',st)
r

['In', 'a', 'world', 'where', 'you', 'can', 'be', 'anything,', 'be', 'kind']

In [81]:
# split at every 'e'; the separator itself is dropped from the pieces
r1 = re.split('e',st)
r1

['In a world wh', 'r', ' you can b', ' anything, b', ' kind']

In [82]:
# max split: stop after the first 2 splits; the rest of the string stays in one piece.
# maxsplit is passed by keyword - passing it positionally is deprecated in newer Python versions.
r12 = re.split('e',st,maxsplit=2)
r12 

['In a world wh', 'r', ' you can be anything, be kind']

### sub - substitute

    * re.sub(pattern, replacement, source_str) - returns source_str with every match of the pattern replaced by the replacement

In [83]:
# replace every 'e' with 'E'
sb = re.sub('e','E',st)
sb

'In a world whErE you can bE anything, bE kind'

In [84]:
# max number of occurrences to be substituted: only the first 2 matches are replaced.
# count is passed by keyword - passing it positionally is deprecated in newer Python versions.
sb1 = re.sub('e','E',st,count=2)
sb1

'In a world whErE you can be anything, be kind'

### Compile

In [85]:
a = 'hat mat rat pat '

# compile the pattern once into a reusable Pattern object;
# [r]at is a one-letter set followed by 'at', so it matches only "rat"
reg = re.compile('[r]at')
reg

re.compile(r'[r]at', re.UNICODE)

In [86]:
# the compiled Pattern has its own sub(): sub(replacement, string)
rplce = reg.sub('FOOD',a)
rplce

'hat mat FOOD pat '

In [87]:
# same replacement without compiling first, using the module-level re.sub

rplc = re.sub('rat','FOOD',a)
rplc

'hat mat FOOD pat '

### working with white spaces

In [88]:
# a triple-quoted string keeps its line breaks as \n characters
chelsea = '''keep the blue flag
flying high
chelsa
'''
chelsea

'keep the blue flag\nflying high\nchelsa\n'

In [89]:
# replace every newline with a space to get a single line of text
new_str = re.sub('\n',' ',chelsea)
new_str

'keep the blue flag flying high chelsa '

In [90]:
# other method: compile the newline pattern first, then call sub() on the Pattern object

comp = re.compile('\n')

new =comp.sub(' ',chelsea)
new

'keep the blue flag flying high chelsa '

* \b : backspace (inside a regular Python string; in a regex pattern `\b` is a word boundary)
* \f : form feed
* \r : carriage return
* \t : tab
* \v : vertical tab


In [92]:
phone_no = ''' 
444-122-1234
123-122-78999
111-123-23
67-7890-2019
'''
# 3 digits at the start and in the middle, exactly 4 digits at the end.
# The \b word boundaries stop a longer digit run from matching partially
# (without them 123-122-78999 would yield '123-122-7899').

reg = re.findall(r'\b\d{3}-\d{3}-\d{4}\b',phone_no)
reg

['444-122-1234']

In [93]:
import re

target_string = "Jessa loves Python and pandas"
# Match a six-letter word: exactly six word characters with a word boundary
# (\b) on each side, so the first six letters of a longer word do not count.
pattern = r"\b\w{6}\b"

# match() only tries the pattern at the start of the string; the first
# word has five letters, so there is no match and None is printed
result = re.match(pattern, target_string)
print(result)


# search() scans the whole string and returns the first match
result = re.search(pattern, target_string)
print(result.group()) 


# findall() returns every match as a list of strings
result = re.findall(pattern, target_string)
print(result) 


None
Python
['Python', 'pandas']


In [None]:
# task 36
# more regex examples

# task 37
# (list of topics, kept as a bare string literal that is never used)
'''
html
java script
css
anconda install jupyter 
'''