`findall()` finds all occurrences of a pattern

`search()` returns the first occurrence of a pattern

`sub()` replaces the text matched by a pattern

`compile()` compiles a pattern into a reusable pattern object

In [2]:
import re

string = '10.5     0.25     12534     std::cout(), std::vector'

# Module-level functions: the pattern is supplied on every call.
# `count` caps how many occurrences sub() replaces (here only the first one).
print(
    re.search(r'std', string),
    re.findall(r'std::', string),
    re.sub(r'std::', '', string, count=1),
    sep='\n',
)

# Compiled pattern: the same three operations are available as methods.
pattern = re.compile(r'std::')
for lookup in (pattern.search, pattern.findall):
    print(lookup(string))
print(pattern.sub('', string, count=1))

# group() returns the matched text, i.e. string[match.start():match.end()].
match = pattern.search(string)
print(match.group())

<re.Match object; span=(28, 31), match='std'>
['std::', 'std::']
10.5     0.25     12534     cout(), std::vector
<re.Match object; span=(28, 33), match='std::'>
['std::', 'std::']
10.5     0.25     12534     cout(), std::vector
std::


'|' (pipe) means 'or'
'.' (dot) matches any character except '\n'
'[]' (brackets) define a character set

In [3]:
string = 'std::cout(), Cout()'

# The left alternative is tried first and '.+' is greedy, so the match
# includes the 'std::' prefix; the capitalised 'Cout' is not matched because
# the pattern is case-sensitive.
print([found.group() for found in re.finditer(r'.+cout|cout', string)])

['std::cout']


In [4]:
# The set [Cc] accepts either case for the first letter.
print([found.group() for found in re.finditer(r'[Cc]out', string)])

['cout', 'Cout']


In [5]:
new_string = 'cat, bat, rat, Cat, Bat, Rat'

# (pattern, flags) pairs: an explicit set, a lowercase range, a range covering
# both cases, and the lowercase range made case-insensitive with re.I.
examples = [
    (r'[bc]at', 0),
    (r'[a-z]at', 0),
    (r'[a-zA-Z]at', 0),
    (r'[a-z]at', re.I),
]
for regex, regex_flags in examples:
    print(re.findall(regex, new_string, flags=regex_flags))

['cat', 'bat']
['cat', 'bat', 'rat']
['cat', 'bat', 'rat', 'Cat', 'Bat', 'Rat']
['cat', 'bat', 'rat', 'Cat', 'Bat', 'Rat']


## Quantifiers
'*' 0 or more times

'+' 1 or more times

'?' 0 or 1 times

'{n}' n times

'{a,b}' a to b times

'{,n}' 0 to n times

'{n,}' n or more times

In [6]:
# '.+::' is greedy: it takes everything up to the last '::' it can reach.
scope_prefix = re.compile(r'.+::')
print(scope_prefix.findall(string))

# Drop any 'xxx::' prefixes and normalise 'Cout' / 'cout' to 'cout'.
any_cout = re.compile(r'(.+::)*[cC]out')
print(any_cout.sub('cout', string))

['std::']
cout(), cout()


## Greedy quantifiers

'?' after a quantifier (e.g. '*?', '+?') makes it non-greedy / lazy; quantifiers are greedy by default

In [7]:
html_text = '<p>frase1</p> <p></p> <div>frase3</div>'

# '[pdiv]{1,3}' is a character set, so it would also accept bogus tags such as
# '<vid>' or '<ppp>'; the alternation (?:p|div) matches only the real tag names.
# '.*?' is lazy: it stops at the first closing tag instead of the last one.
# '/' has no special meaning in a Python regex, so it does not need escaping.
tag_pattern = re.compile(r'<(?:p|div)>.*?</(?:p|div)>')
print(tag_pattern.findall(html_text))

['<p>frase1</p>', '<p></p>', '<div>frase3</div>']


## Groups and backreferences

'(text)' capturing group: matches the exact text and saves it
 - accepts quantifiers

'(?:)' non-capturing group: the matched text is not saved as a group

'(?P<name>)' named group


### Match at the beginning or at the end

`^` at the beginning

`$` at the end

In [12]:
cpf = '147.852.963-12'

# The ^...$ anchors force the whole string to match: two 'ddd.' blocks,
# three more digits, a dash and two final digits.
cpf_pattern = re.compile(r'^((?:[0-9]{3}\.){2}[0-9]{3}-[0-9]{2})$')
print(cpf_pattern.findall(cpf))

['147.852.963-12']


### Anything other than a set of characters
`[^a-z]` matches any character that is not in a-z

In [13]:
# [^0-9]+ collects every run of non-digit characters.
print([run.group() for run in re.finditer(r'[^0-9]+', cpf)])

['.', '.', '-']


# Shorthands and flags

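`re.I` = `re.IGNORECASE` = ignore case

`re.A` = `re.ASCII` = shorthands such as `\w`, `\d`, `\s` and `\b` match ASCII only instead of full Unicode

`re.M` = `re.MULTILINE` = `^` and `$` also match at the start and end of each line

`re.S` = `re.DOTALL` = `.` also matches the line break

`"\w"` = any word character (letter, digit or underscore) | `"\W"` negation  

`"\d"` = any digit | `"\D"` negation

`"\s"` = any whitespace character | `"\S"` negation

`"\b"` = word boundary | `"\B"` negation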



### border examples

In [14]:
# words that begin with "e": \b anchors the 'e' to the start of a word
starts_with_e = re.compile(r'\b[e]\w+')
print(starts_with_e.findall('english espanish portuguese'))

['english', 'espanish']


In [20]:
# words that end with "e": \b anchors the 'e' to the end of a word
print([
    word.group()
    for word in re.finditer(r'\w+[e]\b', 'english espanish portuguese chinese')
])

['portuguese', 'chinese']


In [22]:
# words that begin with "p" and end with "e": a word boundary on both sides
p_to_e = re.compile(r'\bp\w+e\b')
print(p_to_e.findall('english espanish portuguese'))

['portuguese']


# Lookahead and Lookbehind

A positive lookahead checks that the expected text comes right after the match, without including it in the result

In [43]:
# positive lookahead
ips_connected = '''
ONLINE 192.168.0.1 active
OFFLINE 192.168.0.2 active
ONLINE 192.168.0.3 inactive
OFFLINE 192.168.0.4 active
ONLINE 192.168.0.5 inactive
OFFLINE 192.168.0.6 inactive
'''

# Capture "<status> <ip>" only when the text right after the following
# whitespace is "active"; the lookahead checks it without consuming it.
active_hosts = re.finditer(r'(\w+\s+[\.\d]+)\s(?=active)', ips_connected)
print([host.group(1) for host in active_hosts])

['ONLINE 192.168.0.1', 'OFFLINE 192.168.0.2', 'OFFLINE 192.168.0.4']


In [44]:
# negative lookahead: capture "<status> <ip>" only when the text right after
# the following whitespace is NOT "active"
not_active = re.finditer(r'(\w+\s+[\.\d]+)\s(?!active)', ips_connected)
print([host.group(1) for host in not_active])

['ONLINE 192.168.0.3', 'ONLINE 192.168.0.5', 'OFFLINE 192.168.0.6']


In [48]:
# positive lookbehind: the leading word must end in "ONLINE"
online_rows = re.compile(r'\w+(?<=ONLINE)\s+[\.\d]+\s+\w+')
print(online_rows.findall(ips_connected))

['ONLINE 192.168.0.1 active', 'ONLINE 192.168.0.3 inactive', 'ONLINE 192.168.0.5 inactive']


In [50]:
# negative lookbehind: the leading word must NOT end in "ONLINE"
print([
    row.group()
    for row in re.finditer(r'\w+(?<!ONLINE)\s+[\.\d]+\s+\w+', ips_connected)
])

['OFFLINE 192.168.0.2 active', 'OFFLINE 192.168.0.4 active', 'OFFLINE 192.168.0.6 inactive']


In [49]:
# positive lookbehind combined with positive lookahead: the leading word must
# end in "ONLINE" and the word after the IP must start with "active"
online_and_active = re.compile(r'\w+(?<=ONLINE)\s+[\.\d]+\s+(?=active)\w+')
print(online_and_active.findall(ips_connected))

['ONLINE 192.168.0.1 active']
