In [1]:
import re

* `findall`	Returns a list containing all matches
* `search`	Returns a Match object if there is a match anywhere in the string
* `split`	Returns a list where the string has been split at each match
* `sub`	    Replaces one or many matches with a string

In [31]:
# Sample text reused by the search / findall / split / sub examples that follow.
txt = "The rain in Spain"

**search**

In [32]:
# ^ and $ anchor the pattern to the start and end of the text, so this only
# matches when txt begins with "The" and ends with "Spain".
re.search("^The.*Spain$", txt)

<re.Match object; span=(0, 17), match='The rain in Spain'>

In [33]:
# Find the first whitespace character. The pattern is a raw string because "\s"
# in a plain literal is an invalid escape sequence (SyntaxWarning on Python 3.12+).
re.search(r"\s", txt)

<re.Match object; span=(3, 4), match=' '>

In [53]:
# search() stops at the first occurrence (the "ai" in "rain"), even though the
# same text appears again in "Spain".
re.search("ai", txt)

<re.Match object; span=(5, 7), match='ai'>

In [56]:
# \b anchors at a word boundary, so this finds the first word that starts with "S".
re.search(r"\bS\w+", txt)

<re.Match object; span=(12, 17), match='Spain'>

In [102]:
# Slicing txt with the span reported by the match reproduces the matched text.
txt[12:17]

'Spain'

**span-start-end**

In [62]:
# \b is a zero-width word boundary: it matches the empty string at the start or
# end of a word, so r"\bS\w+" finds a word that begins with "S".
# span() returns the (start, end) offsets of the match within txt.

re.search(r"\bS\w+", txt).span()

(12, 17)

In [58]:
# Search once and read both offsets from the same Match object instead of
# running the identical regex twice.
s_match = re.search(r"\bS\w+", txt)
s_match.start(), s_match.end()

(12, 17)

**findall**

In [35]:
# findall() returns every non-overlapping occurrence as a list of strings.
re.findall("ai", txt)

['ai', 'ai']

In [11]:
# With no occurrence at all, findall() returns an empty list rather than None.
re.findall("Portugal", txt)

[]

In [174]:
t = "a-a------abbccA_"
# \w* may match zero characters, so findall() also reports an empty string at
# every "-" and at the end of t. Raw string: "\w" in a plain literal is an
# invalid escape sequence (SyntaxWarning on Python 3.12+).
re.findall(r"\w*", t)

['a', '', 'a', '', '', '', '', '', '', 'abbccA_', '']

**split**

In [40]:
# Split on every whitespace character; for this text the result equals txt.split().
# Raw string: "\s" in a plain literal is an invalid escape sequence.
re.split(r"\s", txt)

['The', 'rain', 'in', 'Spain']

In [45]:
# maxsplit=1 stops after the first split. It is passed by keyword because
# positional maxsplit is deprecated since Python 3.13; the pattern is a raw
# string so "\s" is not treated as an (invalid) string escape.
re.split(r"\s", txt, maxsplit=1)

['The', 'rain in Spain']

**sub**

In [51]:
# Replace every whitespace character with "_". Raw string: "\s" in a plain
# literal is an invalid escape sequence (SyntaxWarning on Python 3.12+).
re.sub(r"\s", "_", txt)

'The_rain_in_Spain'

In [52]:
# count=2 replaces only the first two whitespace characters. It is passed by
# keyword because positional count is deprecated since Python 3.13; the pattern
# is a raw string so "\s" is not treated as an (invalid) string escape.
re.sub(r"\s", "*", txt, count=2)

'The*rain*in Spain'

**string**

In [68]:
# Match.string is the full text that was passed to search(), not the matched part.
x = re.search(r"\bS\w+", txt)
print(x.string)

The rain in Spain


**group**

In [115]:
# First run of word characters that ends in "n" right before a word boundary;
# group() returns the matched text.
re.search(r"\w+n\b", txt).group()

'rain'

In [113]:
# \s? makes the leading whitespace optional. The leftmost hit is the "in"
# inside "rain", so the result carries no leading space.
re.search(r"\s?in", txt).group()

'in'

In [112]:
# $ pins the match to the end of the text, so only the last word can qualify.
re.search(r"\w+n$", txt).group()

'Spain'

**compile & match**

In [119]:
# compile() builds a reusable pattern object; match() only succeeds when the
# pattern matches at the very start of the string.
h = re.compile('hello')
h.match('hello world')

<re.Match object; span=(0, 5), match='hello'>

In [123]:
# Module-level shortcut: passing the pattern string straight to re.match()
# gives the same result as compiling it first.
re.match('hello', 'hello world')

<re.Match object; span=(0, 5), match='hello'>

In [140]:
# Two capital letters, an optional separator (spaces and/or one underscore),
# then two digits -- each part captured in its own group.
a = "AB01"
m = re.compile(r"([A-Z]{2})(\s?_?\s?)([0-9]{2})")  # raw string keeps "\s" intact
g = m.match(a)
if g is not None:
    # group() and group(0) both return the entire match; groups() yields the
    # captured groups 1..3 in order.
    for label, piece in enumerate((g.group(), g.group(0), *g.groups()), start=1):
        print(label, piece)

1 AB01
2 AB01
3 AB
4 
5 01


In [160]:
# Three newline-separated "tweets": plain text, a link, and a user mention.
txt = (
    'Strider is a sick little puppy \n'
    'http://apps.facebook.com/dogbook/profile/view/5248435\n'
    '@mangaaa I hope they will increase the capacity fast, yesterday was such a pain.'
)

In [161]:
# Greedy ".+" extends to the last "k" followed by a word boundary on the first
# line; "." does not match "\n", so the match cannot run into the next line.
re.match(r"\bS.+k\b",txt)

<re.Match object; span=(0, 17), match='Strider is a sick'>

In [162]:
pattern = r"\bS.+k\b"
w = re.compile(pattern)
# Match once and reuse the Match object instead of re-running the regex for
# each attribute that is displayed.
w_match = w.match(txt)
print(w_match.group())
print(w_match.span())

Strider is a sick
(0, 17)


In [163]:
# Slicing txt with the span printed above reproduces the matched text.
txt[0:17]

'Strider is a sick'

In [164]:
# Pull links and user mentions out of each line ("tweet") of txt.
# Patterns are raw strings ("\S" / "\w" are invalid escapes in plain literals),
# and each findall() runs once per line instead of once to test and once to print.
for tweet in txt.split("\n"):
    # http links: "http" followed by a run of non-whitespace characters
    links = re.findall(r"http\S+", tweet)
    if links:
        print(links)

    # user mentions: "@" followed by word characters
    mentions = re.findall(r"@\w+", tweet)
    if mentions:
        print(mentions)

['http://apps.facebook.com/dogbook/profile/view/5248435']
['@mangaaa']


**find dates**

In [165]:
# Three sample tweets used by the date-finding examples.
sentiment_analysis = [
    "I would like to apologize for the repeated Video Games Live related tweets. 32 minutes ago",
    "@zaydia but i cant figure out how to get there / back / pay for a hotel 1stY0_  2019",
    "FML: So much for seniority, bc of technological ineptness 23rd June 2018 17:54",
]

In [171]:
# Date pattern: a 1-2 digit day with an optional suffix ("23rd"), one word
# (the month) and a 4-digit year, each separated by a single whitespace character.
# [a-zA-Z0-9_] is the ASCII-only equivalent of \w. The pattern is a raw string
# because "\d", "\w" and "\s" are invalid escapes in a plain literal.
for date in sentiment_analysis:  # each item is one tweet
    print(re.findall(r"\d{1,2}\w*\s[a-zA-Z0-9_]+\s\d{4}", date))

[]
[]
['23rd June 2018']


In [172]:
# Join the tweets into one string and search it in a single pass.
# Raw string: "\d", "\w" and "\s" are invalid escapes in a plain literal.
re.findall(r"\d{1,2}\w*\s\w+\s\d{4}", ' '.join(sentiment_analysis))

['23rd June 2018']

In [173]:
# Same date pattern extended with a trailing HH:MM time.
# Raw string: "\d", "\w" and "\s" are invalid escapes in a plain literal.
re.findall(r"\d{1,2}\w*\s\w+\s\d{4}\s\d{2}:\d{2}", ' '.join(sentiment_analysis))

['23rd June 2018 17:54']

**email**

In [182]:
emails = ['n.john.smith@gmail.com', '87victory@hotmail.com', '!#mary-=@msca.com']
# Raw string so "\W", "\w" and "\." reach the regex engine untouched.
# NOTE(review): \W admits every non-word character (spaces, "@", ...) in the
# local part -- tighten the class if only specific symbols should be allowed.
regex = r"[a-zA-Z.0-9\W]+@\w+\.com"


def is_valid_email(email):
    """Return True when the whole of `email` matches `regex`.

    fullmatch() is used instead of match() so that text after ".com"
    (e.g. "a@b.community") is rejected rather than silently ignored.
    """
    return re.fullmatch(regex, email) is not None


for email in emails:
    if is_valid_email(email):
        print("{} is VALID".format(email))
    else:
        print("{} is INVALID".format(email))

n.john.smith@gmail.com is VALID
87victory@hotmail.com is VALID
!#mary-=@msca.com is VALID


**passwords**
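* Passwords are valid when they contain at least one uppercase letter, one lowercase letter, one digit and one of the symbols `#$%!@&.`; the last pattern additionally targets a length of 8–20 characters. Anything else is invalid.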

In [186]:
from string import punctuation as punc
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [253]:
passwords = ['Apple34!.', 'Myhou52#$', '1qaz.2WSX@','@secreT2020','Abc123','aK2.','Aa0?']
# One pattern per rule; a password must satisfy all of them: an uppercase
# letter, a lowercase letter, a digit, one of the allowed symbols, and an
# overall length of 8-20 characters drawn only from the allowed set.
# The last pattern is anchored with \A...\Z: left unanchored, re.search() accepts
# any password that merely contains 8 allowed characters in a row, so neither
# the 20-character limit nor the character restriction would be enforced.
regex = ["[A-Z]+", "[a-z]+", "[0-9]+", "[#$%!@&.]+", r"\A[a-zA-Z0-9*#$%!@&.]{8,20}\Z"]

g,r,b='\033[92m', '\033[91m','\033[94m'  # ANSI colour codes: green, red, blue


def is_valid_password(password):
    """Return True when `password` satisfies every pattern in `regex`."""
    return all(re.search(rule, password) for rule in regex)


for password in passwords:
    if is_valid_password(password):
        print(f"{g}{password} is a VALID")
    else:
        print(f"{r}{password} is INVALID")

[92mApple34!. is a VALID
[92mMyhou52#$ is a VALID
[92m1qaz.2WSX@ is a VALID
[92m@secreT2020 is a VALID
[91mAbc123 is INVALID
[91maK2. is INVALID
[91mAa0? is INVALID


**lazy & greedy**
* The ? makes the + "lazy" instead of "greedy". This means it tries to match as few times as possible, instead of trying to match as many times as possible.

In [272]:
# Sample sentence with two numbers and two parenthesised asides, used to
# contrast greedy and lazy quantifiers.
sentiment_analysis = (
    "Was intending to finish editing my 53888886-page novel manuscript tonight,"
    "(but I forget the name of the site.) And only 12 pages are left. (I'm crying)"
)

In [273]:
# Lazy: "+?" takes as few digits as possible, so every digit is its own match.
# Raw string: "\d" in a plain literal is an invalid escape sequence.
re.findall(r"\d+?", sentiment_analysis)

['5', '3', '8', '8', '8', '8', '8', '6', '1', '2']

In [277]:
# Greedy: "+" takes as many digits as possible, so whole numbers are returned.
# Raw string: "\d" in a plain literal is an invalid escape sequence.
re.findall(r"\d+", sentiment_analysis)

['53888886', '12']

In [278]:
# Greedy: ".+" runs to the last ")" in the text, so everything from the first
# "(" to the final ")" comes back as one match.
# Raw string: "\(" and "\)" are invalid escapes in a plain literal.
re.findall(r"\(.+\)", sentiment_analysis)

["(but I forget the name of the site.) And only 12 pages are left. (I'm crying)"]

In [279]:
# Lazy: ".+?" stops at the first ")" it can, so each parenthesised aside is
# returned separately.
# Raw string: "\(" and "\)" are invalid escapes in a plain literal.
re.findall(r"\(.+?\)", sentiment_analysis)

['(but I forget the name of the site.)', "(I'm crying)"]

**eliminate HTML tags**
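* The ? makes the + "lazy" instead of "greedy". This means it tries to match as few times as possible, instead of trying to match as many times as possible. For HTML, the greedy `<.+>` runs from the first `<` to the last `>` and so also removes the text between the tags, while the lazy `<.+?>` stops at the first `>` and removes only the tags themselves.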

In [256]:
# Sample sentence containing an HTML tag pair to strip out in the next cells.
string = 'I want to see that <strong>amazing show</strong> again!'
string

'I want to see that <strong>amazing show</strong> again!'

In [261]:
# ".+" spans from the first "<" to the last ">", so the text between the two
# tags is removed together with the tags.
re.sub("<.+>", "", string) # greedy

'I want to see that  again!'

In [263]:
# ".+?" stops at the first ">" after each "<", so only the tags themselves are
# removed and the text between them is kept.
re.sub("<.+?>", "", string) # lazy

'I want to see that amazing show again!'