# Regular Expression (Regex)

Regular expressions are used to identify whether a **pattern** exists in a given sequence of characters (string) or not.

In [1]:
import re

In [21]:
# Sample sentence used as the search target for the regex examples.
ls = (
    "Computer science, statistics, and maths are all required knowleges "
    "in order to become a data scientist. "
    "Others are also plus for data science"
)

### match  
Looks for the pattern only at the very beginning of the string; returns `None` if the string does not start with it.

In [22]:
# re.match only succeeds when the pattern is found at index 0 of the string,
# so 'maths' (which occurs later) and 'datascientist' (absent) both give None.
result_1, result_2, result_3 = (
    re.match(word, ls) for word in ('maths', 'Computer', 'datascientist')
)

for label, outcome in (
    ("result_1", result_1),
    ("result_2", result_2),
    ("result_3", result_3),
):
    print(label, outcome)

result_1 None
result_2 <_sre.SRE_Match object; span=(0, 8), match='Computer'>
result_3 None


In [23]:
# A Match object is truthy, so this branch runs only when re.match succeeded.
if result_2:
    matched_text = result_2.group()                  # the substring that matched
    first, last = result_2.start(), result_2.end()   # start index and end index
    print(matched_text, first, last, result_2.span())

Computer 0 8 (0, 8)


### search  
Scans through the string and returns the first location where the pattern matches (or `None` if there is no match).

In [24]:
# re.search scans the whole string, so 'maths' is found even though it is
# not at the start; 'datascientist' never occurs and yields None.
result_4, result_5 = (
    re.search(word, ls) for word in ('maths', 'datascientist')
)
print(f"result_4 : {result_4}")
print(f"result_5 : {result_5}")

result_4 : <_sre.SRE_Match object; span=(34, 39), match='maths'>
result_5 : None


In [25]:
# Only inspect the match when the search actually found something.
if result_4:
    bounds = result_4.span()  # (start, end) of the whole match
    print(result_4.group(), *bounds, bounds)

maths 34 39 (34, 39)


### findall  
Finds all non-overlapping matches of the pattern and returns them as a list.

In [27]:
# re.findall collects every non-overlapping occurrence into a list of strings.
result_6 = re.findall('are', ls)
result_7 = re.findall('data', ls)
for tag, hits in (("result_6", result_6), ("result_7", result_7)):
    print(tag, hits, len(hits))

result_6 ['are', 'are'] 2
result_7 ['data', 'data'] 2


### split

split by pattern  
Splits the string at every occurrence of the pattern and returns the pieces as a list.

In [28]:
# Shorter sample sentence for the split example.
ls2 = (
    "Computer science, statistics, and maths are all required knowleges "
    "in order to become a data scientist."
)
# Every 'a' acts as a separator and is dropped from the resulting pieces.
result_8 = re.split(pattern='a', string=ls2)
print("result_8 : ", result_8)

result_8 :  ['Computer science, st', 'tistics, ', 'nd m', 'ths ', 're ', 'll required knowleges in order to become ', ' d', 't', ' scientist.']


### pattern

some frequently used patterns  

string
- `\d` : digits
- `\D` : nondigits
- `\w` : word characters (letters, digits, and underscore `_`)
- `\W` : nonword character
- `\s` : whitespace `[ \t\n\r\f\v]` (space, tab, newline, carriage return, form feed, vertical tab)
- `\S` : nonwhitespace

specifier
- `[]` : set of characters
- `-` : ranging
- `.` : any single character (except a newline, by default)
- `?` : 0 or 1 repetition
- `*` : 0 or more repetition
- `+` : 1 or more repetitions
- `{m,n}` : `m` to `n` times repetition
- `()` : grouping
- `|` : or

**string**

In [29]:
import string

In [30]:
# string.printable holds the characters the `string` module treats as printable
pt = string.printable
pt_size = len(pt)
pt_size, pt

(100,
 '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c')

In [32]:
# show all digits
# Raw string: '\d' in a normal string literal is an invalid escape sequence
# (a warning on modern Python); r'\d' passes the backslash to the regex engine as-is.
digits = re.findall(r'\d', pt)
print(''.join(digits))

0123456789


In [33]:
# show all nondigits
# Raw string avoids the invalid-escape warning that '\D' triggers in a normal literal.
nondigits = re.findall(r'\D', pt)
print(''.join(nondigits))

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	



In [34]:
# show all word characters (letters, digits and underscore '_')
# Raw string avoids the invalid-escape warning that '\w' triggers in a normal literal.
words_ = re.findall(r'\w', pt)
print(''.join(words_))

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_


In [35]:
# show all nonword characters
# Raw string avoids the invalid-escape warning that '\W' triggers in a normal literal.
nonwords = re.findall(r'\W', pt)
print(''.join(nonwords))

!"#$%&'()*+,-./:;<=>?@[\]^`{|}~ 	



In [36]:
# show all whitespace
# Raw string avoids the invalid-escape warning that '\s' triggers in a normal literal.
# The list itself is printed (not joined) so the invisible characters show up as escapes.
whitespace = re.findall(r'\s', pt)
print(whitespace)

[' ', '\t', '\n', '\r', '\x0b', '\x0c']


In [38]:
# show all nonwhitespace
# Raw string avoids the invalid-escape warning that '\S' triggers in a normal literal.
nonwhitespaces = re.findall(r'\S', pt)
print(nonwhitespaces)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


**specifier**

`-` : ranging

In [39]:
# The range 0-9 inside brackets matches every decimal digit character.
numbers = re.findall(pattern='[0-9]', string=pt)
print(*numbers, sep='')

0123456789


In [40]:
# The range a-z inside brackets matches every lowercase ASCII letter.
alpha_low = re.findall(pattern='[a-z]', string=pt)
print(*alpha_low, sep='')

abcdefghijklmnopqrstuvwxyz


In [41]:
# Listing the characters one by one and writing them as a range are equivalent:
# [012345] == [0-5]
for char_class in ('[012345]', '[0-5]'):
    result = re.findall(char_class, pt)
    print(''.join(result))
print()  # blank line after the comparison

012345
012345



In [42]:
# Several ranges can sit side by side in one set: [234789] == [2-47-9]
for char_class in ('[234789]', '[2-47-9]'):
    result = re.findall(char_class, pt)
    print(''.join(result))
print()  # blank line after the comparison

234789
234789



In [43]:
# Single characters and ranges can be mixed freely: [bcde] == [bc-e] == [b-e]
for char_class in ('[bcde]', '[bc-e]', '[b-e]'):
    result = re.findall(char_class, pt)
    print(''.join(result))
print()  # blank line after the comparison

bcde
bcde
bcde



`.` : any one character (except a newline)

In [44]:
# Use `l` for the sample list (as the following cells do) instead of rebinding
# `ls`, which holds the sample sentence string; overwriting it with a list would
# make the earlier match/search/findall cells fail if they were re-run.
l = ["aab", "a0b", "abc"]
for s in l:
    result = re.findall('a.b', s)
    # 'a' + exactly one arbitrary character + 'b'
    print(s, result)

aab ['aab']
a0b ['a0b']
abc []


`?` 0 or 1 repetition

In [45]:
l = ["aab", "a3b", "abc", "accb"]
# 'a', then at most one arbitrary character, then 'b'
pattern = 'a.?b'
for s in l:
    result = re.findall(pattern, s)
    print(s, result)

aab ['aab']
a3b ['a3b']
abc ['ab']
accb []


In [46]:
l = ["aab", "a3b", "a8b", "abc", "accb"]
# 'a', then at most one digit in the range 0-4, then 'b'
pattern = 'a[0-4]?b'
for s in l:
    result = re.findall(pattern, s)
    print(s, result)

aab ['ab']
a3b ['a3b']
a8b []
abc ['ab']
accb []


`*` : 0 or more repetitions

In [47]:
l = ["ac", "abc", "abbbbc", "a3bec"]
# 'a', then any number of 'b' (including none), then 'c'
pattern = 'ab*c'
for s in l:
    result = re.findall(pattern, s)
    print(s, result)

ac ['ac']
abc ['abc']
abbbbc ['abbbbc']
a3bec []


`+` : 1 or more repetitions

In [49]:
l = ["ac", "abc", "abbbbc", "a3bec"]
# 'a', then at least one 'b', then 'c'
pattern = 'ab+c'
for s in l:
    result = re.findall(pattern, s)
    print(s, result)

ac []
abc ['abc']
abbbbc ['abbbbc']
a3bec []


`{m,n}` : m ~ n times repetitions

In [50]:
l = ["ac", "abcasd", "abbc", "abbbc", "abbbbbbc"]
# 'a', then between one and three 'b', then 'c'
pattern = 'ab{1,3}c'
for s in l:
    result = re.findall(pattern, s)
    print(s, result)

ac []
abcasd ['abc']
abbc ['abbc']
abbbc ['abbbc']
abbbbbbc []


`()` : grouping

In [51]:
l = ["aaa5.djfi", "abdddc5", "1abbbbc", "a3.bec"]
# group 1: one or more digits; then a literal '.'; group 2: exactly two lowercase letters.
# With groups in the pattern, findall returns one tuple of captured groups per match.
pattern = '([0-9]+)[.]([a-z]{2})'
for s in l:
    result = re.findall(pattern, s)
    print(s, result)

aaa5.djfi [('5', 'dj')]
abdddc5 []
1abbbbc []
a3.bec [('3', 'be')]


### `re.compile` 

**result = re.match(pattern, string)** is equivalent to ...

```
prog = re.compile(pattern) 
result = prog.match(string)
```

Compiling the pattern once and reusing the resulting object is more efficient when the expression will be used several times in a single program.

### regex example

**re.search(pattern, string)**

In the pattern below, `\.` escapes the dot so that it matches a literal `.` character instead of acting as the regex wildcard `.` (any character).

In [53]:
s = "My email address is abc123deft@abc.com. My secondary email is hellopython@python.com"
# Raw string so that '\.' reaches the regex engine unchanged; in a normal string
# literal '\.' is an invalid escape sequence and triggers a warning.
# Pattern: local part (letters/digits) + '@' + domain + literal '.' + suffix.
email = r"[0-9a-zA-Z]+@[0-9a-z]+\.[0-9a-z]+"
# re.search returns a Match object or None; bool() turns that into True/False.
result = bool(re.search(email, s))
print(result, re.findall(email, s))

True ['abc123deft@abc.com', 'hellopython@python.com']


In [72]:
# email and password
entry_1 = "abc123deft@abc.com."
entry_2 = "!hellopython0147"
# Raw strings keep the backslashes in '\.' and '\S' intact for the regex engine
# (in normal string literals they are invalid escape sequences and trigger a warning).
email = r"[0-9a-zA-Z]+@[0-9a-z]+\.[0-9a-z]+"
pw = r"[\S]+"                  # one or more non-whitespace characters
pw_2 = "*" * len(entry_2)      # mask with as many '*' as the password has characters
# findall extracts just the address, leaving out the trailing '.' of entry_1
print("email : ", re.findall(email, entry_1))
# the whole run of non-whitespace characters is replaced by the mask
print("password : ", re.sub(pw, pw_2, entry_2))
# re.sub(pw, "********", entry_2) # \g<1> : use the data captured by the first group

email :  ['abc123deft@abc.com']
password :  ****************


reference : testing regex  

https://pythex.org/