## Regular Expression Syntax

### `\d` matches any Unicode decimal digit. This includes `[0-9]` and also many other digit characters. If the `re.ASCII` flag is used, only `[0-9]` is matched.
If you need to match an 11-digit phone number, `\d{11}` will help.

In [4]:
# Collect each non-overlapping run of 11 digits found in `text`.
eleven_digits = re.compile(r"\d{11}")
res = eleven_digits.findall(text)
res

['12345678910', '15136985214']

### `re.match` can be used to validate a format

In [8]:
# Validate the overall shape of an email address typed by the user.
# The dot before the domain suffix is escaped so that it matches only a
# literal "."; an unescaped "." matches any character and would accept
# input such as "user@hostXcom".
email = input("email: ")
is_valid = re.match(r"^\w+@\w+\.\w+$", email)
if is_valid:
    print(f"valid email: {email}")
else:
    print("invalid email")

email:  ...


invalid email


## 1. about characters

### fixed string

In [9]:
# A pattern without metacharacters matches itself literally; findall returns
# one entry per non-overlapping occurrence.
text = "wupeiqi hello world wupeiqi"
literal = r"wupeiqi"
res = re.findall(literal, text)
res

['wupeiqi', 'wupeiqi']

### dynamic string: character sets such as `[a-z]`, `[0-9]`

In [11]:
text = "wupeiqia hello world wupeiqib"
# [ai] accepts exactly one character, either "a" or "i", so the pattern can
# match 'wupeiqia' or 'wupeiqii' but not 'wupeiqib'.
res = re.compile(r"wupeiqi[ai]").findall(text)
res

['wupeiqia']

In [12]:
# [a-z] accepts any single lowercase ASCII letter: 'wupeiqia' ... 'wupeiqiz'.
lowercase_suffix = r"wupeiqi[a-z]"
res = re.findall(lowercase_suffix, text)
res

['wupeiqia', 'wupeiqib']

### \d refers to one digit

In [14]:
text = "hello 123"
# Each \d consumes a single digit, so every digit is reported separately.
res = re.compile(r"\d").findall(text)
res

['1', '2', '3']

In [15]:
# Two digits per match; a trailing unpaired digit is left unmatched.
two_digits = re.compile(r"\d\d")
res = two_digits.findall(text)
res

['12']

In [18]:
# Matches do not overlap, so four digits yield two pairs.
sample = "hello 1234"
res = re.findall(r"\d\d", sample)
res

['12', '34']

In [17]:
# {3} repeats the preceding \d exactly three times.
three_digits = r"\d{3}"
res = re.findall(three_digits, text)
res

['123']

In [20]:
# {2,} means two or more digits, taking as many as are available.
res = re.compile(r"\d{2,}").findall(text)
res

['123']

In [24]:
# {2,} means no fewer than two digits after the "t".
samples = "t1234 t576 t43"
res = re.findall(r"t\d{2,}", samples)
res

['t1234', 't576', 't43']

In [26]:
# {2,4} means between two and four digits (inclusive); the match stops
# after four digits, so "t1234564" contributes "t1234".
samples = "t1234564 t576 t43"
res = re.findall(r"t\d{2,4}", samples)
res

['t1234', 't576', 't43']

### match a Chinese mobile phone number precisely

In [29]:
res = re.findall(r"1[358]\d{9}", "1369874520013698745200")  # "1", then one of 3/5/8, then nine more digits (11 in total)
res

['13698745200', '13698745200']

### `?` matches 0 or 1 occurrence of the preceding element

In [33]:
# \d? takes at most one digit, so "t12" only contributes "t1".
sample = "t t1 t12"
res = re.findall(r"t\d?", sample)
res

['t', 't1', 't1']

### `+` matches 1 or more occurrences of the preceding element

In [34]:
# \d+ needs at least one digit, so the bare "t" is not reported.
res = re.compile(r"t\d+").findall("t t1 t12")
res

['t1', 't12']

### `*` matches 0 or more occurrences of the preceding element

In [35]:
# \d* accepts any number of digits, including none.
t_with_digits = r"t\d*"
res = re.findall(t_with_digits, "t t1 t12")
res

['t', 't1', 't12']

### `\w` matches word characters: letters, digits, the underscore, and also Chinese characters, etc.; it does not match spaces

### `\w` includes alphanumeric characters (as defined by `str.isalnum()`) as well as the underscore (`_`). If the `re.ASCII` flag is used, only `[a-zA-Z0-9_]` is matched.
For example, if you want to match an email address, `\w+@\w+\.\w+` will help (the dot is escaped so that it matches a literal `.`).

In [48]:
import re

text = "This message includes two email addresses, hello_world_19@greatwall.com and 123_456@math.com, and two Chinese mobilephone numbers: 12345678910 and 15136985214."
# The dot is escaped so that it matches only a literal "." between the domain
# name and its suffix; a bare "." would match any character.
email_pattern = r"\w+@\w+\.\w+"
res = re.findall(email_pattern, text, re.ASCII)  # re.ASCII excludes Chinese characters
res

['hello_world_19@greatwall.com', '123_456@math.com']

In [36]:
# Without re.ASCII, \w matches Unicode word characters, so both a Chinese
# character and a Latin letter fit between 阿 and 顿.
text = "阿仕顿， 阿s顿"
one_word_char = re.compile(r"阿\w顿")
one_word_char.findall(text)

['阿仕顿', '阿s顿']

In [38]:
# \w{2,10} needs 2 to 10 word characters between 阿 and 顿, so "阿s顿"
# (only one character in between) is not matched.
text = "阿仕ss顿， 阿s顿"
two_to_ten_word_chars = r"阿\w{2,10}顿"
re.findall(two_to_ten_word_chars, text)

['阿仕ss顿']

### `.` matches any character except a newline

In [46]:
text = "阿仕ss 顿， 阿s顿"
# \w does not match the space, so the word-character pattern finds nothing ...
word_only = re.findall(r"阿\w{2,10}顿", text)
print(word_only)

# ... whereas "." accepts the space as well.
re.findall(r"阿.{1,4}顿", text)

[]


['阿仕ss 顿', '阿s顿']

In [47]:
# A backslash-escaped dot matches only a literal "." character.
text = "阿仕ss 顿， 阿.顿"
literal_dot = re.findall(r"阿\.顿", text)
print(literal_dot)

['阿.顿']


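### quantifiers
- `?` 0 or 1 repetition
- `*` 0 or more repetitions
- `+` 1 or more repetitions
- `{n}` exactly n repetitions
- `{n,m}` between n and m repetitions (inclusive)
- `{n,}` at least n repetitions

Note that quantifiers are greedy: they match as much text as possible. If you want one to be non-greedy (match as little as possible), add `?` right after it, e.g. `.+?`.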

In [51]:
# Greedy: .+ extends as far as it can, so the match runs to the last "o".
text = "roorao"
greedy = re.compile(r"r.+o")
greedy.findall(text)

['roorao']

In [52]:
# Non-greedy: .+? stops at the first "o" that lets the pattern succeed.
text = "roorao"
lazy = re.compile(r"r.+?o")
lazy.findall(text)

['roo', 'rao']

## Groups via ()
### 1. extract data

In [54]:
# With one capturing group, findall returns only the group's text
# (here the last six digits), not the whole match.
text = "15332165458 15332165458 "
last_six = re.compile(r"153\d{2}(\d{3}\d{3})")
last_six.findall(text)

['165458', '165458']

In [55]:
# With several groups, findall returns one tuple per match; the outer group
# holds the full number and the inner group its last six digits.
text = "15332165458 15332165458 "
number_and_tail = r"(153\d{2}(\d{3}\d{3}))"
re.findall(number_and_tail, text)

[('15332165458', '165458'), ('15332165458', '165458')]

In [56]:
# Six digits, a captured four-digit group, seven more digits, then a final
# character that is a digit or "X"; only the captured group is returned.
text = "40000019991231000X"
four_digit_group = re.compile(r"\d{6}(\d{4})\d{7}[\dX]")
four_digit_group.findall(text)

['1999']

### 2. `()` can be combined with `|` to match one of several alternatives

In [58]:
# Inside the group, "|" allows either four digits or "abc" followed by
# one digit.
text = "40000019991231000X, 400000abc31231000X"
digits_or_abc = r"\d{6}(\d{4}|abc\d)\d{7}[\dX]"
re.findall(digits_or_abc, text)

['1999', 'abc3']

### `^` Matches the start of the string; `$` Matches the end of the string

In [60]:
text = "123"
text1 = "a123"
# ^ and $ anchor the pattern to both ends, so the whole string must be
# exactly three digits.
whole_string_three_digits = re.compile(r"^\d{3}$")
print(whole_string_three_digits.findall(text))
print(whole_string_three_digits.findall(text1))

['123']
[]


## `re.match` matches only at the beginning of the string and returns the first result (or `None` if there is no match)

In [70]:
text = "123"
text1 = "a123"
text2 = "123b"
# re.match only tries the pattern at the start of each string.
three_digits = re.compile(r"\d{3}")
res = three_digits.match(text)
res1 = three_digits.match(text1)  # the string starts with "a" -> None
res2 = re.compile(r"\d{3}$").match(text2)  # "b" follows the digits -> None
# Print only the attempts that produced a match.
for found in (res, res1, res2):
    if found is not None:
        print(found.group())

123


## `re.search`  searches in the whole text and returns the first result

In [73]:
text = "123"
text1 = "a123"
text2 = "123b"
# re.search scans the whole string and reports the first place the
# pattern matches.
three_digits = re.compile(r"\d{3}")
res = three_digits.search(text)
res1 = three_digits.search(text1)
res2 = re.compile(r"\d{3}$").search(text2)  # "b" at the end blocks the $ anchor -> None
if res is not None:
    print(f'{text}', res.group())
if res1 is not None:
    print(f'{text1}', res1.group())
if res2 is not None:
    print(f'{text2}', res2.group())

123 123
a123 123


## `re.split` works like `str.split()`, but splits on a regular-expression pattern

In [74]:
text = "123, 345"
text1 = "a123, dd, aa.gg"

# Split on a comma only, then on either a comma or a literal dot.
comma = re.compile(r"[,]")
comma_or_dot = re.compile(r"[,\.]")
res = comma.split(text)
res1 = comma_or_dot.split(text1)
print(res, res1)

['123', ' 345'] ['a123', ' dd', ' aa', 'gg']


In [77]:
# Scratch arithmetic. NOTE(review): the meaning of these constants is not evident from this file.
15.68*1.791

28.08288

In [78]:
# Scratch arithmetic. NOTE(review): the meaning of these constants is not evident from this file.
15.68*0.491

7.69888

In [85]:
# Scratch arithmetic. NOTE(review): the meaning of these constants is not evident from this file.
7.2*(1-0.6560)

2.4768

In [86]:
# Scratch arithmetic. NOTE(review): the meaning of these constants is not evident from this file.
7.2*0.297

2.1384

In [8]:
import re

pt_dir = '2023-05-25-11-21-57-BCE-guide-10-0.1-0.4-upd-50-200'
# Earlier attempt, kept for reference:
# re.findall(r"(CE|CE[-_]BCE|BCE|BCE[-_]BCE|MAE|BCE[-_]guide|MAE[-_]BCE)", pt_dir)
# Pull every occurrence of CE, guide, BCE or MAE (case-insensitive) out of
# the `pt_dir` string, in order of appearance.
token_pattern = r"(CE|guide|BCE|MAE)"
re.findall(token_pattern, pt_dir, re.IGNORECASE)

['BCE', 'guide']

In [11]:
file_ls = ['zzzzzzzzzzzzzzzzz', 'yyyyyyyyyyyyy']
# list() makes a shallow copy, so popping from the copy leaves file_ls intact.
f_ls = list(file_ls)
print(f_ls.pop(0))
print(f_ls.pop(0))
print(file_ls)

zzzzzzzzzzzzzzzzz
yyyyyyyyyyyyy
['zzzzzzzzzzzzzzzzz', 'yyyyyyyyyyyyy']
