In [2]:
import re

# 正则表达式的基本技巧

## 需要转义的字符

\$\(\)\*\+\.\?\[\\\^\{\|

匹配成$()*+.?[\^{|

## 匹配多个字符之一

创建一个正则表达式来匹配calendar的所有常见的错误拼写形式

In [3]:
s = "calendar celendar calender calander"
pattern = re.compile(r'c[ae]l[ae]nd[ae]r')
re.findall(pattern,s)

['calendar', 'celendar', 'calender', 'calander']

## 匹配一个被单引号包住的字符

In [16]:
s = "I said 'hi' to him, but he didn't know."
pattern1 = re.compile(r"\s'.*?'\s")  #匹配带单引号'的
re.findall(pattern1,s)   

[" 'hi' "]

In [17]:
pattern2 = re.compile(r"\s'(.*?)'\s")
re.findall(pattern2,s)     #匹配单引号内的内容

['hi']

In [19]:
pattern3 = re.compile(r"'\w+'")
re.findall(pattern3,s)

["'hi'"]

## 匹配文本行起始和/或文本行结尾

^alpha

omega$

## 匹配整个单词

创建一个正则表达式来匹配 My cat is brown 中的cat，但是不会匹配到category或者bobcat，再创建一个正则表达式来匹配staccato中的cat，但是不会匹配到上面的三个字符串

In [34]:
s = " My cat is brown. category and bobcat.  staccato"

In [35]:
pattern1 = re.compile(r'\s(cat)\s')
re.findall(pattern1,s)

['cat']

In [36]:
pattern2 = re.compile(r'\bcat\b')  #单词边界
re.findall(pattern2,s)

['cat']

In [43]:
pattern3 = re.compile(r'\w+\Bcat\w+')  #非单词边界
re.findall(pattern3,s)

['staccato']

In [48]:
pattern4 = re.compile(r'\w+\Bcat\w?')
re.findall(pattern4,s)

['bobcat', 'staccato']

## 匹配多个选择分支之一

创建一个正则表达式，当把它重复应用到目标文本 Mary,Jane,and Sue went to Mary's house 之上时，会匹配到  Mary,Jane,Sue以及又一次匹配到 Mary

In [49]:
s = "Mary,Jane,and Sue went to Mary's house"

In [50]:
pattern = re.compile(r'Mary|Jane|Sue')
re.findall(pattern,s)

['Mary', 'Jane', 'Sue', 'Mary']

In [51]:
s = "Her name is Janet"

In [52]:
pattern = re.compile(r'Jane|Janet')
re.findall(pattern,s)

['Jane']

In [53]:
pattern = re.compile(r'Janet|Jane')
re.findall(pattern,s)

['Janet']

In [54]:
pattern = re.compile(r'\bJanet?\b')
re.findall(pattern,s)

['Janet']

## 分组和捕获匹配中的子串

In [59]:
s = "Mary,Jane,Janet and Sue went to Mary's house"

In [61]:
pattern = re.compile(r'Mary|Jane|Sue')
re.findall(pattern,s)

['Mary', 'Jane', 'Jane', 'Sue', 'Mary']

In [62]:
pattern = re.compile(r'\b(Mary|Jane|Sue)\b')  #改进，只匹配整个单词
re.findall(pattern,s)

['Mary', 'Jane', 'Sue', 'Mary']

创建一个正则表达式，使之匹配任意yyyy-mm-dd格式的日期，并且分别捕获年、月和日，不考虑会出现9999-99-99

In [75]:
s = "I was born on 1995-09-12, and my father was born in 1964, and my mother was born in 1966-08-22."

In [77]:
pattern = re.compile(r'\b\d{4}-\d{2}-\d{2}\b')
re.findall(pattern,s)

['1995-09-12', '1966-08-22']

In [78]:
pattern = re.compile(r'\b\d{4}\b')
re.findall(pattern,s)

['1995', '1964', '1966']

In [79]:
pattern = re.compile(r'-(\d{2})-')
re.findall(pattern,s)

['09', '08']

非捕获分组

In [83]:
s = "Mary,Jane,Janet and Sue went to mary's house"

In [84]:
pattern = re.compile(r'\b(?:Mary|Jane|Sue)\b')
re.findall(pattern,s)

['Mary', 'Jane', 'Sue']

In [85]:
pattern = re.compile(r'\b(?i:Mary|Jane|Sue)\b')  #指定模式修饰符，只有在(?i:)之内的正则表达式部分才是不区分大小写的
re.findall(pattern,s)

['Mary', 'Jane', 'Sue', 'mary']

可以使用一个连字号来关闭修饰符，多个修饰符组合，(?ism:group),关闭三个选项，(?-ism:group),(?i-sm:group)会打开不区分大小写（i），并且同时关闭点号匹配换行符（s）和^和$匹配换行处（m）

## 再次匹配先前匹配的文本

创建一个正则表达式来匹配按照yyyy-mm-dd格式的“神奇”日期，一个神奇日期指的是，如果年减去世纪，月份和该月的天数都是相同的数字，比如2008-08-08

In [86]:
s = "I was born on 2008-08-08, he was born on 2007-07-07, and she was born on 2016-05-06"

可以使用反向引用，使用一个反斜杠后跟一个单个数字（1~9）来引用前9个捕获分组

In [89]:
pattern = re.compile(r'\b\d\d(\d\d)-\1-\1\b')
re.findall(pattern,s)

['08', '07']

## 捕获和命名匹配字串

创建一个正则，匹配以yyyy-mm-dd格式的任意日期，并且分别捕获年、月、日，并向每个捕获的文本添加描述性的名称，year,month,day ； 把神奇日期捕获下来，并给它打上标签magic

In [93]:
s = "I was born on 2008-08-08, and my father was born in 1964, and my mother was born in 1966-08-22."

In [94]:
pattern = re.compile(r'\b(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})\b')
re.findall(pattern,s)

[('2008', '08', '08'), ('1966', '08', '22')]

命名反向引用

In [97]:
pattern = re.compile(r'\b\d{2}(?P<magic>\d{2})-(?P=magic)-(?P=magic)\b')
re.findall(pattern,s)

['08']

## 把正则表达式的一部分重复多次

```
创建一个正则来匹配下列种类的数字：
一个googol（一个100位的十进制数）                       \bd{100}\b
一个32位的十六进制整数                              \b[a-f0-9]{1,8}\b
一个32位的十六进制整数，带有一个可选的h后缀                \b[a-f0-9]{1,8}h?\b
一个浮点数，包含可选的整数部分、必需的小数部分和可选的指数部分，每个部分都允许任意多个数字          \b\d*\.\d+(e\d+)?\b
```

## 重复分组

(?:abc){3}  与 abcabcabc 是相同的

In [98]:
s = "I have 12 apples, he has 1234 apples, and she has 123456 apples"

In [99]:
pattern = re.compile(r'\d\d{2}')
re.findall(pattern,s)

['123', '123', '456']

In [100]:
pattern = re.compile(r'(\d\d){2}') #等同于下面
re.findall(pattern,s)

['34', '34']

In [101]:
pattern = re.compile(r'\d\d(\d\d)') 
re.findall(pattern,s)

['34', '34']

In [102]:
pattern = re.compile(r'(?:\d\d){1,3}') 
re.findall(pattern,s)

['12', '1234', '123456']

## 捕获与非捕获的区别举例

```
要在一篇文章中查找"program"和"project"两个单词，正则表达式可表示为/program|project/,也可表示为/pro(gram|ject)/，
但是缓存子匹配(gramject)没有意义，就可以用/pro(?:gram|ject)/进行非捕获性匹配这样既可以简洁匹配又可不缓存无实际意义的字匹配。
```

## 选择最小和最大重复次数

匹配一对XHTML标记<p>和</p>,以及二者之间的所有文本。在标记之间的文本也可以包含其他XHTML标记

<p>.*?</p>

In [12]:
s = '''<p>The very <em>first</em> task is to find the beeginning of a paragraph.</p>
    <p>Then you have to find the end of the paragragh.</p>
    '''

In [13]:
pattern = re.compile(r'<p>(.*?)</p>')
re.findall(pattern,s)

['The very <em>first</em> task is to find the beeginning of a paragraph.',
 'Then you have to find the end of the paragragh.']

## 贪心和懒惰

\b\d+\b 使用的是贪心量词, \b\d+?\b 使用的是懒惰量词

## 检查一个匹配，但不添加到整体匹配中

找出在一对HTML粗体标记之间的任何单词，但是不要把标记包含到正则表达式匹配中，例如，如果目标文本是 My <b>little cat</b> is furry，那么唯一的匹配应当是little cat

In [5]:
s = 'My <b>little cat</b> is furry'

In [6]:
pattern  = re.compile(r'>(.*?)</')   # 我的方法
re.findall(pattern,s)

['little cat']

## 向正则表达式中添加注释--宽松排列

```
\d{4}    #Year
-        #Separator
\d{2}    #Month
-        #Separator
\d{2}    #Day
```

## 电话号码格式转换

将1234567890 转换为 (123)456-7890

In [14]:
s = "My tel number is 1230987654."

In [17]:
pattern  = re.compile(r'\b(\d{3})(\d{3})(\d{4})\b') 
re.sub(pattern,r'(\1)\2-\3',s)

'My tel number is (123)098-7654.'

# compile第二个参数的含义

```
宽松排列 re.VERBOSE 或 re.X
不区分大小写 re.IGNORECASE 或 re.I
点号匹配换行符 re.DOTALL 或 re.S
脱字符和美元符号匹配换行处 re.MULTILINE 或re.M
```

# 其他实例

## 获取匹配文本的一部分

In [18]:
s = "Please visit http://www.regexcookbook.com for more infomation."

In [20]:
pattern = re.compile(r"http://([a-z0-9.-]+)")
re.findall(pattern,s)

['www.regexcookbook.com']

In [21]:
pattern.findall(s)

['www.regexcookbook.com']

## 获取所有匹配的列表

In [22]:
s = "The lucky numbers are 7,13,16,42,65 and 99."

In [24]:
pattern = re.compile(r"\d+")
re.findall(pattern,s)

['7', '13', '16', '42', '65', '99']

## 保留13倍数的数字

In [25]:
s = "1,2,3,4,5,6,7,8,9,11,13,14,17,18,20,24,26,36,39,50,52."

In [27]:
lst = []
for obj in re.finditer(r"\d+",s):
    if int(obj.group()) % 13 == 0:
        lst.append(obj.group())

In [28]:
lst

['13', '26', '39', '52']

In [31]:
lst = []
pattern= re.compile(r"\d+")
for obj in pattern.finditer(s):
    if int(obj.group()) % 13==0:
        lst.append(obj.group())

In [32]:
lst

['13', '26', '39', '52']

## 在另一个匹配中查找匹配

处理字符串<b>2</b>3 4<b>5 6 7</b> 的时候，找到2、5、6、7

In [41]:
s = "<b>2</b>3 4<b>5 6 7</b>"

In [66]:
lst = []
inner = re.compile(r"\d+")
for outer in re.finditer("<b>(.*?)</b>",s):
    lst.extend(inner.findall(outer.group()))    
lst

['2', '5', '6', '7']

(?s)即Singleline(单行模式)。表示更改.的含义，使它与每一个字符匹配（包括换行 符\n）

## (review)替换另一个正则匹配中的所有匹配

假设有一个html文件，其中有不同的段落使用<b>tag被标记为粗体。在每对粗体tag之间，想要把before的所有匹配替换为after。例如当处理字符串
before<b>first before</b> before <b>before before</b> 的时候，最终得到 before<b>first after</b> before <b>after after</b>

In [71]:
s = "before<b>first before</b> before <b>before before</b>"

In [77]:
inner = re.compile(r"before")
def replacewithin(obj):
    return inner.sub("after",obj.group())
result = re.sub("<b>.*?</b>",replacewithin,s)
result

'before<b>first after</b> before <b>after after</b>'

## 拆分字符串

把I like<b>bold</b>and<i>italic</i>fonts 拆分后得到I like、bold、and、italic、fonts

In [81]:
s = "I like<b>bold</b>and<i>italic</i>fonts"

In [83]:
pattern = re.compile(r"\b<.*?>\b")
pattern.split(s)

['I like', 'bold', 'and', 'italic', 'fonts']

In [84]:
pattern = re.compile(r"(\b<.*?>\b)")
pattern.split(s)

['I like', '<b>', 'bold', '</b>', 'and', '<i>', 'italic', '</i>', 'fonts']

## 逐行查找

处理一个字符串数组或一个多行字符串

In [85]:
s = '''Stop all the clocks, cut off the telephone, 
Prevent the dog from barking with a juicy bone,
Silence the pianos and with muffled drum Bring out the coffin, 
let the mourners come.
'''

In [95]:
lines = re.split("\r?\n",s)
pattern = re.compile(r"[A-Z][a-z]*")
for line in lines:
    if re.search(line,s):
        print ("match")
    else:
        print ("not match")

match
match
match
match
match
match


# 合法性验证和格式化

## (review)E-mail地址的合法性验证

In [123]:
s = '''
1029113880@qq.com
jenny_jiacheng@sina.cn
monty_python@163.com
xue.ji@yuyidata.com
.183920@qq.com
1930 @687.com
_jenny_@
'''

In [124]:
pattern =re.compile(r"[a-zA-Z0-9-._]+@[a-zA-Z0-9-_]+\.[a-zA-Z0-9-_]+")
re.findall(pattern,s)

['1029113880@qq.com',
 'jenny_jiacheng@sina.cn',
 'monty_python@163.com',
 'xue.ji@yuyidata.com',
 '.183920@qq.com']

## 北美电话号码的合法性验证和格式化

In [53]:
s = "13524096875  (345)823-7890 234.456.7890  456 123 7890 (123)456 7890 (123)456-7890"

In [54]:
pattern = re.compile(r"\(\d{3}\)\d{3}-\d{4}")
re.findall(pattern,s)

['(345)823-7890', '(123)456-7890']

In [55]:
pattern = re.compile(r"\b(\d{3})(.*?)(\d{3})(.*?)(\d{4})\b")
re.sub(pattern,r"(\1)\3-\5",s)

'(135)240-6875  ((345)823-7890 (234)456-7890  (456)123-7890 ((123)456-7890 ((123)456-7890'

```
北美电话编号基本规则：
区号以2~9开头，第二位0~8，第三位可以是任意数字
第二组中的3位数字，以2~9开头，后面两位可以是任意数字
最后4位可以不加限制的使用任何数字
```

In [56]:
pattern = re.compile(r"\([2-9][0-8]\d\)[2-9]\d{2}-\d{4}")
re.findall(pattern,s)

['(345)823-7890']

## 传统日期格式的合法性验证

In [69]:
s ="12/24/2016  16/16/2110  20/12/2016  09/36/2016  01/01/9999"  #mm/dd/yyyy

In [80]:
pattern = re.compile(r"^(1[0-2]|0?[1-9])/(3[01]|[12][0-9]|0?[1-9])/([0-9]{2}[0-9]{2})")
re.findall(pattern,s)

[('12', '24', '2016')]