# Chapter 7 模式匹配与正则表达式

## 7.1 不用正则表达式来查找文本模式

In [1]:
def isPhoneNumber(text):
    """Return True if text is exactly a phone number shaped like 415-555-4242.

    The expected layout is 12 characters: three decimal digits, a hyphen,
    three decimal digits, a hyphen, and four decimal digits.

    Args:
        text: The string to test.

    Returns:
        True when text matches the layout above, otherwise False.
    """
    if len(text)!=12:
        return False
    # Both separators must be hyphens at fixed positions.
    if text[3]!='-' or text[7]!='-':
        return False
    # Each check runs once, at the top level, instead of being nested
    # inside the loop over the middle digit group.
    digits=text[0:3]+text[4:7]+text[8:12]
    return all(ch.isdecimal() for ch in digits)

In [5]:
print('415-555-4242 is a phone number: ') 
isPhoneNumber('415-555-4242')

415-555-4242 is a phone number: 


True

In [7]:
message='Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'
# Slide a 12-character window across the message and report every window
# that isPhoneNumber() accepts.
for start, _ in enumerate(message):
    chunk=message[start:start+12]
    if not isPhoneNumber(chunk):
        continue
    print('Phone number found: '+chunk)
print('Done')

Phone number found: 415-555-1011
Phone number found: 415-555-9999
Done


## 7.2 用正则表达式查找文本模式

### 7.2.1 创建正则表达式对象

In [2]:
import re

In [11]:
phoneNumRegex=re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

### 7.2.2 匹配Regex对象

In [None]:
mo=phoneNumRegex.search('My number is 415-555-4242')
print('Phone number found: '+mo.group())

### 7.2.3 正则表达式复习

1. 用import re导入正则表达式模块
2. 用re.compile()函数创建一个Regex对象
3. 向Regex对象的search()方法传入想查找的字符串，将返回Match对象
4. 调用Match对象的group()方法，返回实际匹配文本的字符串

## 7.3 用正则表达式匹配更多模式

### 7.3.1 利用括号分组

In [18]:
phoneNumRegex=re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo=phoneNumRegex.search('My number is 415-555-4242.')
mo.group(2)

'555-4242'

In [17]:
mo.groups()

('415', '555-4242')

In [19]:
phoneNumRegex=re.compile(r'(\(\d\d\d\))(\d\d\d-\d\d\d\d)')
mo=phoneNumRegex.search('My phone number is (415)555-4242')
mo.group(1)

'(415)'

In [20]:
mo.group(2)

'555-4242'

### 7.3.2 用管道匹配多个分组

In [21]:
heroRegex=re.compile(r'Batman|Tina Fey')
mo1=heroRegex.search('Batman and Tina Fey.')
mo1.group()

'Batman'

In [22]:
batRegex=re.compile(r'Bat(man|mobile|copter|bat)')
mo=batRegex.search('Batmobile lost a wheel')
mo.group()

'Batmobile'

In [23]:
mo.group(1)

'mobile'

### 7.3.3 用问号实现可选匹配

In [25]:
batRegex=re.compile(r'Bat(wo)?man')
mo1=batRegex.search('The Adventures of Batman.')
mo1.group()

'Batman'

In [27]:
mo2=batRegex.search('The Adventure of Batwoman.')
mo2.group()

'Batwoman'

In [30]:
phoneRegex=re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo1=phoneRegex.search('My number is 415-555-4242.')
mo1.group()

'415-555-4242'

In [32]:
mo2=phoneRegex.search('My number is 555-4242')
mo2.group()

'555-4242'

### 7.3.4 用星号匹配零次或多次

In [33]:
batRegex=re.compile(r'Bat(wo)*man')
mo1=batRegex.search('The Adventure of Batman.')
mo1.group()

'Batman'

In [34]:
mo2=batRegex.search('The Adventure of Batwoman.')
mo2.group()

'Batwoman'

In [36]:
mo3=batRegex.search('The Adventure of Batwowowowoman')
mo3.group()

'Batwowowowoman'

### 7.3.5 用加号匹配一次或多次

In [37]:
batRegex=re.compile(r'Bat(wo)+man')
mo1=batRegex.search('The Adventures of Batwoman')
mo1.group()

'Batwoman'

In [38]:
mo2=batRegex.search('The Adventure of Batwowowowoman')
mo2.group()

'Batwowowowoman'

In [39]:
mo3=batRegex.search('The Adventure of Batman.')
mo3==None

True

### 7.3.6 用花括号匹配特定次数

In [40]:
haRegex=re.compile(r'(Ha){3}')
mo1=haRegex.search('HaHaHa')
mo1.group()

'HaHaHa'

In [41]:
mo2=haRegex.search('Ha')
mo2==None

True

In [43]:
haRegex=re.compile(r'(Ha){1,3}')
mo1=haRegex.search('HaHaHa')
mo1.group()

'HaHaHa'

## 7.4 贪心和非贪心匹配

In [44]:
greedyHaRegex=re.compile(r'(Ha){3,5}')
mo1=greedyHaRegex.search('HaHaHaHaHa')
mo1.group()

'HaHaHaHaHa'

In [45]:
nongreedyHaRegex=re.compile(r'(Ha){3,5}?')
mo2=nongreedyHaRegex.search('HaHaHaHaHa')
mo2.group()

'HaHaHa'

## 7.5 findall()方法

In [48]:
phoneNumRegex=re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo=phoneNumRegex.search('Cell:415-555-9999 Work:212-555-0000')
mo.group()

'415-555-9999'

In [49]:
phoneNumRegex.findall('Cell:415-555-9999 Work:212-555-0000')

['415-555-9999', '212-555-0000']

In [50]:
phoneNumRegex=re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)')
phoneNumRegex.findall('Cell:415-555-9999 Work:212-555-0000')

[('415', '555', '9999'), ('212', '555', '0000')]

## 7.6 字符分类

\d 0-9任何数字
\D 除0-9数字外的其他任何字符
\w 任何字母、数字或下划线字符（可认为是匹配“单词”字符）
\W 除字母、数字和下划线以外的任何字符
\s 空格、制表符或换行符（可认为是匹配“空白”字符）
\S 除空格、制表符或换行符以外的任何字符

In [3]:
xmasRegex=re.compile(r'\d+\s\w+')
xmasRegex.findall('12 drummers, 11 pipers, 10 lords, 9 ladies, 8 maids, 7 swans, 6 geese, 5 rings, 4 birds, 3 hens, 2 doves, 1 partridge')

['12 drummers',
 '11 pipers',
 '10 lords',
 '9 ladies',
 '8 maids',
 '7 swans',
 '6 geese',
 '5 rings',
 '4 birds',
 '3 hens',
 '2 doves',
 '1 partridge']

## 7.7 建立自己的字符分类

In [4]:
vowelRegex=re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('RoboCop eats baby food. BABY FOOD.')

['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o', 'A', 'O', 'O']

In [5]:
constantRegex=re.compile(r'[^aeiouAEIOU]')
constantRegex.findall('RoboCop eats baby food. BABY FOOD.')

['R',
 'b',
 'C',
 'p',
 ' ',
 't',
 's',
 ' ',
 'b',
 'b',
 'y',
 ' ',
 'f',
 'd',
 '.',
 ' ',
 'B',
 'B',
 'Y',
 ' ',
 'F',
 'D',
 '.']

## 7.8 插入字符和美元字符

In [7]:
beginsWithHello=re.compile(r'^Hello')
beginsWithHello.search('Hello world!').group()

'Hello'

In [8]:
beginsWithHello.search('He said hello.')==None

True

In [10]:
wholeStringIsNum=re.compile(r'^\d+$')
wholeStringIsNum.search('1234567890').group()

'1234567890'

In [11]:
wholeStringIsNum.search('12345xyz67890')==None

True

In [12]:
wholeStringIsNum.search('12 34567890')==None

True

## 7.9 通配字符

In [13]:
atRegex=re.compile(r'.at')  #.匹配除换行之外所有字符
atRegex.findall('The cat in the hat sat on the flat mat.')

['cat', 'hat', 'sat', 'lat', 'mat']

### 7.9.1 用点-星匹配所有字符

In [14]:
nameRegex=re.compile(r'First Name: (.*) Last Name:(.*)')
mo=nameRegex.search('First Name: Al Last Name: Sweigart')
mo.group(1)

'Al'

In [15]:
mo.group(2)

' Sweigart'

In [16]:
mo.group()

'First Name: Al Last Name: Sweigart'

In [17]:
nongreedyRegex=re.compile(r'<.*?>')
mo=nongreedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man>'

In [18]:
greedyRegex=re.compile(r'<.*>')
mo=greedyRegex.search('<To serve man> for dinner.>')
mo.group()

'<To serve man> for dinner.>'

### 7.9.2 用句点字符匹配换行

In [3]:
noNewlinRegex=re.compile('.*')
noNewlinRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law').group()

'Serve the public trust.'

In [4]:
newlineRegex=re.compile('.*', re.DOTALL)
newlineRegex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

'Serve the public trust.\nProtect the innocent.\nUphold the law.'

## 7.11 不区分大小写的匹配

In [None]:
regex1=re.compile('RoboCop')
regex2=re.compile('ROBOCOP')
regex3=re.compile('robocop')
regex4=re.compile('RobocOp')

In [5]:
robocop=re.compile(r'robocop',re.I)
robocop.search('RoboCop is part man, part machine, all cop.').group()

'RoboCop'

In [6]:
robocop.search('ROBOCOP protects the innocent.').group()

'ROBOCOP'

In [7]:
robocop.search('A1, why does your programming book talk about robocop so much?').group()

'robocop'

## 7.12 用sub()方法替换字符串

In [9]:
namesRegex=re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

In [13]:
agentNamesRegex=re.compile(r'Agent (\w)\w*')
agentNamesRegex.sub(r'\1****','Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.')

'A**** told C**** that E**** knew B**** was a double agent.'

## 7.13 管理复杂的正则表达式

In [14]:
phoneRegex=re.compile(r'''(
(\d{3}|(\d{3}))?
(\s|-|\.)?
\d{3}
(\s|-|\.)
\d{4}
(\s*(ext|x|ext.)\s*\d{2,5})?
)''',re.VERBOSE)

## 7.14 组合使用re.IGNORECASE、re.DOTALL和re.VERBOSE

In [15]:
someRegexValue=re.compile('foo', re.IGNORECASE|re.DOTALL)

In [16]:
someRegexValue=re.compile('foo',re.IGNORECASE|re.DOTALL|re.VERBOSE)

## 7.15 项目：电话号码和Email地址提取程序

### 7.15.1 为电话号码创建正则表达式

In [6]:
import pyperclip, re

# Phone-number pattern for the extraction project, compiled with re.VERBOSE
# so whitespace and the "# ..." comments inside the pattern are ignored.
# findall() returns one 9-element tuple per match; the index of each group
# is noted on its line. The area-code alternative now uses \d (the original
# "\(d{3}\)" matched the literal letters "ddd"), and the dot in "ext." is
# escaped so it matches only a literal period.
phoneRegex=re.compile(r'''(
(\d{3}|\(\d{3}\))?                # [1] optional area code
(\s|-|\.)?                        # [2] optional separator
(\d{3})                           # [3] first 3 digits
(\s|-|\.)                         # [4] separator
(\d{4})                           # [5] last 4 digits
(\s*(ext|x|ext\.)\s*(\d{2,5}))?   # [6] extension, [7] marker, [8] number
)''',re.VERBOSE)

### 7.15.2 为Email地址创建一个正则表达式

In [7]:
# Email-address pattern compiled with re.VERBOSE (whitespace inside the
# pattern is ignored). Line by line it matches: the username characters,
# the "@" sign, the domain name, and a final dot followed by 2-4 letters.
# The pattern has two capture groups, so findall() returns tuples whose
# element 0 is the whole address and element 1 is the dot-suffix.
emailRegex=re.compile(r'''(
[a-zA-Z0-9._%+-]+
@
[a-zA-Z0-9.-]+
(\.[a-zA-Z]{2,4})
)''',re.VERBOSE)

### 7.15.3 在剪贴板文本中找到所有匹配

In [11]:
# Collect every phone number and email address found in the clipboard text.
text=str(pyperclip.paste())
matches=[]
for groups in phoneRegex.findall(text):
    # Indexes 1, 3 and 5 of each tuple are the area code, first three digits
    # and last four digits; they are re-joined with hyphens.
    phoneNum='-'.join([groups[1],groups[3],groups[5]])
    # Index 8 is the extension number, empty when the number has none.
    # This previously read `group[8]`, an undefined name that raised
    # NameError for any number with an extension.
    if groups[8]!='':
        phoneNum+=' x'+groups[8]
    matches.append(phoneNum)
for groups in emailRegex.findall(text):
    # Index 0 is the full email address.
    matches.append(groups[0])

### 7.15.4 所有匹配连接成一个字符串，复制到剪贴板

In [12]:
# Report the outcome: with no matches print a notice; otherwise put the
# newline-joined matches on the clipboard and echo them to the console.
if not matches:
    print('No phone numbers or email addresses found.')
else:
    joined='\n'.join(matches)
    pyperclip.copy(joined)
    print('Copied to clipboard')
    print(joined)

Copied to clipboard
800-420-7240
415-863-9900
415-863-9950
info@nostarch.com
media@nostarch.com
academic@nostarch.com
info@nostarch.com
