# ch07 Pattern Matching with Regular Expressions

In [2]:
import re

In [3]:
>>> phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
>>> mo = phoneNumRegex.search('My number is 415-555-4242.')
>>> print('Phone number found: ' + mo.group())

Phone number found: 415-555-4242


In [4]:
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')

In [5]:
mo = phoneNumRegex.search('My number is 415-555-4242.')

In [6]:
mo.group(1)

'415'

In [8]:
mo.group(2)

'555-4242'

In [7]:
mo.group(0)

'415-555-4242'

In [9]:
mo.group()

'415-555-4242'

In [10]:
mo.groups()

('415', '555-4242')

In [11]:
areaCode, mainNumber = mo.groups()
print areaCode, mainNumber

415 555-4242


In [12]:
phoneNumRegex = re.compile(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)')

In [13]:
mo = phoneNumRegex.search('My phone number is (415) 555-4242.')

In [14]:
mo.group(1)

'(415)'

In [15]:
mo.group(2)

'555-4242'

## Matching Multiple Groups with the Pipe

In [16]:
>>> heroRegex = re.compile (r'Batman|Tina Fey')
>>> mo1 = heroRegex.search('Batman and Tina Fey.')
>>> mo1.group()

'Batman'

In [17]:
>>> mo2 = heroRegex.search('Tina Fey and Batman.')
>>> mo2.group()

'Tina Fey'

In [18]:
>>> batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
>>> mo = batRegex.search('Batmobile lost a wheel')
>>> mo.group()

'Batmobile'

In [19]:
>>> mo.group(1)

'mobile'

In [21]:
mo.group(0)

'Batmobile'

## Optional Matching with the Question Mark

In [23]:
>>> batRegex = re.compile(r'Bat(wo)?man')
>>> mo1 = batRegex.search('The Adventures of Batman')
>>> mo1.group()

'Batman'

In [24]:
>>> mo2 = batRegex.search('The Adventures of Batwoman')
>>> mo2.group()

'Batwoman'

In [25]:
>>> phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
>>> mo1 = phoneRegex.search('My number is 415-555-4242')
>>> mo1.group()

'415-555-4242'

In [26]:
>>> mo2 = phoneRegex.search('My number is 555-4242')
>>> mo2.group()

'555-4242'

## Greedy and Nongreedy Matching

In [27]:
>>> greedyHaRegex = re.compile(r'(Ha){3,5}')
>>> mo1 = greedyHaRegex.search('HaHaHaHaHa')
>>> mo1.group()

'HaHaHaHaHa'

In [28]:
>>> nongreedyHaRegex = re.compile(r'(Ha){3,5}?')
>>> mo2 = nongreedyHaRegex.search('HaHaHaHaHa')
>>> mo2.group()

'HaHaHa'

## Project: Phone Number and Email Address Extractor

### Step 1: Create a Regex for Phone Numbers

In [None]:
import pyperclip, re

phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?    # area code
    (\s|-|\.)?            # separator
    (\d{3})               # first 3 digits
    (\s|-|\.)             # separator
    (\d{4})               # last 4 digits
    (\s*(ext)|x|ext.)\s*(\d{2,5}))?   # extension
    )''', re.VERBOSE)

# TODO: Create email regex.

# TODO: Find maches in clipboard text.

# TODO: Copy results to the clipboard.

### Step 2: Create a Regex for Email Addresses

In [None]:
#! python3
# phoneAndEmail.py - Finds phone numbers and email addresses on the clipboard.
import pyperclip, re

phoneRegex = re.compile(r'''(
--snip--
'''

# TODO: Create email regex.
emailRegex = re.compile(r'''(
    [a-zA-Z0-9._%+-]+    # username
    @                    # @ symbol
    [a-zA-Z0-9.]+        # domain name
    (\.[a-zA-Z]{2,4})    # dot-something
    )''', re.VERBOSE)

# TODO: Find maches in clipboard text.

# TODO: Copy results to the clipboard.

### Step 3: Find All Maches in the Clipboard Text

In [None]:
#! python3
# phoneAndEmail.py - Finds phone numbers and email addresses on the clipboard.

import pyperclip, re

phoneRegex = re.compile(r'''(
--snip--
''')

# Find matches in clipboard text.
text = str(pyperclip.paste())
matches = []
for groups in phoneRegex.findall(text):
    phoneNum = '-'.join([groups[1], groups[3], groups[5]])
    if groups[8] != '':
        phoneNum += ' x' + groups[8]
    matches.append(phoneNum)
for groups in emailRegex.findall(text):
    matches.append(groups[0])
    
# TODO: Copy results to the clipboard.

### Step 4: Join the Matches into a String for the Clipboard

In [None]:
# Copy results to the clipboard.
if len(matches) > 0:
    pyperclip.copy('\n'.join(matches))
    print('Copied to clipboard:')
    print('\n'.join(matches))
else:
    print('No phone numbers or email addresses found.')

### Total

In [52]:
import pyperclip, re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

phoneRegex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?                # area code
    (\s|-|\.)?                        # separator
    (\d{3})                           # first 3 digits
    (\s|-|\.)                         # separator
    (\d{4})                           # last 4 digits
    (\s*(ext|x|ext.)\s*(\d{2,5}))?    # extension
    )''', re.VERBOSE)


# TODO: Create email regex.
emailRegex = re.compile(r'''(
    [a-zA-Z0-9._%+-]+    # username
    @                    # @ symbol
    [a-zA-Z0-9.]+        # domain name
    (\.[a-zA-Z]{2,4})    # dot-something
    )''', re.VERBOSE)

# Find matches in clipboard text.
text = str(pyperclip.paste())
print text
matches = []
for groups in phoneRegex.findall(text):
    phoneNum = '-'.join([groups[1], groups[3], groups[5]])
    if groups[8] != '':
        phoneNum += ' x' + groups[8]
    matches.append(phoneNum)
for groups in emailRegex.findall(text):
    matches.append(groups[0])
    
# Copy results to the clipboard.
if len(matches) > 0:
    pyperclip.copy('\n'.join(matches))
    print('Copied to clipboard:')
    print('\n'.join(matches))
else:
    print('No phone numbers or email addresses found.')

In [53]:
print text

In [54]:
text

'\t\nContact Us\nNo Starch Press, Inc.\n245 8th Street\nSan Francisco, CA 94103 USA\nPhone: 800.420.7240 or +1 415.863.9900 (9 a.m. to 5 p.m., M-F, PST)\nFax: +1 415.863.9950\n\nReach Us by Email\nGeneral inquiries: info@nostarch.com\nMedia requests: media@nostarch.com\nAcademic requests: academic@nostarch.com (Please see this page for academic review requests)\nHelp with your order: help@nostarch.com\nReach Us on Social Media\nTwitter\nFacebook'

In [55]:
matches

['800-420-7240',
 '415-863-9900',
 '415-863-9950',
 'info@nostarch.com',
 'media@nostarch.com',
 'academic@nostarch.com',
 'help@nostarch.com']

#### Ctrl + v를 해보면 이렇게 나옴

    800-420-7240
    415-863-9900
    415-863-9950
    info@nostarch.com
    media@nostarch.com
    academic@nostarch.com
    help@nostarch.com

In [9]:
s = '''800-420-7240
415-863-9900
415-863-9950
info@nostarch.com
media@nostarch.com
academic@nostarch.com
help@nostarch.com'''

In [16]:
print s

800-420-7240
415-863-9900
415-863-9950
info@nostarch.com
media@nostarch.com
academic@nostarch.com
help@nostarch.com


## Practice Questions

#### Q20. How would you write a regex that matches a number with commas for every three digits? It must matche the following:

- '42'
- '1,234'
- '6,368,745'

but not the following:
- '12,34,567' (which has only two digits between the commas)
- '1234' (which lacks commas)

In [30]:
import re

In [31]:
def add_comma_in_digits(s):
    comma_regex = re.compile(r'(?<=\d)(?=(\d\d\d)+(?!\d))')
    return comma_regex.sub(',', s)

In [32]:
add_comma_in_digits('12')

'12'

In [33]:
add_comma_in_digits('123')

'123'

In [34]:
add_comma_in_digits('1234')

'1,234'

In [35]:
add_comma_in_digits('12345')

'12,345'

In [36]:
add_comma_in_digits('123456')

'123,456'

In [37]:
add_comma_in_digits('6368745')

'6,368,745'

### Q21. How would you write a regex that matches the full name of someone whose last name is Nakamoto? You can assume that the first name that comes before it will always be one word that begins with a capital letter. The regex must matche the following:

- 'Satoshi Nakamoto'
- 'Alice Nakamoto'
- Robocop Nakamoto'

but not the following:

- 'satoshi Nakamoto' (where the first name is not capitalized)
- 'Mr. Nakamoto' (where the preceding word has a nonletter character)
- 'nakamoto' (which has no first name)
- 'Satoshi nakamoto' (where Nakamoto is not capitalized)

In [43]:
def match_capital_letter(s):
    if re.search(r'^[A-Z][a-z]+\s+[A-Z][a-z]+', s):
        return s
    raise("didn't match")

In [44]:
match_capital_letter('Satoshi Nakamoto')

'Satoshi Nakamoto'

In [45]:
match_capital_letter('Alice Nakamoto')

'Alice Nakamoto'

In [46]:
match_capital_letter('Robocop Nakamoto')

'Robocop Nakamoto'

In [47]:
match_capital_letter('satoshi Nakamoto')

TypeError: exceptions must be old-style classes or derived from BaseException, not str

In [48]:
match_capital_letter('Mr. Nakamoto')

TypeError: exceptions must be old-style classes or derived from BaseException, not str

In [49]:
match_capital_letter('Nakamoto')

TypeError: exceptions must be old-style classes or derived from BaseException, not str

In [50]:
match_capital_letter('Satoshi nakamoto')

TypeError: exceptions must be old-style classes or derived from BaseException, not str

### Q22. How would you write a regex that matches a sentence where the first word is either Alice, Bob, Carol; the second word is either eats, pets, or throws; the third word is apples, cats, or baseballs; and the sentence ends with a period? This regex should be case-insensitive. It must matche the following:

- 'Alice eats apples.'
- 'Bob pets cats.'
- 'Carol throws baseballs.'
- 'Alice throws Apples.'
- 'BOB EATS CATS.'

but not the following:

- 'Robocop eats apples.'
- 'ALICE THROWS FOOTBALLS.'
- 'Carol eats 7 cats.'

In [51]:
'Alice eats apples.'.split()

['Alice', 'eats', 'apples.']

In [52]:
first = ['alice', 'bob', 'carol']
second = ['eats', 'pets', 'pets']
third = ['apples', 'cats', 'baseballs']

first_regex = '|'.join(first)

In [61]:
first_regex = re.compile(r'|'.join(first), re.I)
second_regex = re.compile(r'|'.join(second), re.I)
third_regex = re.compile(r'|'.join(third), re.I)
four_regex = re.compile(r'\.$')

In [62]:
first_regex

re.compile(r'alice|bob|carol', re.IGNORECASE)

In [58]:
second_regex

re.compile(r'eats|pets|pets', re.IGNORECASE)

In [65]:
all_ = [first, second, third]
all_

[['alice', 'bob', 'carol'],
 ['eats', 'pets', 'pets'],
 ['apples', 'cats', 'baseballs']]

In [66]:
'\s'.join(all_)

TypeError: sequence item 0: expected string, list found

In [69]:
first2 = '|'.join(first)

In [70]:
first2

'alice|bob|carol'

In [71]:
second2 = '|'.join(second)

In [72]:
second2

'eats|pets|pets'

In [73]:
third2 = '|'.join(third)

'apples|cats|baseballs'

In [74]:
third2

'apples|cats|baseballs'

In [84]:
a = [[first2] + [second2] + [third2]]
a

[['alice|bob|carol', 'eats|pets|pets', 'apples|cats|baseballs']]

In [88]:
a

[['alice|bob|carol', 'eats|pets|pets', 'apples|cats|baseballs']]

In [92]:
for i in [first2+second2+third2]:
    print i

alice|bob|caroleats|pets|petsapples|cats|baseballs


In [87]:
a2 = []
for i in a:
    a2.append(i)
    a2.append('\s')
a2

[['alice|bob|carol', 'eats|pets|pets', 'apples|cats|baseballs'], '\\s']

In [80]:
'\s'.join(a)

TypeError: sequence item 0: expected string, list found

In [68]:
all_ = first + second
all_

['alice', 'bob', 'carol', 'eats', 'pets', 'pets']

#### 특정한 상황에 맞는 문장 찾기 Logic

1. 문장을 space 기준으로 분리한다.
2. 각 문장 순서에 맞는 정규식을 작성한다.
3. for문을 돌면서 맞는지 확인한다.


In [139]:
def matches_specificed_sentence(s):
    lsts = s.split()
    first = ['alice', 'bob', 'carol']
    second = ['eats', 'pets', 'throws']
    third = ['apples', 'cats', 'baseballs']
    
    first_regex = re.compile(r'|'.join(first), re.I)
    second_regex = re.compile(r'|'.join(second), re.I)
    third_regex = re.compile(r'|'.join(third), re.I)
    four_regex = re.compile(r'\.$')
    
    for i, word in enumerate(lsts):
        if i == 0 and not first_regex.search(word):
            return False
        if i == 1 and not second_regex.search(word):
            return False
        if i == 2 and not third_regex.search(word):
            return False
        if i == 2 and not four_regex.search(word):
            return False
    return s

In [140]:
ll = ['Alice eats apples.',
'Bob pets cats.',
'Carol throws baseballs.',
'Alice throws Apples.',
'BOB EATS CATS.',]
ll

['Alice eats apples.',
 'Bob pets cats.',
 'Carol throws baseballs.',
 'Alice throws Apples.',
 'BOB EATS CATS.']

In [141]:
for lst in ll:
    print matches_specificed_sentence(lst)

Alice eats apples.
Bob pets cats.
Carol throws baseballs.
Alice throws Apples.
BOB EATS CATS.


In [142]:
ll2 = ['Robocop eats apples.',
'ALICE THROWS FOOTBALLS.',
'Carol eats 7 cats.',]
ll2

['Robocop eats apples.', 'ALICE THROWS FOOTBALLS.', 'Carol eats 7 cats.']

In [143]:
for lst in ll2:
    print matches_specificed_sentence(lst)

False
False
False


## Practice Projects

### Strong Password Detection

1. 최소 8글자
2. 대문자, 소문자 둘 다 포함
3. 최소한 1개의 숫자 포함
4. 최소한 1개의 특수문자 포함

In [144]:
import string

In [146]:
string.lowercase

'abcdefghijklmnopqrstuvwxyz'

In [147]:
string.uppercase

'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [193]:
# \\] 이 부분 조심. re.compile에 넣을 때 r이 없다는 기준으로 2개가 생겼네.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [148]:
text = 'niceGood'

In [177]:
text2 = 'nN3dac##!'

In [179]:
re.search(r'.{8,}', text2)

<_sre.SRE_Match at 0x1122a1370>

In [191]:
def is_strong_password(s):
    # 프로그래밍적으로도 풀 수 있지만 여기는 regex 연습하는 곳이니 regex만으로 문제 해결
    lenth_regex = re.compile(r'.{8,}')
    upper_regex = re.compile(r'[ABCDEFGHIJKLMNOPQRSTUVWXYZ]')
    lower_regex = re.compile(r'[abcdefghijklmnopqrstuvwxyz]')
    digit_regex = re.compile(r'[0123456789]')
    punctuation_regex = re.compile(r'[!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~]')
    if lenth_regex.search(s) and \
        upper_regex.search(s) and \
        lower_regex.search(s) and \
        digit_regex.search(s) and \
        punctuation_regex.search(s):
            return s
    return False

In [192]:
is_strong_password('nN3dac##!')

'nN3dac##!'

In [194]:
is_strong_password('nN3dacjj')

False