# Automate the Boring Stuff with Python
# Chapter 07 Pattern Matching with Regular Expressions

In [46]:
# Why We Need to Use Raw Strings
# In a raw string the backslash is kept as a literal character, so the regex
# parser receives the escape sequence itself rather than the character it names.
print(r'\n')    # prints the two characters \ and n, which the regex parser can then interpret
print('\n')     # prints a newline character (followed by print's own line ending)

\n




In [14]:
# Finding Patterns of Text without Regular Expression
# Will involve A LOT of if-else statements.

def isPhoneNumber(text):
    """Return True when text looks exactly like DDD-DDD-DDDD, else False.

    The input must be 12 items long, positions 3 and 7 must be hyphens, and
    every other position must satisfy str.isdecimal(). Positions are checked
    left to right and the first failure ends the check.
    """
    if len(text) != 12:
        return False
    for position in range(12):
        if position in (3, 7):
            # Separator slots between the three digit runs.
            if text[position] != '-':
                return False
        elif not text[position].isdecimal():
            return False
    return True

# Sanity check with a well-formed number; the recorded output below is True.
print(isPhoneNumber('415-555-4242'))

True


In [15]:
message = 'Call me at 415-555-1911 tomorrow. 415-555-9999 is my office.'
# Slide a 12-character window over the message, one start index at a time,
# and report every window that isPhoneNumber() accepts.
for i, _ in enumerate(message):
    chunk = message[i:i + 12]
    if not isPhoneNumber(chunk):
        continue
    print('Phone number found: ' + chunk)
print('done')

Phone number found: 415-555-1911
Phone number found: 415-555-9999
done


In [16]:
# Find Patterns of Text with Regular Expressions

import re

In [17]:
# Compile the pattern once: three digits, hyphen, three digits, hyphen, four digits.
phoneNumRegex = re.compile(r'\d{3}-\d{3}-\d{4}') 
# search() scans the string and returns a Match object for the first occurrence.
mo = phoneNumRegex.search(message)
# group(0) is the full text of the match.
print('Phone number found: ' + mo.group(0))

Phone number found: 415-555-1911


In [18]:
# Grouping with Parentheses
# Each parenthesised part of the pattern becomes a numbered group:
# group 1 is the area code, group 2 is the rest of the number.
phoneNumRegex = re.compile(r'(\d{3})-(\d{3}-\d{4})') 
mo = phoneNumRegex.search(message)
print('Area code found: ' + mo.group(1))
print('Number found: ' + mo.group(2))

Area code found: 415
Number found: 555-1911


In [19]:
# Difference between group(), group(0) and groups()
# (mo is the Match object from the grouped phone-number search.)

print(mo.group())     # no argument: the whole match
print(mo.group(0))    # group 0: also the whole match
print(mo.groups())    # tuple holding the text of every parenthesised group

415-555-1911
415-555-1911
('415', '555-1911')


In [20]:
# search() reports only the earliest match in the string, so whichever
# alternative appears first in the text is the one returned.
# (The pipe character | means "match either side".)
heroRegex = re.compile(r'Batman|Tina Fey')
batman_first = 'Batman and Tina Fey'
tina_first = 'Tina Fey and Batman'

for sentence in (batman_first, tina_first):
    first_hit = heroRegex.search(sentence)
    print(first_hit.group())

Batman
Tina Fey


In [21]:
# The .findall() Method
# Unlike search(), findall() returns every non-overlapping match in the string.

message = 'Call me at 415-555-1911 tomorrow. 415-555-9999 is my office.'
phoneNumRegex = re.compile(r'\d{3}-\d{3}-\d{4}') 
# The pattern has no groups, so the result is a list of matched strings
# (note: despite its name, mo is a list here, not a Match object).
mo = phoneNumRegex.findall(message)

print(mo)

['415-555-1911', '415-555-9999']


In [22]:
# Matching New Lines with the Dot Character

sample_lines = 'Line One. \n Line Two. \n Line Three. \n '

# By default the dot does not match a newline, so .* stops at the first one.
noNewLineRegex = re.compile(r'.*')
# With re.DOTALL the dot also matches newline characters.
newLineRegex = re.compile(r'.*', re.DOTALL)

print(noNewLineRegex.search(sample_lines).group())

print('==============')

print(newLineRegex.search(sample_lines).group())

Line One. 
==============
Line One. 
 Line Two. 
 Line Three. 
 


In [23]:
# Case-insensitive matching: the pattern matches upper- and lower-case letters alike.

robocop = re.compile(r'robocop', re.IGNORECASE)  # re.I is the short alias for re.IGNORECASE

robocop_sentences = (
    'RoboCop is part man, part machine, all cop.',
    'ROBOCOP protects the innocent',
    'Al, why does your programming book talk about robocop so much.',
)

# group() returns the text exactly as it appears in the input, original casing included.
for sentence in robocop_sentences:
    print(robocop.search(sentence).group())

RoboCop
ROBOCOP
robocop


In [24]:
# Substituting Strings with the .sub() Method
# sub(replacement, string) returns a new string in which every match of the
# pattern is replaced; here "Agent" plus the following word becomes 'CENSORED'.

namesRegex = re.compile(r'Agent \w+')
namesRegex.sub('CENSORED', 'Agent Alice gave the secret documents to Agent Bob.')

'CENSORED gave the secret documents to CENSORED.'

In [25]:
# More Discrete Substitution
# In the replacement string, \1 and \2 insert the text captured by groups 1 and 2.

# Replace each whole "Agent <name>" match with the name's first letter (group 1)
# followed by asterisks. Only the last expression's value appears in the
# recorded output, so this result is not shown.
namesRegex = re.compile(r'Agent (\w)\w+')
namesRegex.sub(r'\1****', 'Agent Alice gave the secret documents to Agent Bob.')

# Keep "Agent" plus the first letter (group 1) and the last letter (group 2);
# the letters in between are replaced by two asterisks.
namesRegex = re.compile(r'(Agent \w)\w+(\w)')
namesRegex.sub(r'\1**\2', 'Agent Alice gave the secret documents to Agent Bob.')

'Agent A**e gave the secret documents to Agent B**b.'

In [42]:
# Managing Complex Regexes 

# Verbose-mode phone pattern: whitespace and #-comments inside the string are
# ignored by the regex engine. The pattern keeps seven groups, with the
# extension keyword (group 7) nested inside the whole extension (group 6), so
# code that indexes or joins findall() tuples sees the same layout.
phoneRegex = re.compile(r'''
    (\d{3}|\(\d{3}\))                # area code, optionally wrapped in parentheses
    (\s|-|\.)                        # separator
    (\d{3})                          # first 3 digits
    (\s|-|\.)                        # separator
    (\d{4})                          # last 4 digits
    (\s*(ext\.|ext|x)\s*\d{2,5})?    # optional extension: ext. / ext / x, then 2-5 digits
    ''', re.VERBOSE)

print( phoneRegex.search(message).group() )

415-555-1911


In [38]:
# Dummy Data for the following parts

text = '''The 2020 Summer Olympics (サマーオリンピック) is an upcoming international multi-sport event scheduled to take place from 23 July 2020 to 8 August 2021 in Tokyo, Japan. Originally due to take place from 24 July 2020 to 9 August 2020, the event was postponed in March 2020 as a result of the COVID-19 pandemic. Despite being rescheduled for 2021, the Games have retained the "Tokyo 2020" name for marketing and branding purposes. This marks the first time that the Olympic Games have been postponed rather than cancelled altogether. 

Tokyo was selected as the host city during the 125th IOC Session in Buenos Aires, Argentina, on 7 September 2013. The 2020 Games will mark the second time that Japan—and specifically Tokyo—has hosted the Summer Olympic Games, the first being in 1964, making it the first city in Asia to host the Summer Olympics twice. Overall, these will be the fourth Olympic Games to be held in Japan, which also hosted the Winter Olympics in 1972 (Sapporo) and 1998 (Nagano). The 2020 Games will also be the second of three consecutive Olympics to be held in East Asia, the first being in Pyeongchang County, South Korea in 2018, and the next in Beijing, China in 2022. 

For enquiries, call 山田太郎 at 050-234-5678 or 070-765-4321 or email at Yamada.Taro_2020@oly2020.co.jp for more details! ありがとうございます！'''

In [31]:
# Matches e-mail addresses under the .co.jp domain. The whole address is a
# single group, so findall() returns a list of plain strings.
emailRegex = re.compile(r'''(
    [\w.%+-]+       # local part: word characters plus . % + -
    @               # at sign
    [\w.-]+         # domain labels (hyphens allowed)
    \.co\.jp        # required .co.jp suffix
    )''', re.IGNORECASE | re.VERBOSE)

print( emailRegex.search(text).group() )

Yamada.Taro_2020@oly2020.co.jp


In [32]:
# Show the class of the compiled pattern object.
print(type(phoneRegex))

<class '_sre.SRE_Pattern'>


In [44]:
# Each findall() result is a tuple of the pattern's groups. Join only the first
# six: group 7 (the extension keyword) is nested inside group 6, so joining the
# whole tuple would repeat that text whenever an extension is present.
for number in phoneRegex.findall(text):
    print(''.join(number[:6]))

050-234-5678
070-765-4321


In [45]:
# The e-mail pattern has a single group, so findall() yields plain strings.
found_emails = emailRegex.findall(text)
for email in found_emails:
    print(email)

Yamada.Taro_2020@oly2020.co.jp


In [50]:
# findall() returns a list holding one tuple of groups per match.
output = phoneRegex.findall(text)

print(type(output))

# Join groups 1-6 only: group 7 (the extension keyword) is nested inside
# group 6, so joining the full tuple would repeat it when an extension matches.
for num in output:
    print(''.join(num[:6]))

<class 'list'>
050-234-5678
070-765-4321


In [51]:
# search() returns a single Match object for the first phone number in text.
output = phoneRegex.search(text)

print(type(output))

# group() with no argument gives the full matched text.
print(output.group() )

<class '_sre.SRE_Match'>
050-234-5678


1. What is the function that creates Regex objects?

    - re.compile(regex)

2. Why are raw strings often used when creating Regex objects?

    - Regex uses a lot of backslashes, which would need to be escaped in order to be read properly, but we can bypass that using raw strings.

3. What does the search() method return?

    - it returns a Match object.

4. How do you get the actual strings that match the pattern from a Match object?

    - You call match.group(), which returns the matched text as a string, or match.groups(), which returns a tuple containing the text of every captured group.

5. In the regex created from r'(\d\d\d)-(\d\d\d-\d\d\d\d)', what does group 0 cover? Group 1? Group 2?

    - group 0 covers the entire match; group 1 covers the first set of parentheses (\d\d\d), and group 2 covers the second set (\d\d\d-\d\d\d\d)

6. Parentheses and periods have specific meanings in regular expression syntax. How would you specify that you want a regex to match actual parentheses and period characters?

    - you use escape characters, such as \( and \.

7. The findall() method returns a list of strings or a list of tuples of strings. What makes it return one or the other?

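    - it depends on whether the regex contains groups: with no groups (or exactly one group) findall() returns a list of strings; with two or more groups it returns a list of tuples, one string per group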

8. What does the | character signify in regular expressions?

    - the pipe signifies OR

9. What two things does the ? character signify in regular expressions?

    - it signifies non-greedy option when used after star or plus
    - it signifies optional item when used after other characters

10. What is the difference between the + and * characters in regular expressions?

    - + is one or more, while * is zero or more

11. What is the difference between {3} and {3,5} in regular expressions?

    - {3} matches exactly 3 repetitions of the preceding item, while {3,5} matches between 3 and 5 repetitions

12. What do the \d, \w, and \s shorthand character classes signify in regular expressions?

    - digits, word characters (letters, digits, and underscore), and whitespace (incl. newline)

13. What do the \D, \W, and \S shorthand character classes signify in regular expressions?

    - non-digits, non-word characters (anything other than letters, digits, and underscore), and non-whitespace characters

14. What is the difference between .* and .*??

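    - .* is a greedy match (zero or more of any character except newline, taking as much text as possible), while .*? is the non-greedy version (taking as little text as possible)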

15. What is the character class syntax to match all numbers and lowercase letters?

    - [a-z0-9]

16. How do you make a regular expression case-insensitive?

    - pass re.IGNORECASE (or re.I) as the second argument: re.compile(regex, re.IGNORECASE)

17. What does the . character normally match? What does it match if re.DOTALL is passed as the second argument to re.compile()?

    - all except newline
    - all including newline

18. If numRegex = re.compile(r'\d+'), what will numRegex.sub('X', '12 drummers, 11 pipers, five rings, 3 hens') return?

    - 'X drummers, X pipers, five rings, X hens'

19. What does passing re.VERBOSE as the second argument to re.compile() allow you to do?

    - it lets you add whitespace and comments to the regex string for readability; the regex engine ignores them

20. How would you write a regex that matches a number with commas for every three digits? It must match the following:
'42'
'1,234'
'6,368,745'
but not the following:
'12,34,567' (which has only two digits between the commas)
'1234' (which lacks commas)

    - ^\d{1,3}(,\d{3})*$


21. How would you write a regex that matches the full name of someone whose last name is Watanabe? You can assume that the first name that comes before it will always be one word that begins with a capital letter. The regex must match the following:
'Haruto Watanabe'
'Alice Watanabe'
'RoboCop Watanabe'
but not the following:
'haruto Watanabe' (where the first name is not capitalized)
'Mr. Watanabe' (where the preceding word has a nonletter character)
'Watanabe' (which has no first name)
'Haruto watanabe' (where Watanabe is not capitalized)

    - [A-Z][a-zA-Z]*\sWatanabe

22. How would you write a regex that matches a sentence where the first word is either Alice, Bob, or Carol; the second word is either eats, pets, or throws; the third word is apples, cats, or baseballs; and the sentence ends with a period? This regex should be case-insensitive. It must match the following:
'Alice eats apples.'
'Bob pets cats.'
'Carol throws baseballs.'
'Alice throws Apples.'
'BOB EATS CATS.'
but not the following:
'RoboCop eats apples.'
'ALICE THROWS FOOTBALLS.'
'Carol eats 7 cats.'

    - re.compile( r'(Alice|Bob|Carol)\s(eats|pets|throws)\s(apples|cats|baseballs)\.', re.IGNORECASE )