In [None]:
# is phone number without regex (text patterns)
def isPhoneNumber(text):
    """Return True if ``text`` is exactly a phone number shaped ``ddd-ddd-dddd``.

    The check uses plain string operations (no regular expressions): the
    string must be 12 characters long, have dashes at indexes 3 and 7, and
    decimal digits at every other position.

    Args:
        text: The candidate string.

    Returns:
        True when every position matches the pattern, otherwise False.
    """
    if len(text) != 12:
        return False  # not phone number-sized
    if not text[0:3].isdecimal():
        return False  # no area code
    if text[3] != '-':
        return False  # missing dash
    if not text[4:7].isdecimal():
        return False  # no first 3 digits
    if text[7] != '-':
        return False  # missing second dash
    if not text[8:12].isdecimal():
        return False  # missing last 4 digits
    # Reached only after every one of the 12 positions has been validated.
    return True

In [None]:
# Try the checker on one valid number and one string that is not a number;
# each result is printed on its own line.
print(
    isPhoneNumber('415-555-1234'),
    isPhoneNumber('hello'),
    sep='\n',
)

In [None]:
# Slide a 12-character window across the message and report every window
# that isPhoneNumber() accepts.
message = 'Call me 415-555-1011 tomorrow, or at 415-555-9999 for my office line'
foundNumber = False
for i in range(len(message)):
    chunk = message[i:i + 12]
    if not isPhoneNumber(chunk):
        continue  # this window is not a phone number; move on
    print('Phone number found: ' + chunk)
    foundNumber = True
if not foundNumber:
    print('Could not find any phone numbers.')

In [None]:
# Regular Expression Basics (like a mini language for specifying text patterns)
import re

In [None]:
message = 'Call me 415-555-1011 tomorrow, or at 415-555-9999 for my office line'

# Compile the pattern once. The raw string (r'...') keeps the backslashes in
# \d from being treated as Python string escapes.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

# search() returns a match object describing the first hit in the text.
mo = phoneNumRegex.search(message)

# group(0) is the full text that matched.
print(mo.group(0))

# findall() returns every match in the text as a list of strings.
print(phoneNumRegex.findall(message))

In [None]:
# Workflow:
#   1. re.compile() creates a regex object
#   2. the regex object's search() creates a match object
#   3. the match object's group() returns the matched string

import re
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
mo = phoneNumRegex.search('My number is 415-555-4242')
mo.group()

# Parentheses mark out numbered groups inside the pattern.
# Group 1 is the area code, group 2 is the rest of the number.
phoneNumRegex = re.compile(r'(\d\d\d)-(\d\d\d-\d\d\d\d)')
mo = phoneNumRegex.search('My number is 415-555-4242')
print(
    mo.group(0),  # the whole match
    mo.group(1),  # the area code
    mo.group(2),  # the phone number
    sep='\n',
)

In [None]:
# Backslash-escaped parentheses match literal "(" and ")" characters
# instead of opening a group.
phoneNumRegex = re.compile(
    r'\(\d\d\d\) \d\d\d-\d\d\d\d'
)
mo = phoneNumRegex.search(
    'My number is (415) 555-4242'
)
mo.group(0)

In [None]:
# The pipe character: match any one of several alternatives.

# "Bat" followed by exactly one of the listed suffixes.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a while')
print(
    mo.group(0),  # full match
    mo.group(1),  # the alternative that matched inside the parentheses
    sep='\n',
)

In [None]:
# Repetition in regex patterns and greedy/non-greedy matching

import re

# Alternation form of the same idea: re.compile(r'Batman|Batwoman')

# (wo)? makes the "wo" group optional: it may appear zero or one time.
batRegex = re.compile(r'Bat(wo)?man')
mo = batRegex.search('The adventures of Batman')
type(mo)
mo = batRegex.search('The adventures of Batwoman')
if mo is not None:
    print(mo.group())

In [None]:
# The same trick handles phone numbers with or without an area code.
# Fixed-format version for comparison: re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
# Here the first three digits plus the dash form a group that may appear
# zero or one time.
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
# Also worth trying:
#   phoneRegex.search('My phone number is 415-555-1234. Call me tomorrow.')
mo = phoneRegex.search('My phone number is 555-1234. Call me tomorrow.')
if mo is not None:
    print(mo.group())

In [None]:
# Quantifier summary:
#   ?  zero or one time
#   *  any number of times (including zero)
#   +  one or more times
batRegex = re.compile(r'Bat(wo)*man')
mo = batRegex.search('The adventures of Batwowowoman')
if mo is not None:
    print(mo.group())

In [None]:
# To match a literal +, * or ?, put a backslash in front of it.
regex = re.compile(r'\+\*\?')
mo = regex.search('I learned about +*? regex syntax')
if mo is not None:
    print(mo.group())

In [None]:
# Wrap the escaped sequence in a group and require it one or more times.
regex = re.compile(r'(\+\*\?)+')
mo = regex.search('I learned about +*?+*?+*? regex syntax')
if mo is not None:
    print(mo.group())

In [None]:
# Curly brackets after a group require an exact number of repetitions.
haRegex = re.compile(r'(Ha){3}')
mo = haRegex.search('He said "HaHaHa"')
if mo is not None:
    print(mo.group())

In [None]:
# Exactly three phone numbers in a row. Each one may or may not have an
# area code, and each may be followed by an optional comma.
phoneRegex = re.compile(r'((\d\d\d-)?\d\d\d-\d\d\d\d(,)?){3}')
mo = phoneRegex.search('My numbers are: 415-999-1234,415-888-2345,777-3456')
if mo is not None:
    print(mo.group())

In [None]:
# {3,5}: the group must repeat at least 3 and at most 5 times.
haRegex = re.compile(r'(Ha){3,5}')
mo = haRegex.search('He said "HaHaHaHa"')
if mo is not None:
    print(mo.group())

In [None]:
# {3,}: the group must repeat 3 or more times, with no upper limit.
haRegex = re.compile(r'(Ha){3,}')
mo = haRegex.search('He said "HaHaHaHa"')
if mo is not None:
    print(mo.group())

In [None]:
# Python regexes are greedy by default: a quantifier grabs the longest
# string that still lets the pattern match.

# Between 3 and 5 digits; greedy matching takes 5 here.
digitRegex = re.compile(r'(\d){3,5}')
mo = digitRegex.search('1234567890')
if mo is not None:
    print(mo.group())

In [None]:
# Greedy matching is the default, but a question mark placed after a
# quantifier switches it to non-greedy: the shortest string that satisfies
# the pattern is matched instead.

# Between 3 and 5 digits; non-greedy matching stops at 3 here.
digitRegex = re.compile(r'(\d){3,5}?')
mo = digitRegex.search('1234567890')
if mo is not None:
    print(mo.group())

In [None]:
# Regex character classes and the findall() method

import re
phoneRegex = re.compile(r'\d\d\d-\d\d\d\-\d\d\d\d')
# With no groups in the pattern, findall() returns a list of strings.
mo = phoneRegex.findall('My number is 212-755-7441')
print(mo)

In [None]:
# When the pattern contains groups, findall() returns a list of tuples,
# one tuple per match, holding the text of each group.
import re
phoneRegex = re.compile(r'(\d\d\d)-(\d\d\d\-\d\d\d\d)')
mo = phoneRegex.findall(
    'My numbers are 212-755-7441 and 917-881-6172 and 212-758-7202'
)
print(mo)

In [None]:
# An outer group around the whole pattern adds the full number as the first
# element of each tuple.
import re
phoneRegex = re.compile(r'((\d\d\d)-(\d\d\d\-\d\d\d\d))')
mo = phoneRegex.findall(
    'My numbers are 212-755-7441 and 917-881-6172 and 212-758-7202'
)
print(mo)

In [None]:
# Character classes
# These two patterns accomplish the same thing:
#   digitRegex = re.compile(r'(0|1|2|3|4|5|6|7|8|9)')
#   digitRegex = re.compile(r'\d')   # shorthand
#
# Shorthand classes:
#   \d   any numeric digit from 0 to 9
#   \D   any character that is not a numeric digit from 0 to 9
#   \w   any letter, numeric digit, or the underscore ("word" characters)
#   \s   any space, tab or newline ("space" characters)
#   \S   any character that is not a space, tab or newline

# Example: one or more digits, a single whitespace character, then a word.

lyrics = '12 drummers drumming, 11 pipers piping, 10 lords a leaping, 9 ladies dancing, 8 maids a milking, 7 swans a swimming, and 1 partridge in a pear tree'
xmasRegex = re.compile(r'\d+\s\w+')
xmasRegex.findall(lyrics)

In [None]:
# Square brackets define a custom character class.
# A range form: every lowercase and uppercase letter from a to f.
# (This binding is replaced by the next line.)
vowelRegex = re.compile(r'[a-fA-F]')
# An explicit list of characters; same as r'(a|e|i|o|u|A|E|I|O|U)'.
vowelRegex = re.compile(r'[aeiouAEIOU]')
vowelRegex.findall('Robocop eats baby food')

In [None]:
# A quantifier applies to a character class as a whole: two vowels in a row.
vowelRegex = re.compile(r'[aeiouAEIOU]{2}')
vowelRegex.findall('Robocop eats baby food')

In [None]:
# A caret right after the opening bracket negates the class. This matches
# every character that is NOT a vowel, which includes spaces as well as
# consonants.
consonantsRegex = re.compile(r'[^aeiouAEIOU]')
consonantsRegex.findall('Robocop eats baby food')

In [None]:
# A caret at the start of the pattern anchors the match to the beginning of
# the string.
beginsWithHelloRegex = re.compile(r'^Hello')
beginsWithHelloRegex.search('Hello there!')
# "Hello" is not at the start here, so search() finds nothing.
beginsWithHelloRegex.search('He said Hello there!') is None

In [None]:
# A dollar sign at the end of the pattern anchors the match to the END of
# the string.
endsWithWorldRegex = re.compile(r'world!$')
# The original cell bound this pattern to the name beginsWithHelloRegex;
# that binding is kept so anything relying on it behaves as before.
beginsWithHelloRegex = endsWithWorldRegex
endsWithWorldRegex.search('Hello world!')
# "world!" is not at the end here, so search() finds nothing.
endsWithWorldRegex.search('Hello world! Nick') is None

In [None]:
# Using ^ and $ together forces the pattern to cover the entire string.
allDigitsRegex = re.compile(r'^\d+$')
allDigitsRegex.search('1234234234879234') is None   # all digits: a match is found
allDigitsRegex.search('123423c4234879234') is None  # contains a letter: no match

In [None]:
# The dot is a wildcard: any single character (except a newline), here
# followed by "at".
atRegex = re.compile(r'.at')
atRegex.findall('The cat in the hat sat on the flat mat.')

In [None]:
# One or two arbitrary characters before "at". The wildcard also matches
# spaces, so some results carry a leading space.
atRegex = re.compile(r'.{1,2}at')
atRegex.findall('The cat in the hat sat on the flat mat.')

In [None]:
# .* means "anything at all", and it is greedy: it takes as much text as
# it can.
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
nameRegex.findall('First Name: Al Last Name: Sweigart')

In [None]:
# .*? is the non-greedy form: it matches as little text as possible, so it
# stops at the first closing bracket.
serve = '<To serve humans> for dinner.>'
nongreedy = re.compile(r'<(.*?)>')
nongreedy.findall(serve)

In [None]:
# .* is greedy: it matches as much text as possible. Here that is too much,
# because it runs on to the last closing bracket.
serve = '<To serve humans> for dinner.>'
greedy = re.compile(r'<(.*)>')
greedy.findall(serve)

In [None]:
# By default the dot does not match a newline, so .* stops at the end of
# the first line.
prime = 'Serve the public trust.\nProtect the innocent.\nUpload the law.'
dotStar = re.compile(r'.*')
dotStar.search(prime)

In [None]:
# With the re.DOTALL flag the dot matches newlines too, so .* covers every
# line of the text.
prime = 'Serve the public trust.\nProtect the innocent.\nUpload the law.'
dotStar = re.compile(r'.*', re.DOTALL)
dotStar.search(prime)

In [None]:
# Matching is case-sensitive by default: only lowercase vowels are found.
vowelRegex = re.compile(r'[aeiou]')
vowelRegex.findall(
    'Al, why does your programming book talk about RoboCop so much?'
)

In [None]:
# re.IGNORECASE (short form: re.I) makes the same class match uppercase
# vowels as well.
vowelRegex = re.compile(r'[aeiou]', re.IGNORECASE)
vowelRegex.findall(
    'Al, why does your programming book talk about RoboCop so much?'
)

In [None]:
# The regex sub() method and verbose mode

# "Agent" followed by a space and one word.
namesRegex = re.compile(r'Agent \w+')
namesRegex.findall('Agent Alice gave the secret documents to Agent Bob.')
# sub(replacement, text) returns the text with every match replaced.
namesRegex.sub(
    'REDACTED',
    'Agent Alice gave the secret documents to Agent Bob.',
)

In [None]:
# Group 1 captures only the first letter of the name; \1 in the
# replacement string inserts that captured letter.
namesRegex = re.compile(r'Agent (\w)\w*')
namesRegex.findall('Agent Alice gave the secret documents to Agent Bob.')
namesRegex.sub(
    r'Agent \1****',
    'Agent Alice gave the secret documents to Agent Bob.',
)

In [None]:
# Verbose mode: whitespace and #-comments inside the pattern are ignored,
# so a long regex can be laid out one piece per line.
#
# Two things to watch for in verbose patterns:
#   * a literal space must be written as \s (or escaped), because plain
#     whitespace is stripped from the pattern;
#   * alternatives joined with | must be wrapped in a group, otherwise the
#     | splits the ENTIRE pattern in two.

verbosePhoneRegex = re.compile(r'''
(
    \d\d\d-                 # area code without parens, followed by a dash
    |                       # -or-
    \(\d\d\d\)\s            # area code in parens, followed by whitespace
)
\d\d\d                      # first three digits
-                           # separating dash
\d\d\d\d                    # last four digits
\sx\d{2,4}                  # extension, like x1234
''', re.VERBOSE)
# Flags can be combined:  re.IGNORECASE | re.DOTALL | re.VERBOSE
verbosePhoneRegex

In [None]:
# a regex phone and email scraper
# to do:
    # create a regex object for phone numbers
    # create a regex object for email addresses
    # get the text off the clipboard
    # extract the email/phone from this text
    # copy the extracted email/phone to the clipboard

In [None]:
import re, pyperclip

In [None]:
# Create a regex for phone numbers.
# The outermost group (group 1) always holds the complete number, so with
# findall() the full match is element 0 of each returned tuple.
phoneRegex = re.compile(r'''
# 415-555-0000, 555-0000, (415) 555-0000, 555-0000 ext 12345, ext. 12345, x12345

(
(((\d\d\d)|(\(\d\d\d\)))       # area code ...
 (\s|-))?                      # ... plus its separator (optional as a pair)
\d\d\d                         # first 3 digits
-                              # separator
\d\d\d\d                       # last 4 digits
(\s*                           # optional whitespace before the extension
 ((ext(\.)?\s)|x)              # extension word-part
 (\d{2,5}))?                   # extension number-part, 2 to 5 digits (whole extension optional)
)
''', re.VERBOSE)

In [None]:
# Create a regex object for email addresses.
# The pattern deliberately contains no groups, so findall() returns plain
# strings rather than tuples.
emailRegex = re.compile(r'''
# some.+_thing@some-thing.com

[a-zA-Z0-9_.+-]+       # name part (custom character class; hyphen allowed)
@                      # @ symbol
[a-zA-Z0-9_.+-]+       # domain name part (hyphen allowed, e.g. my-site.com)
''', re.VERBOSE)

In [None]:
# NOTE(review): no earlier cell in view assigns `text`; it is set by
# pyperclip.paste() in the following cell, so that cell appears to need to
# run before this one. Confirm the intended execution order.
print(text)

In [137]:
# Read whatever text is currently on the clipboard.
text = pyperclip.paste()

# Pull every phone number and email address out of that text.
extractedPhone = phoneRegex.findall(text)
extractedEmail = emailRegex.findall(text)

# Each phone entry's element 0 is the complete number; keep only that.
allPhoneNumbers = [phoneNumber[0] for phoneNumber in extractedPhone]

# Put the phone numbers first, then the email addresses, one item per
# line, and copy the result back to the clipboard.
results = '\n'.join([
    '\n'.join(allPhoneNumbers),
    '\n'.join(extractedEmail),
])
pyperclip.copy(results)