In [68]:
import re

# .    - Any Character Except New Line
# \d   - Digit (0-9)
# \D   - Not a Digit (0-9)
# \w   - Word Character (a-z, A-Z, 0-9, _)
# \W   - Not a Word Character
# \s   - Whitespace (space, tab, newline)
# \S   - Not Whitespace (space, tab, newline)

# These are called anchors
# \b   - Word Boundary
# \B   - Not a Word Boundary
# ^    - Beginning of a String
# $    - End of a String

# []   - Matches Characters in brackets
# [^ ] - Matches Characters NOT in brackets
# |    - Either Or
# ( )  - Group

# Quantifiers
# *    - 0 or More
# +    - 1 or More
# ?    - 0 or One
# {3}  - Exact Number
# {3,4}- Range of Numbers (Minimum, Maximum)

# Sample Regexes
# [a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+

# Raw string: the lone backslash in the metacharacter list below would
# otherwise form an invalid escape sequence. The resulting text is unchanged.
text_to_search = r'''
abcdefghijklmnopqrstuvwxyz
ABCDEFGHIJKLMNOPQRSTUVWXYZ
1234567890

Ha HaHa

MetaCharacters (Need to be escaped):
. ^ $ * + ? { } [ ] \ | ( )

coreyms.com

321-555-4321
123.555.1234
123*555*1234
321--555-4321
800.555.1234
900.555.1234

Mr. Schafer
Mr Smith
Ms Davis
Mrs. Robinson
Mr. T

cat
mat
pat
bat
'''

sentence = 'Start a sentence and then bring it to an end'

# Example of a raw string (prefixed with an r)
print('\tTab') # normal string: \t is interpreted as a tab character
print(r'\tTab') # raw string: the backslash and the t are kept as-is

# Specify pattern of interest.
# NOTE: every assignment below replaces the previous one, so only the LAST
# 'pattern' assignment in this cell is in effect when the cell runs. Comment
# out / reorder lines to try a different pattern.

# Search for these using 'text_to_search' in the finditer
pattern = re.compile(r'abc') # found at span (1, 4), i.e. indexes 1 through 3
pattern = re.compile(r'cba') # will not be found (the order of characters matters)
pattern = re.compile(r'.') # matches every character except New Lines
pattern = re.compile(r'\.') # escape the period with a backslash to match a literal .
pattern = re.compile(r'coreyms\.com') # escaped period so that only a literal . matches between the two parts
pattern = re.compile(r'\d') # matches Digits 0-9
pattern = re.compile(r'\D') # matches anything that is NOT a Digit 0-9
pattern = re.compile(r'\w') # matches Word Characters (a-z, A-Z, 0-9, _)
pattern = re.compile(r'\W') # matches anything that is NOT a Word Character (a-z, A-Z, 0-9, _)
pattern = re.compile(r'\s') # matches Whitespace (space, tab, newline)
pattern = re.compile(r'\S') # matches anything that is NOT Whitespace (space, tab, newline)
pattern = re.compile(r'\bHa') # matches 'Ha' only where it starts at a Word Boundary
pattern = re.compile(r'\BHa') # matches 'Ha' only where it does NOT start at a Word Boundary
pattern = re.compile(r'\d\d\d.\d\d\d.\d\d\d\d') # phone numbers with any single character as the separator
pattern = re.compile(r'\d\d\d[-.]\d\d\d[-.]\d\d\d\d') # phone numbers that are separated only by - or .
pattern = re.compile(r'[89]00[-.]\d\d\d[-.]\d\d\d\d') # only the 800 and 900 numbers
pattern = re.compile(r'[1-5]') # matches only the digits 1 through 5
pattern = re.compile(r'[a-z]') # matches lowercase letters a to z
pattern = re.compile(r'[a-zA-Z]') # matches both uppercase and lowercase letters a to z
pattern = re.compile(r'[^a-zA-Z]') # matches any character that is NOT an uppercase or lowercase letter
pattern = re.compile(r'[^b]at') # matches 'at' preceded by any one character other than b
pattern = re.compile(r'\d{3}.\d{3}.\d{4}') # same phone number search written with quantifiers
pattern = re.compile(r'Mr\.') # finds all Mr.
pattern = re.compile(r'Mr\.?') # finds both Mr. and Mr (the period is optional)
pattern = re.compile(r'Mr\.?\s[A-Z]') # finds Mr./Mr and the first letter of the last names
pattern = re.compile(r'Mr\.?\s[A-Z]\w+') # finds Mr./Mr and the full last name, but only last names longer than one letter
pattern = re.compile(r'Mr\.?\s[A-Z]\w*') # finds Mr./Mr and the full last name, including one-letter last names
pattern = re.compile(r'M(r|s|rs)\.?\s[A-Z]\w*') # finds Mr./Mr/Ms/Mrs and the full last name
pattern = re.compile(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*') # alternate method
pattern = re.compile(r'\d{3}.\d{3}.\d{4}') # phone numbers for findall

# Search for these using 'sentence' in the finditer
pattern = re.compile(r'^Start') # matches: the string begins with 'Start'
pattern = re.compile(r'^a') # no match: the string does not begin with 'a'
pattern = re.compile(r'end$') # matches: the string ends with 'end'
pattern = re.compile(r'a$') # no match: the string does not end with 'a'

# Match method examples using 'sentence' variable
pattern = re.compile(r'Start') # .match returns a Match because 'Start' is at the beginning
pattern = re.compile(r'sentence') # .match returns None because it only checks the beginning of the string
# and the word 'sentence' is in the middle of the sentence variable

# Search method examples using 'sentence' variable
pattern = re.compile(r'sentence') # .search finds the word 'sentence'
pattern = re.compile(r'dne') # .search returns None
pattern = re.compile(r'start', re.IGNORECASE) # use of flags, will find start regardless of capitalization
pattern = re.compile(r'start', re.I) # short form of the flag above

# Specify area to search for pattern.
# As with 'pattern', only the last assignment to 'matches' is in effect.
matches = pattern.finditer(text_to_search) # iterator of Match objects
matches = pattern.findall(text_to_search) # findall method will just return the matches as a list of strings
# if the pattern contains groups, it will only return the groups (ex. Mr Ms Mrs)
matches = pattern.finditer(sentence)
matches = pattern.match(sentence) # single Match or None; print it directly, don't use a for loop
# only matches things at the beginning of strings
matches = pattern.search(sentence) # single Match or None; print it directly, don't use a for loop
# matches at any location within a string, but only the first instance

print(matches)

# Only finditer/findall results can be looped over. match/search return a
# single Match object (or None), which is not iterable, so skip the loop then.
if matches is not None and not isinstance(matches, re.Match):
    for match in matches:
        print(match)

print(text_to_search[1:4]) # slice [1:4] covers indexes 1, 2 and 3 -> 'abc'

# --------------------------------------------------------------------------------------

<re.Match object; span=(0, 5), match='Start'>


In [32]:
# This section used with the data.txt file
# Compile the pattern in this cell instead of relying on whatever 'pattern'
# a previously executed cell left behind. The recorded output (only 800- and
# 900- numbers) corresponds to this 800/900 phone number pattern.
pattern = re.compile(r'[89]00[-.]\d\d\d[-.]\d\d\d\d')

# Read the whole file first so it is closed before any matching is done.
with open('data.txt', 'r') as f:
    contents = f.read()

matches = pattern.finditer(contents)

# Each Match shows the span (start, end) and the matched phone number.
for match in matches:
    print(match)

<re.Match object; span=(102, 114), match='800-555-5669'>
<re.Match object; span=(281, 293), match='900-555-9340'>
<re.Match object; span=(467, 479), match='800-555-6771'>
<re.Match object; span=(1093, 1105), match='900-555-3205'>
<re.Match object; span=(1443, 1455), match='800-555-6089'>
<re.Match object; span=(1794, 1806), match='800-555-7100'>
<re.Match object; span=(2055, 2067), match='900-555-5118'>
<re.Match object; span=(2830, 2842), match='900-555-5428'>
<re.Match object; span=(3290, 3302), match='800-555-8810'>
<re.Match object; span=(3977, 3989), match='900-555-9598'>
<re.Match object; span=(4951, 4963), match='800-555-2420'>
<re.Match object; span=(5572, 5584), match='900-555-3567'>
<re.Match object; span=(6195, 6207), match='800-555-3216'>
<re.Match object; span=(6897, 6909), match='900-555-7755'>
<re.Match object; span=(7872, 7884), match='800-555-1372'>


In [49]:
emails = '''
CoreyMSchafer@gmail.com
corey.schafer@university.edu
corey-321-schafer@my-work.net
'''

# The patterns below grow step by step. Each assignment replaces the one
# before it, so only the final pattern is used for the search further down.

# Letters only before the @, and only addresses ending in .com
pattern = re.compile(r'[a-zA-Z]+@[a-zA-Z]+\.com')
# Also allows . before the @, and accepts .com or .edu endings
pattern = re.compile(r'[a-zA-Z.]+@[a-zA-Z]+\.(com|edu)')
# Also allows digits and - before the @, - in the domain, and accepts .com, .edu or .net endings
pattern = re.compile(r'[a-zA-Z0-9.-]+@[a-zA-Z-]+\.(com|edu|net)')
# General-purpose example written by someone else
pattern = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')

# finditer yields one Match object per email address found
matches = pattern.finditer(emails)

for match in matches:
    print(match)

<re.Match object; span=(1, 24), match='CoreyMSchafer@gmail.com'>
<re.Match object; span=(25, 53), match='corey.schafer@university.edu'>
<re.Match object; span=(54, 83), match='corey-321-schafer@my-work.net'>


In [59]:
import re

urls = '''
https://www.google.com
http://coreyms.com
https://youtube.com
https://www.nasa.gov
'''

# Each assignment replaces the previous one; only the last pattern is used.
pattern = re.compile(r'https?://(www\.)?') # matches up to domain name
pattern = re.compile(r'https?://(www\.)?\w+\.\w+') # matches entire url
pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)') # added groups
# 1st group is optional www, 2nd group is domain name, 3rd group is top level domain (.com/.gov)
# Group 0 is the entire url itself

# finditer returns a one-shot iterator: once a loop has consumed it, a second
# loop over the same object sees nothing. Store the matches in a list so that
# both loops below get to see every match.
matches = list(pattern.finditer(urls))

# General looping
for match in matches:
    print(match)

# Loop to show groups (group(0) is the whole matched url)
for match in matches:
    print(match.group(0))

subbed_urls = pattern.sub(r'\2\3', urls) # replaces each url with its groups 2 and 3 (domain name + top level domain)

print(subbed_urls)


google.com
coreyms.com
youtube.com
nasa.gov

