# Regular Expressions—Very Simple Examples

The following page shows a complete list of the special characters used in regular expressions:

https://docs.python.org/3/library/re.html

In [4]:
# Using 're.search()'
#
# re.search(pattern, string) scans the string and returns a Match object for
# the first place the pattern matches, or None when nothing matches.
# Patterns are written as raw strings (r'...') so that backslashes reach the
# regex engine unchanged and Python does not warn about escapes such as '\w'.
import re

s = """
Republicans and Democrats have used accounting gimmicks and competing 
government analyses to deceive the public into believing that 2 + 2 = 6. If 
our leaders cannot agree on the numbers, if \'facts\' are fictional, how can 
they possibly have a substantive debate on solutions? - J. C. Watts
"""

# Example 1: a literal match on the word 'gimmicks'.
res = re.search(r'gimmicks', s)
if res is not None:
    print(res.group())

# !! We don't need regular expressions just to find out whether one string
# !! occurs inside another: the 'in' operator or str.find() does that.
# !! For more complicated pattern matching, use the regular expressions
# !! package (i.e., 're').

# Example 2: a word ('\w+'), one whitespace character ('\s'), the literal
# 'to', one whitespace character, and then another word ('\w+').
res = re.search(r'\w+\sto\s\w+', s)
if res is not None:
    print(res.group())

# Example 3: a digit, any run of characters, then another digit. '.*' is
# greedy and does not cross a newline, so the match runs from the first digit
# to the last digit on the same line.
res = re.search(r'[0-9].*[0-9]', s)
if res is not None:
    print(res.group())


gimmicks
analyses to deceive
2 + 2 = 6


In [30]:
# Working with Match Objects...
import re

s = """Republicans and Democrats have used accounting gimmicks and competing 
government analyses to deceive the public into believing that 2 + 2 = 6. If 
our leaders cannot agree on the numbers, if \'facts\' are fictional, how can 
they possibly have a substantive debate on solutions? - J. C. Watts
"""

# re.search() returns a Match object if there is a match.
# If there is no match, it returns None.
res = re.search('gimmicks', s)  # 'gimmicks' (literal match)

# Note: 'res' now holds either a Match object or None, so test it with
# 'is not None' (identity comparison) before calling any Match method.

if res is not None:
    print(res.group())   # the whole matched text (group 0), as a string
    print(res.start())   # index of the first character of the match
    print(res.end())     # index just past the last character of the match
    print(res.span())    # the tuple (start, end)
    print(res.string)    # the original string that was searched


gimmicks
47
55
(47, 55)
Republicans and Democrats have used accounting gimmicks and competing 
government analyses to deceive the public into believing that 2 + 2 = 6. If 
our leaders cannot agree on the numbers, if 'facts' are fictional, how can 
they possibly have a substantive debate on solutions? - J. C. Watts



In [36]:
# Using 're.findall()' and 're.finditer()' functions

# Read the entire file into 's' (a string). The 'with' block closes the file
# as soon as the read finishes, even if reading raises an error.
with open("text/moby_dick.txt") as fin:
    s = fin.read()

# Find every two-word sequence that starts with 'my': the literal 'my', one
# whitespace character ('\s'), then a word ('\w+'). The pattern is a raw
# string so the backslashes reach the regex engine unchanged.
matches = re.findall(r"my\s\w+", s)
print(matches)                        # re.findall() returns a list of matched strings.
print("----")
for match in matches:                 # loop through the items in the list of matches
    print(match)
    

['my purse', 'my soul', 'my hypos', 'my substitute']
----
my purse
my soul
my hypos
my substitute


In [2]:
import re

# Read the entire file into 's' (a string). The 'with' block closes the file
# as soon as the read finishes, even if reading raises an error.
with open("text/moby_dick.txt") as fin:
    s = fin.read()

print("----")
# re.finditer() yields one Match object per match, so each match's text and
# position are both available inside the loop. The pattern is a raw string so
# the backslashes reach the regex engine unchanged.
for m in re.finditer(r"my\s\w+", s):
    print(m.group())   # the matched text
    print(m.start())   # index in 's' where the match begins
    

----
my purse
91
my soul
398
my hypos
552
my substitute
804


In [6]:
# Lookahead example
#
# '(?=...)' is a lookahead assertion: it requires that '...' comes next, but
# the text it checks is not included in the match.
import re

s = """Republicans and Democrats have used accounting gimmicks and competing 
government analyses to deceive the public into believing that 2 + 2 = 6. If 
our leaders cannot agree on the numbers, if \'facts\' are fictional, how can 
they po
"""

# Example 1. Pattern = a word + one whitespace character that is followed by
# 'gimmicks', not including 'gimmicks' itself.
res = re.search(r'\w+\s(?=gimmicks)', s)
if res is not None:
    print(res.group())

# Example 2. Pattern = a word + one whitespace character that is followed by
# a single digit, not including the digit.
res = re.search(r'\w+\s(?=[0-9])', s)
if res is not None:
    print(res.group())

accounting 
that 
