# Regular Expressions—Very Simple Examples

The following page shows a complete list of special characters used in regular expressions:

https://docs.python.org/3/library/re.html

In [4]:
# Using 're.search()'
#
# re.search(pattern, string) scans the string and returns a Match object for
# the first location where the pattern matches, or None if nothing matches.
import re

s = """
Republicans and Democrats have used accounting gimmicks and competing 
government analyses to deceive the public into believing that 2 + 2 = 6. If 
our leaders cannot agree on the numbers, if \'facts\' are fictional, how can 
they possibly have a substantive debate on solutions? - J. C. Watts
"""

# Patterns are written as raw strings (r'...') so that backslashes reach the
# regex engine unchanged; sequences such as '\w' in a normal string literal
# are invalid string escapes and trigger warnings on recent Python versions.

# Literal match: the pattern has no special characters, so it matches the
# exact text 'gimmicks'.
res = re.search(r'gimmicks', s)
if res is not None:
    print(res.group())

# !! A regular expression is not needed just to test whether one string occurs
# !! inside another; the 'in' operator or str.find() does that. Use the 're'
# !! package when you need real pattern matching, as in the examples below.

# A word ('\w+'), one whitespace character ('\s'), the literal 'to', one
# whitespace character, then another word.
res = re.search(r'\w+\sto\s\w+', s)
if res is not None:
    print(res.group())

# A digit, then any run of characters ('.*' is greedy, and '.' does not match
# a newline by default), then a digit.
res = re.search(r'[0-9].*[0-9]', s)
if res is not None:
    print(res.group())


gimmicks
analyses to deceive
2 + 2 = 6


In [30]:
# Working with Match Objects...
import re

s = """Republicans and Democrats have used accounting gimmicks and competing 
government analyses to deceive the public into believing that 2 + 2 = 6. If 
our leaders cannot agree on the numbers, if \'facts\' are fictional, how can 
they possibly have a substantive debate on solutions? - J. C. Watts
"""

# re.search() returns a Match object if the pattern is found,
# and None if there is no match.
res = re.search('gimmicks', s)  # 'gimmicks' (literal match)

# Note: 'res' now holds either a Match object or None, so test it with
# 'is not None' before calling any Match methods.

if res is not None:
    print(res.group())   # the matched text (the whole match when called with no arguments)
    print(res.start())   # the starting index of the match
    print(res.end())     # the ending index of the match (one past the last matched character)
    print(res.span())    # a tuple (start, end) of the two indexes above
    print(res.string)    # the original string passed to re.search()


gimmicks
47
55
(47, 55)
Republicans and Democrats have used accounting gimmicks and competing 
government analyses to deceive the public into believing that 2 + 2 = 6. If 
our leaders cannot agree on the numbers, if 'facts' are fictional, how can 
they possibly have a substantive debate on solutions? - J. C. Watts



In [36]:
# Using 're.findall()' and 're.finditer()' functions
import re  # imported here so the cell also runs on a fresh kernel

# Read the entire file into one string. The 'with' block closes the file
# as soon as the read is finished.
with open("text/moby_dick.txt") as fin:
    s = fin.read()

# Find every two-word sequence that starts with 'my': the literal 'my', one
# whitespace character ('\s'), then a word ('\w+'). The pattern is a raw
# string so the backslashes are passed to the regex engine unchanged.
matches = re.findall(r"my\s\w+", s)
print(matches)                        # re.findall() returns a list of matched strings.
print("----")
for match in matches:                 # loop through the items in the list of matches
    print(match)
    

['my purse', 'my soul', 'my hypos', 'my substitute']
----
my purse
my soul
my hypos
my substitute


In [2]:
import re

# Read the entire file into one string. The 'with' block closes the file
# as soon as the read is finished.
with open("text/moby_dick.txt") as fin:
    s = fin.read()

print("----")
# re.finditer() yields one Match object per match, so positions are available
# as well as the matched text. The pattern is a raw string so the backslashes
# are passed to the regex engine unchanged.
for m in re.finditer(r"my\s\w+", s):
    print(m.group())   # the matched text
    print(m.start())   # index in 's' where this match begins
    

----
my purse
91
my soul
398
my hypos
552
my substitute
804


In [6]:
# Lookahead example
#
# '(?=...)' is a lookahead assertion: it requires that '...' matches at the
# current position, but the text it matches is not included in the result.
import re

s = """Republicans and Democrats have used accounting gimmicks and competing 
government analyses to deceive the public into believing that 2 + 2 = 6. If 
our leaders cannot agree on the numbers, if \'facts\' are fictional, how can 
they po
"""

# Example 1. A word plus one whitespace character that is followed by
# 'gimmicks'; 'gimmicks' itself is not part of the match.
res = re.search(r'\w+\s(?=gimmicks)', s)
if res is not None:
    print(res.group())

# Example 2. A word plus one whitespace character that is followed by a
# single digit; the digit is not part of the match.
res = re.search(r'\w+\s(?=[0-9])', s)
if res is not None:
    print(res.group())

accounting 
that 


### Commonly Used Regular Expressions

In [12]:
import re

s = "The United States presidential election of 2016 will be the 58th quadrennial U.S. presidential election."

# All patterns are raw strings (r'...') so the backslashes are passed to the
# regex engine unchanged.
print( re.findall(r'[0-9]', s) )       # '[0-9]' — matches a single digit
print( re.findall(r'[0-9]+', s) )      # '[0-9]+' — matches one or more digits

print("---")
print( re.findall(r'\w', s) )          # '\w' — matches a single word character: a letter, digit or
                                       # underscore (not very useful on its own).
print("---")
print( re.findall(r'\w+', s) )         # '\w+' — one or more word characters. This pattern matches a word.

print("---")
print( re.findall(r'\w+\s\w+', s) )    # '\w+\s\w+' — matches a word, a whitespace character, then a word

print("---")
print( re.findall(r'\w+\.\w+', s) )    # '\w+\.\w+' — matches a word, a period, then a word
                                       # '\.' — matches a literal period.
print("---")
print( re.findall(r'.2016', s))        # '.' — a period alone (without a backslash) matches any single
                                       # character except a newline, including a space character.
print("---")
print( re.findall(r'.*2016', s))       # '.*' — matches zero or more of any character except a newline
                                       # (spaces included), as many as possible.
print("---")
print( re.findall(r'of\s[0-9]+', s) )  # "of" followed by a whitespace character and one or more digits




['2', '0', '1', '6', '5', '8']
['2016', '58']
---
['T', 'h', 'e', 'U', 'n', 'i', 't', 'e', 'd', 'S', 't', 'a', 't', 'e', 's', 'p', 'r', 'e', 's', 'i', 'd', 'e', 'n', 't', 'i', 'a', 'l', 'e', 'l', 'e', 'c', 't', 'i', 'o', 'n', 'o', 'f', '2', '0', '1', '6', 'w', 'i', 'l', 'l', 'b', 'e', 't', 'h', 'e', '5', '8', 't', 'h', 'q', 'u', 'a', 'd', 'r', 'e', 'n', 'n', 'i', 'a', 'l', 'U', 'S', 'p', 'r', 'e', 's', 'i', 'd', 'e', 'n', 't', 'i', 'a', 'l', 'e', 'l', 'e', 'c', 't', 'i', 'o', 'n']
---
['The', 'United', 'States', 'presidential', 'election', 'of', '2016', 'will', 'be', 'the', '58th', 'quadrennial', 'U', 'S', 'presidential', 'election']
---
['The United', 'States presidential', 'election of', '2016 will', 'be the', '58th quadrennial', 'presidential election']
---
['U.S']
---
[' 2016']
---
['The United States presidential election of 2016']
---
['of 2016']


In [43]:
import re

s1 = "One million (1,000,000) or one thousand thousand is the natural number following 999,999 and preceding 1,000,001"
s2 = "Chinese officials signaled plans to diversify the nation's $1.9 trillion reserve."

# '(', ')' and '$' are special characters in a regular expression, so each
# needs a backslash in front of it to be matched literally. The patterns are
# raw strings (r'...') so those backslashes reach the regex engine unchanged.
print( re.findall(r'\([0-9,]+\)', s1))   # one or more digits/commas surrounded by parentheses
print( re.findall(r'\$[0-9.]+', s2))     # a dollar sign followed by one or more digits/periods

print( re.findall(r'\w+\sthe\s\w+', s1)) # matches a trigram with "the" in the middle (in s1)
print( re.findall(r'\w+\sthe\s\w+', s2)) # matches a trigram with "the" in the middle (in s2)


['(1,000,000)']
['$1.9']
['is the natural']
['diversify the nation']
