Python for Everybody
## Chapter 11 Regular Expressions

The following page shows a complete list of special characters used in regular expressions: https://docs.python.org/3/library/re.html

Don't try to read and remember everything all at once. The best way to learn regular expressions is to learn what you need when you need it.


In [None]:
# Some examples
#
# Each example re-reads the poem and prints every line whose stripped,
# lower-cased text matches the pattern. Patterns are written as raw strings
# (r'...') so that backslashes reach the regex engine unchanged, and each
# file is opened in a with-statement so it is closed even if an error occurs.

import re

with open("text/a_dream_within_a_dream.txt") as fhand:
    print("# if there is at least one 'in' in the line")
    for line in fhand:
        res = re.search(r'in', line.strip().lower())          # r'in'  -- at least one 'in'
        if res:
            print(line.strip())

with open("text/a_dream_within_a_dream.txt") as fhand:
    print("\n# if the line starts with 'in'")
    for line in fhand:
        res = re.search(r'^in', line.strip().lower())         # r'^in'  -- starts with 'in'
        if res:
            print(line.strip())

with open("text/a_dream_within_a_dream.txt") as fhand:
    print("\n# if the line ends with '?'")
    for line in fhand:
        res = re.search(r'\?$', line.strip().lower())         # r'\?$'  -- ends with a literal '?'
        if res:
            print(line.strip())

with open("text/a_dream_within_a_dream.txt") as fhand:
    print("\n# if 'in' is surrounded by white space characters")
    for line in fhand:
        # The line is stripped first, so an 'in' at the very start or end of
        # the line has no whitespace on that side and does not match.
        res = re.search(r'\sin\s', line.strip().lower())      # r'\sin\s'  -- \s matches a white space character
        if res:
            print(line.strip())

with open("text/a_dream_within_a_dream.txt") as fhand:
    print("\n# if 'in' is surrounded by NON white space characters")
    for line in fhand:
        res = re.search(r'\Sin\S', line.strip().lower())      # r'\Sin\S'  -- \S matches a NON white space character
        if res:
            print(line.strip())


In [None]:
# 11.1 Character matching in regular expressions

# Search for lines that start with 'F', followed by
# 2 characters, followed by 'm:'
import re

# The with-statement closes the file once the loop is done, even on error.
with open('text/mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()
        if re.search(r'^F..m:', line):    # . (dot) matches any single character except the newline character.
            print(line)


In [None]:
# More examples
#
# All patterns are raw strings (r'...') so that regex escapes such as \( \w \s
# are passed to the regex engine as written instead of being treated as
# (invalid) Python string escapes.

import re

# Optional example (left commented out): extract URLs from an HTML snippet.
#url = '<p>Hello World</p><a href="http://example.com">More Examples</a><a href="http://example2.com">Even More Examples</a>'
#urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', url)
#print(urls)

s = '(412) 234-5678'
print(re.findall(r'\([0-9]+\)', s))    # area code, parentheses included: ['(412)']
print(re.findall(r'\s[0-9]+', s))      # exchange, with its leading space: [' 234']
print(re.findall(r'[0-9]+$', s))       # digits at the end of the string (last four digits): ['5678']

q = "Two things are infinite: the universe and human stupidity; and I'm not sure about the universe. ― Albert Einstein"

# one or more word characters followed by a whitespace character then 'universe'. (i.e., bi-gram ends with 'universe')
print(re.findall(r'\w+\suniverse', q))

# 'universe' followed by a space, a word, a space, a word (i.e., tri-gram starts with 'universe')
print(re.findall(r'universe\s\w+\s\w+', q))

# '.*' matches any number of characters. (i.e., a pattern begins and ends with 'universe')
print(re.findall(r'universe.*universe', q))

# '|' (vertical bar) can be used to list options. (i.e., 'infinite' or 'human')
print(re.findall(r'(infinite|human)', q))

# You can construct a query in your code.
w1 = 'infinite'
w2 = 'human'
query1 = w1 + "|" + w2                   # either word
query2 = w1 + ".*" + w2                  # w1, anything in between, then w2
query3 = r"\w+\s" + w2 + r"\s\w+"        # the word before and the word after w2
print(re.findall(query1, q))
print(re.findall(query2, q))
print(re.findall(query3, q))


In [None]:
# 11.1 (cont.)

# Search for lines that start with 'From:' and have an at sign
# somewhere after at least one more character.
import re

# The with-statement closes the file once the loop is done, even on error.
with open('text/mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()
        if re.search(r'^From:.+@', line):
            print(line)

In [None]:
# 11.2 Extracting data using regular expressions
#
# Patterns are raw strings (r'...') so \S, \w and \s reach the regex engine
# unchanged.

import re
s = 'A message from csev@umich.edu to cwen@iupui.edu about meeting @2PM'
lst = re.findall(r'\S+@\S+', s)     # '\S+' = one or more non-white-space characters
print(lst)

s = 'A message from csev@umich.edu to cwen@iupui.edu about meeting @2PM'
# '(?=...)' is a lookahead: it requires ... to match next, but does not
# include it in the result. Here: two words that are immediately followed by '@'.
lst = re.findall(r'\w+\s\w+(?=@)', s)
print(lst)

In [None]:
# 11.3 Combining Searching and Extracting

# Search for lines that start with 'X' followed by any non
# whitespace characters and ':'
# followed by a space and any number.
# The number can include a decimal.
import re

# The with-statement closes the file once the loop is done, even on error.
with open('text/mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()
        if re.search(r'^X\S*: [0-9.]+', line):    # raw string keeps '\S' intact for the regex engine
            print(line)
        

In [None]:
# 11.3 (cont. #2)

# Search for lines that start with 'X' followed by any
# non whitespace characters and ':' followed by a space
# and any number. The number can include a decimal.
# For every line that matches, print the list of captured numbers
# (the parentheses make findall return only the number part).
import re

# The with-statement closes the file once the loop is done, even on error.
with open('text/mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()
        x = re.findall(r'^X\S*: ([0-9.]+)', line)
        if x:    # findall returns an empty list when the line does not match
            print(x)

In [None]:
# 11.3 (cont. #3)

# Search for lines that start with 'Details:' and contain 'rev='
# followed by digits and '.'
# For every line that matches, print the list with the captured number.
import re

# The with-statement closes the file once the loop is done, even on error.
with open('text/mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()
        x = re.findall(r'^Details:.*rev=([0-9.]+)', line)
        if x:    # findall returns an empty list when the line does not match
            print(x)

In [None]:
# 11.3 (cont. #4)

# Search for lines that start with 'From ' followed by any characters,
# a space, and a two digit number between 00 and 99 followed by ':'
# For every line that matches, print the list with the captured two digits.
import re

# The with-statement closes the file once the loop is done, even on error.
with open('text/mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()
        x = re.findall(r'^From .* ([0-9][0-9]):', line)
        if x:    # findall returns an empty list when the line does not match
            print(x)
        

In [None]:
# 11.4 Escape Character
import re
x = 'We just received $10.00 for cookies.'
# '\$' matches a literal dollar sign (an unescaped '$' would anchor the end
# of the string). The raw string passes the backslash through to the regex engine.
y = re.findall(r'\$[0-9.]+', x)
print(y)

In [None]:
# 11.5 Summary



In [None]:
# 11.6 Bonus Section for Unix / Linux users

# This section is optional