## 2. Regular Expressions
### 2.1.1 Basic Regular Expression Patterns

In [2]:
import re
def print_regex_result(match, text):
    if match:
        highlighted_text = text[:match.start()] + "\033[1;31m" + match.group() + "\033[0m" + text[match.end():]
        print(highlighted_text)
    else:
        print(text)

In [146]:
# Figure 2.1
text = "interesting links to woodchucks and lemurs"
pattern = "woodchucks"

# Use re to search for the pattern in the text
match = re.search(pattern,text)
print_regex_result(match,text)

interesting links to [1;31mwoodchucks[0m and lemurs


In [147]:
texts = ["Mary Ann stopped by Mona’s", "“You’ve left the burglar behind again!” said Nori"]
patterns = ["a", "!"]

for text, pattern in zip(texts, patterns):
    match = re.search(pattern,text)
    print_regex_result(match,text)

M[1;31ma[0mry Ann stopped by Mona’s
“You’ve left the burglar behind again[1;31m![0m” said Nori


In [148]:
# Figure 2.2
text = "Woodchuck and woodchuck are both mentioned. woodchuck again."

pattern = r"[wW]oodchuck"

match = re.search(pattern,text)
print_regex_result(match,text)

[1;31mWoodchuck[0m and woodchuck are both mentioned. woodchuck again.


In [149]:
texts = ["In uomini, in soldati", "plenty of 7 to 5"]
patterns = ["[abc]", "[1234567890]"]

for text, pattern in zip(texts, patterns):
    match = re.search(pattern,text)
    print_regex_result(match,text)

In uomini, in sold[1;31ma[0mti
plenty of [1;31m7[0m to 5


In [150]:
# Figure 2.3
texts = ["we should call it ‘Drenched Blossoms’ ", "my beans were impatient to be hoed!","Chapter 1: Down the Rabbit Hole"]
patterns = ["[A-Z]", "[a-z]","[0-9]"]

for text, pattern in zip(texts, patterns):
    match = re.search(pattern,text)
    print_regex_result(match,text)

we should call it ‘[1;31mD[0mrenched Blossoms’ 
[1;31mm[0my beans were impatient to be hoed!
Chapter [1;31m1[0m: Down the Rabbit Hole


In [151]:
# Figure 2.4
texts = ["Oyfn pripetchik", "I have no exquisite reason for’t","our resident Djinn"]
patterns = ["[^A-Z]", "[^Ss]","[^.]"]

for text, pattern in zip(texts, patterns):
    match = re.search(pattern,text)
    print_regex_result(match,text)

O[1;31my[0mfn pripetchik
[1;31mI[0m have no exquisite reason for’t
[1;31mo[0mur resident Djinn


In [152]:
# Figure 2.4
texts = ["look up ^ now","look up a^b now"]
patterns = ["[e^]",r"a\^b"]#r"a\^b" is the raw string representation of "a\^b"

for text, pattern in zip(texts, patterns):
    match = re.search(pattern,text)
    print_regex_result(match,text)

look up [1;31m^[0m now
look up [1;31ma^b[0m now


In [153]:
# Figure 2.5
texts = ["woodchuck","color"]
patterns = ["woodchucks?","colou?r"]

for text, pattern in zip(texts, patterns):
    match = re.search(pattern,text)
    print_regex_result(match,text)

[1;31mwoodchuck[0m
[1;31mcolor[0m


In [154]:
text = "baaaa"
pattern = ["ba","ba*"]

for p in pattern:
    match = re.search(p,text)
    print_regex_result(match,text)


[1;31mba[0maaa
[1;31mbaaaa[0m


In [155]:
texts = "9999"
pattern = ["[0-9]","[0-9]*"]


for p in pattern:
    match = re.search(p,text)
    print_regex_result(match,text)


baaaa
[1;31m[0mbaaaa


In [156]:
#Figure 2.6
texts = ["begin","began","begun"]
pattern = "beg.n"


for t in texts:
    match = re.search(pattern,t)
    print_regex_result(match,text)

[1;31mbegin[0m
[1;31mbegan[0m
[1;31mbegun[0m


In [157]:
texts = [
    "There are 99 bottles of beer on the wall",
    "There are 299 bottles of beer on the wall",
    "There are $99 bottles of beer on the wall"
]
pattern = r"\b99\b"

for t in texts:
    match = re.search(pattern, t)
    print_regex_result(match, t)


There are [1;31m99[0m bottles of beer on the wall
There are 299 bottles of beer on the wall
There are $[1;31m99[0m bottles of beer on the wall


In [158]:
texts = ["There are 99 bottles of beer on the wall","There are 299 bottles of beer on the wall","There are $99 bottles of beer on the wall"]
pattern = [r"\b99\b"]
print(r'\b99\b')
for t in texts:
        match = re.search(p,t)
        print_regex_result(match,t)

\b99\b
[1;31m[0mThere are 99 bottles of beer on the wall
[1;31m[0mThere are 299 bottles of beer on the wall
[1;31m[0mThere are $99 bottles of beer on the wall


## 2.1.2 Disjunction, Grouping, and Precedence

In [159]:
# Disjunction
text = "I have a cat and a dog"
pattern = "cat|dog"
match = re.search(pattern,text)
print_regex_result(match,text)

I have a [1;31mcat[0m and a dog


In [160]:
#precedence
text = "I have guppies"
pattern ="gupp(y|ies)"
match = re.search(pattern,text)
print_regex_result(match,text)

I have [1;31mguppies[0m


In [161]:
text = "Column 1 Column 2 Column 3"
pattern =["Column [0-9]+ *","(Column [0-9]+ *)*"]
for p in pattern:
    match = re.search(p,text)
    print_regex_result(match,text)


[1;31mColumn 1 [0mColumn 2 Column 3
[1;31mColumn 1 Column 2 Column 3[0m


In [162]:
# operator precedence
text= ["the","thethethe","theee"]
pattern="the*"
print("counters * has higher operator precedence than sequence:")
print("pattern:",pattern)
for t in text:
    match = re.search(pattern,t)
    print_regex_result(match,t)
text=["the","any","thany","theny"]
pattern="the|any"
print("sequence has higher operator precedence than disjunction:")
print("pattern:",pattern)
for t in text:
    match = re.search(pattern,t)
    print_regex_result(match,t)


counters * has higher operator precedence than sequence:
pattern: the*
[1;31mthe[0m
[1;31mthe[0mthethe
[1;31mtheee[0m
sequence has higher operator precedence than disjunction:
pattern: the|any
[1;31mthe[0m
[1;31many[0m
th[1;31many[0m
[1;31mthe[0mny


## 2.1.3 A simple example

In [163]:
# How to find the word "the" in a text?
text = ["the","The","other","theft","the,","the2"]
pattern="(^|[^a-zA-Z])[tT]he([^a-zA-Z]|$)"
for t in text:
    match = re.search(pattern,t)
    print_regex_result(match,t)

[1;31mthe[0m
[1;31mThe[0m
other
theft
[1;31mthe,[0m
[1;31mthe2[0m


## 2.1.4 More Operators

In [164]:
texts = ["number5","Blue","Red","!!!","in red","inch"]
patterns = [r"\d", r"\D",r"\w",r"\W",r"\s",r"\S"]

for text, pattern in zip(texts, patterns):
    match = re.search(pattern,text)
    print_regex_result(match,text)

number[1;31m5[0m
[1;31mB[0mlue
[1;31mR[0med
[1;31m![0m!!
in[1;31m [0mred
[1;31mi[0mnch


In [165]:
def repeat_operations(op):
    print("operations",op)
    text = ["th","the","thee","theee"]
    pattern="the"+op
    for t in text:
        match = re.search(pattern,t)
        print_regex_result(match,t)
repeat_operations("*")
repeat_operations("+")
repeat_operations("?")
repeat_operations("{1}")
repeat_operations("{1,2}")
repeat_operations("{1,}")
repeat_operations("{,2}")

operations *
[1;31mth[0m
[1;31mthe[0m
[1;31mthee[0m
[1;31mtheee[0m
operations +
th
[1;31mthe[0m
[1;31mthee[0m
[1;31mtheee[0m
operations ?
[1;31mth[0m
[1;31mthe[0m
[1;31mthe[0me
[1;31mthe[0mee
operations {1}
th
[1;31mthe[0m
[1;31mthe[0me
[1;31mthe[0mee
operations {1,2}
th
[1;31mthe[0m
[1;31mthee[0m
[1;31mthee[0me
operations {1,}
th
[1;31mthe[0m
[1;31mthee[0m
[1;31mtheee[0m
operations {,2}
[1;31mth[0m
[1;31mthe[0m
[1;31mthee[0m
[1;31mthee[0me


In [166]:
#How to search for special characters?
text="?*."
patterns = [r"\?", r"\*",r"\."]
for pattern in patterns:
    match = re.search(pattern,text)
    print_regex_result(match,text)

[1;31m?[0m*.
?[1;31m*[0m.
?*[1;31m.[0m


## A More Complex Example

In [167]:
#find any machine with at least 6 GHz and 500 GB of disk space for less than $1000
text="6 GHz or 500 GB or Mac or $999.99"
patterns=[r'\b[0-9]+(\.[0-9]+)? *([gG][Hh][Zz])\b',r'\b[0-9]+(\.[0-9]+)? *(GB|[Gg]igabytes?)\b',r'(\$|[D|d]ollars?|USD)+[0-9]+(\.[0-9]+)?']
for pattern in patterns:
    match = re.search(pattern,text)
    print_regex_result(match,text)

[1;31m6 GHz[0m or 500 GB or Mac or $999.99
6 GHz or [1;31m500 GB[0m or Mac or $999.99
6 GHz or 500 GB or Mac or [1;31m$999.99[0m


## 2.1.6 Substitution, Capture Groups, and ELIZA

In [168]:
# substitution
text="color red"
print(re.sub(r"color","colour",text))


colour red


In [169]:
#use the number operator \1 in the second pattern to refer back
text="35 boxes"
print(re.sub(r"([0-9]+)",r"<\1>",text))

<35> boxes


In [7]:
#specify that a certain string or expression must occur twice in the text
text='the faster they were, the faster they will be'
pattern=r"the (.*)er they were, the \1er they will be"
match = re.search(pattern,text)
print_regex_result(match,text)

text="the faster they ran, the faster we ran but not the faster they ran, the faster we ate"
pattern=r"the (.*)er they (.*), the \1er we \2"
match = re.search(pattern,text)
print_regex_result(match,text)

[1;31mthe faster they were, the faster they will be[0m
[1;31mthe faster they ran, the faster we ran[0m but not the faster they ran, the faster we ate


In [12]:
# non-capturing group
text="some cats like some cats but not some cats like some some."
pattern=r"(?:some|a few) (people|cats) like some \1"
match = re.search(pattern,text)
print_regex_result(match,text)

[1;31msome cats like some cats[0m but not some cats like some some.


## 2.1.7 Lookahead Assertions

In [16]:
# at the beginning of a line, any single word that doesn’t start with “Volcano”
text="A Volcano is a volcano"
pattern="^(?!Volcano)[A-Za-z]+"
match = re.search(pattern,text)
print_regex_result(match,text)

[1;31mA[0m Volcano is a volcano


# 2.4 Simple Unix Tools for Word Tokenization

In [26]:
import subprocess
with open('sh.txt', 'r') as file:
    file_content = file.read()
result=subprocess.run(['tr', '-sc', 'A-Za-z', '\n'], input=file_content, text=True, capture_output=True)

print(result.stdout[:120])

THE
SONNETS
by
William
Shakespeare
From
fairest
creatures
we
desire
increase
That
thereby
beauty
s
rose
might
never
die



In [21]:
#Upper case words count
result = subprocess.run("tr -sc 'A-Za-z' '\n' < sh.txt | sort | uniq -c", shell=True, capture_output=True, text=True)
print(result.stdout[:95])

1945 A
  72 AARON
  19 ABBESS
   5 ABBOT
   8 ABERGAVENNY
  18 ABHORSON
   1 ABOUT
  88 ACHILLE


In [20]:
# In lower case
result = subprocess.run("tr -sc 'A-Za-z' '\\n' < sh.txt | tr A-Z a-z | sort | uniq -c", shell=True, capture_output=True, text=True)
print(result.stdout[:95])

14725 a
  97 aaron
   1 abaissiez
  10 abandon
   2 abandoned
   2 abase
   1 abash
  14 abate



In [23]:
#sort the words in descending order of frequency
result = subprocess.run("tr -sc 'A-Za-z' '\\n' < sh.txt | tr A-Z a-z | sort | uniq -c | sort -n -r", shell=True, capture_output=True, text=True)
print(result.stdout[:100])

27594 the
26735 and
22538 i
19771 to
18132 of
14725 a
13826 you
12489 my
11535 that
11112 in
9753 is


# Word Tokenization

In [2]:
import nltk
text = 'That U.S.A. poster-print costs $12.40...'
pattern = r'''(?x)(?:[A-Z]\.)+| \w+(?:-\w+)*| \$?\d+(?:\.\d+)?%?| \.\.\.| [][.,;"'?():_`-]'''
nltk.regexp_tokenize(text, pattern)

ModuleNotFoundError: No module named 'nltk'

In [None]:
# Tokenizor of Chinese and English