## Chapter 2 - Strings and Text

### 2.1) Splitting a string on multiple delimiters

In [1]:
import re
line = 'asdf   fjdk; afed, fjek, asdf,       foo'
line

'asdf   fjdk; afed, fjek, asdf,       foo'

In [37]:
line.split(",") # no doesn't work

['asdf   fjdk; afed', ' fjek', ' asdf', '       foo']

In [38]:
re.split(r'[,]',line)

['asdf   fjdk; afed', ' fjek', ' asdf', '       foo']

In [41]:
# split on a comma, semicolon, or any single whitespace character
re.split(r'[,;\s]',line)

['asdf',
 '',
 '',
 'fjdk',
 '',
 'afed',
 '',
 'fjek',
 '',
 'asdf',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'foo']

In [42]:
# split on a comma, semicolon, or whitespace character, followed by any amount of extra whitespace
re.split(r'[,;\s]\s*', line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [44]:
# capture groups- watch the parenthesis
fields = re.split(r'(,|;|\s)\s*', line)

In [45]:
values = fields[::2]

In [46]:
values

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [48]:
delimeters = fields[1::2] + ['']

In [49]:
delimeters

[' ', ';', ',', ',', ',', '']

In [50]:
",".join(v+d for v,d in zip(values, delimeters))

'asdf ,fjdk;,afed,,fjek,,asdf,,foo'

In [51]:
re.split(r'(?:,|;|\s)\s*', line) # don't capture it as a group

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

### some practice

### match pattern- re compiles the pattern and converts it to bytecode

In [70]:
pattern = re.compile(r'\bfoo\b')  # matches only foo in a sentence, not food, foor, just 'foo'
pattern.match("foo bar is a lazy food, but hey, its a foo foo foo out of here")

<_sre.SRE_Match object; span=(0, 3), match='foo'>

In [79]:
pattern = re.compile(r'<HTML>')
pattern.match(" <HTML>") # empty object

In [78]:
pattern.match("  <HTML>",2)

<_sre.SRE_Match object; span=(2, 8), match='<HTML>'>

In [99]:
pattern = re.compile(r'^<HTML>')
pattern.match("<HTML>")

<_sre.SRE_Match object; span=(0, 6), match='<HTML>'>

In [90]:
pattern.match("  <HTML>"[2:])

<_sre.SRE_Match object; span=(0, 6), match='<HTML>'>

In [106]:
pattern.match("<HTML>",0)

<_sre.SRE_Match object; span=(0, 6), match='<HTML>'>

In [95]:
pattern = re.compile(r'<HTML>')
pattern.match("<HTML>")

<_sre.SRE_Match object; span=(0, 6), match='<HTML>'>

In [96]:
pattern.match("<HTML>", 0, 2)

In [111]:
pattern = re.compile(r'<HTML>$')
pattern.match("<HTML>  ",0,6)

<_sre.SRE_Match object; span=(0, 6), match='<HTML>'>

In [112]:
pattern.match("<HTML>  "[:6])

<_sre.SRE_Match object; span=(0, 6), match='<HTML>'>

In [123]:
pattern = re.compile(r'world')
pattern.match("hello world") # none since match only looks for pattern at beginning of string

In [124]:
pattern.search("hello world")

<_sre.SRE_Match object; span=(6, 11), match='world'>

In [117]:
pattern.search("hola mundo")

In [125]:
pattern.search("  world")

<_sre.SRE_Match object; span=(2, 7), match='world'>

In [126]:
pattern = re.compile(r'^world')
pattern.search("world")

<_sre.SRE_Match object; span=(0, 5), match='world'>

In [129]:
pattern.search(" world") # without re.MULTILINE, ^ anchors only at the very start of the string, hence no match here
# since there is a space at the beginning


In [132]:
pattern = re.compile(r'^<HTML>', re.MULTILINE)

In [133]:
pattern.search("<HTML>") # works because its at beginning of string

<_sre.SRE_Match object; span=(0, 6), match='<HTML>'>

In [134]:
pattern.search(" <HTML>")

In [135]:
pattern.search("  \n<HTML>") # right after new line due to multiline

<_sre.SRE_Match object; span=(3, 9), match='<HTML>'>

In [139]:
pattern.search("abcd<lala>\n<HTML>")

<_sre.SRE_Match object; span=(11, 17), match='<HTML>'>

### 2.2) Matching text at the start or end of a string

In [140]:
filename = 'spam.txt'
filename.endswith(".txt")

True

In [141]:
filename.startswith("file:")

False

In [143]:
# endswith accepts a tuple to test several suffixes at once. Each suffix keeps
# its leading dot so that names merely ending in "py" (e.g. "happy") don't match.
filename = 'spam.py'
extensions = ('.txt', '.py')
filename.endswith(extensions)

True

In [145]:
# we can also do this with regex
# NOTE: square brackets form a character class, so this pattern matches any ONE
# of the characters h, t, p, s, f, x or | -- not the words http/https/ftp/txt.
# That is why the match is a single character. Alternation between whole words
# needs a group instead, e.g. r'(?:http|https|ftp):'.
pattern = re.compile(r'[http|https|ftp|txt]')
url = 'https://www.python.org'
pattern.match(url)

<_sre.SRE_Match object; span=(0, 1), match='h'>

In [149]:
import os
# Report whether the current directory holds any entry ending in .git or .py.
if any(name.endswith(('.git', '.py')) for name in os.listdir('.')):
    print("found")

found


### 2.3) Matching strings using shell wildcard patterns

In [199]:
pattern = re.compile("[hH]ello")
texts = "Hello there, my name is raj, as you can see hello, and Hello hello, are two different Hello".split(",")

In [202]:
count = 0
for line in texts:
    if pattern.search(line):
        count += 1


In [203]:
count

4

In [204]:
texts

['Hello there',
 ' my name is raj',
 ' as you can see hello',
 ' and Hello hello',
 ' are two different Hello']

In [13]:
def found(pattern, List):
    """Print "found" or "not found" for every string in List.

    pattern is a compiled regex; a string counts as found when
    pattern.search locates a match anywhere inside it.
    """
    for candidate in List:
        print("found" if pattern.search(candidate) else "not found")

In [14]:
pattern = re.compile("licen[sc]e")
text = ['english license', 'american licence']
found(pattern, text)

found
found


In [15]:
# Raw string: \D (any non-digit) is a regex escape, not a Python string escape.
pattern = re.compile(r"\D")
found(pattern, text)

found
found


In [16]:
texts = ["licence: yes", "licence: no"]
# NOTE: | has the lowest precedence, so this reads as "licence: yes" OR "no";
# the "licence: " prefix belongs to the first alternative only. To alternate
# just the answer, group it: "licence: (yes|no)".
pattern = re.compile("licence: yes|no")
found(pattern, texts)

found
found


In [17]:
pattern = re.compile("no")
found(pattern, texts)

not found
found


In [19]:
# NOTE: [yesno] is a character class: after "licence: " it matches exactly one
# character out of y, e, s, n, o -- not the words "yes" or "no".
pattern = re.compile("licence: [yesno]")
found(pattern, texts)

found
found


In [22]:
found(re.compile('licence: (yes|no)'), texts)

found
found


In [53]:
pattern = re.compile("colou+r")
pattern.search("colouuuuuuuur")


<_sre.SRE_Match object; span=(0, 13), match='colouuuuuuuur'>

In [77]:
pattern = re.compile("cars{6,}?")
pattern.search("carssssssss")

<_sre.SRE_Match object; span=(0, 9), match='carssssss'>

In [80]:
pattern = re.compile("hello")
x =pattern.search("uehellos")

In [91]:
"uehellos"[x.start(): x.end()]

'hello'

In [98]:
pattern = re.compile(r"\bhello\b")
pattern.search("uehellos")

In [100]:
pattern.search("hello there")

<_sre.SRE_Match object; span=(0, 5), match='hello'>

In [102]:
line

'asdf   fjdk; afed, fjek, asdf,       foo'

In [149]:
# Raw string so \s reaches the regex engine without relying on Python leaving an
# unrecognized string escape untouched. Delimiter = one of , ; or whitespace,
# followed by at least one more whitespace character.
pattern = re.compile(r"[,;\s]\s+")

In [150]:
pattern.split(line)

['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [158]:
"\\the" == r"\the"

True

In [159]:
print(r"\the")

\the


In [160]:
print("\\the")

\the


In [163]:
pattern = re.compile(r"\\ten")

In [166]:
pattern.search("\\ten")

<_sre.SRE_Match object; span=(0, 4), match='\\ten'>

In [168]:
pattern = re.compile(r'\w+')
pattern.findall("hello world")

['hello', 'world']

In [182]:
pattern = re.compile(r'a')
pattern.findall("aba")

['a', 'a']

In [192]:
pattern = re.compile(r'a?')
pattern.findall("aba")

['a', '', 'a', '']

In [198]:
pattern = re.compile(r'(\w+) (\w+)')
pattern.findall("hello world hola mundo hello nonsense maggie well")

[('hello', 'world'),
 ('hola', 'mundo'),
 ('hello', 'nonsense'),
 ('maggie', 'well')]

In [199]:
line

'asdf   fjdk; afed, fjek, asdf,       foo'

In [213]:
pattern = re.compile(r'\w+')
pattern.findall("hello world")

['hello', 'world']

In [219]:
pattern = re.compile(r'\s')
pattern.split("hello world")

['hello', 'world']

In [220]:
# Raw string (r''), not an f-string: \W+ splits on runs of non-word characters.
pattern = re.compile(r'\W+')
pattern.split("hello world")

['hello', 'world']

In [233]:
re.split(r'(\W)', 'hello world')

['hello', ' ', 'world']

In [237]:
pattern = re.compile(r'(\w+) (\w+)')
it = pattern.finditer("hello world hola mundo hello nonsense maggie well")

In [239]:
match = next(it)

In [241]:
match.groups()

('hello', 'world')

In [250]:
pattern = re.compile(r'[0-9]+')
pattern.sub("-", 'order0, order1, order13')

'order-, order-, order-'

In [251]:
re.sub("00","-", 'order00000')

'order--0'

In [276]:
pattern = re.compile(r'([-|A-Z])')
pattern.findall('-1234 A193 B123 A1234 B193 B123')

['-', 'A', 'B', 'A', 'B', 'B']

In [277]:
def normalize_numbers(matchgroup):
    """Replacement callback for re.sub: 'A' when group 1 is '-', otherwise 'B'."""
    return 'A' if matchgroup.group(1) == '-' else 'B'

re.sub(r'([-|A-Z])', normalize_numbers, '-1234 A193 B123 A1234 B193 B123')

'A1234 B193 B123 B1234 B193 B123'

In [278]:
pattern.search('-1234 A193 B123 A1234 B193 B123').group(1)

'-'

In [289]:
text = 'imagine a new *world*, a magic *world*'
pattern = re.compile(r'\*(\w*?)\*')
pattern.sub("b",text)

'imagine a new b, a magic b'

In [304]:
pattern = re.compile(r'(?P<first>\w+) (?P<second>\w+)')
match = pattern.search("hello world")

In [294]:
match.group('first')

'hello'

In [295]:
match.group("first", 0, 'second')

('hello', 'hello world', 'world')

In [296]:
match.groups()

('hello', 'world')

In [298]:
pattern.search("hello world").groupdict()

{'first': 'hello', 'second': 'world'}

In [301]:
match.start(1)

0

In [307]:
match.end(1)

5

In [306]:
match.group(2)

'world'

In [313]:
re.findall(re.escape("^"), "^like^")

['^', '^']

In [314]:
re.findall(r'\^', "^like^")

['^', '^']

In [322]:
pattern = re.compile(r"([a-z]+)",re.I)
pattern.search("felix")

<_sre.SRE_Match object; span=(0, 5), match='felix'>

In [323]:
if pattern.search("Felix"):
    print("found")

found


In [327]:
pattern = re.compile(r"^\w+\: (\w+/\w+/\w+)")
text = "date: 2012/12/05 \ndate: 2014/12/06"
pattern.findall("date: 2012/12/05 \ndate: 2014/12/06")

['2012/12/05']

In [328]:
pattern = re.compile(r"^\w+\: (\w+/\w+/\w+)", re.M)
pattern.findall(text)

['2012/12/05', '2014/12/06']

In [335]:
chars = "".join(chr(i) for i in range(256))

In [380]:
" ".join(re.findall(r"\w", chars))

'0 1 2 3 4 5 6 7 8 9 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z ª ² ³ µ ¹ º ¼ ½ ¾ À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø ù ú û ü ý þ ÿ'

In [372]:
match = re.search(r"\w", "hello there")

In [378]:
chars

'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0¡¢£¤¥¦§¨©ª«¬\xad®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'

In [381]:
import locale

In [382]:
locale.setlocale(locale.LC_ALL, '')

'en_US.UTF-8'

In [383]:
" ".join(re.findall(r'\w', chars, re.LOCALE))

ValueError: cannot use LOCALE flag with a str pattern

## matching strings using shell wildcard patterns

In [384]:
from fnmatch import fnmatch, fnmatchcase
fnmatch("foo.txt", "*.txt")

True

In [388]:
fnmatch("foo.txt", "?oo.txt")

True

In [389]:
fnmatch("Dat45.csv", "Dat[0-9]*")

True

In [394]:
names = ['Dat1.csv', 'Dat2.csv', 'config.ini', 'foo.py']
[name for name in names if fnmatch(name, "Dat*.csv")]

['Dat1.csv', 'Dat2.csv']

In [395]:
[name for name in names if name.endswith(".csv")]

['Dat1.csv', 'Dat2.csv']

In [398]:
addresses = [
    '5412 N CLARK ST', 
    '1030 w addison ST', 
    '1039 w granville ave', 
    '2122 n clark ST', 
    '4802 n broadway',
]
from fnmatch import fnmatchcase
[addr for addr in addresses if fnmatchcase(addr, '* ST')]

['5412 N CLARK ST', '1030 w addison ST', '2122 n clark ST']

In [405]:
[addr for addr in addresses if fnmatchcase(addr, '5[0-9]*')]

['5412 N CLARK ST']

## 2.4) Matching and searching for text patterns

In [406]:
text = 'yeah, but no, but yea, but no, but yea'
text == 'yeah'

False

In [407]:
text.startswith("yeah")

True

In [408]:
'yeah' in text

True

In [410]:
text.find("no")

10

In [412]:
re.search(r'yeah', text)

<_sre.SRE_Match object; span=(0, 4), match='yeah'>

In [413]:
text = "11/27/2012"
text2 = 'Nov 27, 2012'
if re.match(r"\d+/\d+/\d+", text):
    print("yes")

yes


In [414]:
pattern = re.compile(r"\d+/\d+/\d+")
if pattern.match(text):
    print("yes")
else:
    print("no")

yes


In [415]:
text = 'today is 11/27/2012.  Pycon starts 3/13/2014'
pattern.findall(text)

['11/27/2012', '3/13/2014']

In [425]:
pattern = re.compile(r"(\d+)/(\d+)/(\d+)")
m = pattern.findall(text)

In [426]:
m

[('11', '27', '2012'), ('3', '13', '2014')]

In [427]:
for month, day, year in pattern.findall(text):
    print("{}-{}-{}".format(year, month, day))

2012-11-27
2014-3-13


In [430]:
for m in pattern.finditer(text):
    print("{}-{}-{}".format(*m.groups()))

11-27-2012
3-13-2014


In [434]:
pattern = re.compile(r"(\d+)/(\d+)/(\d+)$")
pattern.match("02/05/2012vvus"), pattern.match("02/05/2012")

(None, <_sre.SRE_Match object; span=(0, 10), match='02/05/2012'>)

## 2.5 Searching and replacing text

In [436]:
text = 'yeah, but no, but yeah, but no, but yeah'
text.replace("yeah", "yep")

'yep, but no, but yep, but no, but yep'

In [438]:
pattern = re.compile("yeah")
pattern.sub("yep", text)

'yep, but no, but yep, but no, but yep'

In [440]:
text = 'today is 11/27/2012.  pycon starts 03/13/2017'
pattern = re.compile(r"(\d+)/(\d+)/(\d+)")
pattern.sub(r"\3-\1-\2", text) # capture group numbers in pattern

'today is 2012-11-27.  pycon starts 2017-03-13'

In [441]:
from calendar import month_abbr
def change_date(m):
    """Replacement callback: turn a month/day/year match into 'day Mon year'.

    Group 1 is read as the month number and looked up in calendar.month_abbr;
    groups 2 and 3 are emitted unchanged.
    """
    abbreviation = month_abbr[int(m.group(1))]
    return f'{m.group(2)} {abbreviation} {m.group(3)}'
pattern.sub(change_date, text)

'today is 27 Nov 2012.  pycon starts 13 Mar 2017'

## 2.6 Searching and replacing case-insensitive text

In [466]:
text = "UPPER PYTHON, lower python, Mixed Python"
re.findall("python", text, flags=re.I)

['PYTHON', 'python', 'Python']

In [445]:
re.sub("python", "snake", text, flags=re.I)

'UPPER snake, lower snake, Mixed snake'

In [447]:
def matchcase(word):
    """Build an re.sub callback that returns word re-cased like the matched text.

    All-upper matches give word.upper(), all-lower matches give word.lower(),
    matches starting with an upper-case letter give word.capitalize(), and
    anything else gives word unchanged.
    """
    def replace(m):
        matched = m.group()
        if matched.isupper():
            return word.upper()
        if matched.islower():
            return word.lower()
        # Mixed case: only the first character decides between capitalized and as-is.
        return word.capitalize() if matched[0].isupper() else word
    return replace

In [467]:
re.sub("python", matchcase("snake"), text, flags=re.I)


'UPPER SNAKE, lower snake, Mixed Snake'

## 2.7) Specifying a regular expression for the shortest match

In [489]:
text2 = 'Computer says "no. " Phone says "yes."'
pattern = re.compile(r'\"(.*)\"')
pattern.findall(text2)  # .* is greedy: it runs from the first quote to the LAST quote, so both quoted phrases come back as one match

['no. " Phone says "yes.']

In [491]:
re.findall(r'\"(.*?)\"', text2) # question mark makes the match short

['no. ', 'yes.']

# 2.8 Writing a regular expression for multiline patterns


In [512]:
text1 = '/* this is a comment */'
text2 = '''/* this is a */
             /* multiline comment */
        '''

In [517]:
pattern = re.compile(r'^/\*(.*?)\*/')
pattern.findall(text1)

[' this is a comment ']

In [518]:
pattern.findall(text2)

[' this is a ']

In [519]:
# Without a leading ^ the pattern may match anywhere in the text, which is why
# both comments are found. re.M only changes how ^ and $ behave and this pattern
# uses neither; each comment in text2 also sits on a single line, so '.' never
# has to cross a newline.
pattern = re.compile(r'/\*(.*?)\*/', re.M)
pattern.findall(text2)

[' this is a ', ' multiline comment ']

In [520]:
pattern = re.compile(r'/\*((?:.|\n)*?)\*/')
pattern.findall(text2)

[' this is a ', ' multiline comment ']

## 2.9 Normalizing unicode text to a standard representation


In [522]:
s1 = "Spicy Jalape\u00f1o"
s2 = "Spicy Jalapen\u0303o"
s1

'Spicy Jalapeño'

In [523]:
s2

'Spicy Jalapeño'

In [524]:
s1 == s2

False

In [526]:
import unicodedata
t1 = unicodedata.normalize('NFC', s1) #characters fully composed
t2 = unicodedata.normalize('NFC', s2)

In [531]:
t3 = unicodedata.normalize("NFD", s1)
t4 = unicodedata.normalize('NFD', s2)

In [532]:
t2

'Spicy Jalapeño'

In [533]:
t1 == t2

True

In [534]:
print(ascii(t1))

'Spicy Jalape\xf1o'


In [535]:
t3==t4

True

In [536]:
# t1 is NFC-normalized, so the ñ is one precomposed character and nothing is
# filtered out here; removing the accent needs a decomposed (NFD) string such as t3.
"".join(c for c in t1 if not unicodedata.combining(c))

'Spicy Jalapeño'

In [537]:
t1

'Spicy Jalapeño'

## 2.10 working with unicode characters in regular expressions

In [540]:
num = re.compile(r'\d+')
num.match("123 hello 56")

<_sre.SRE_Match object; span=(0, 3), match='123'>

In [541]:
num.findall("123 hello 56")

['123', '56']

In [542]:
num.match("\u0661\u0661\u0663")

<_sre.SRE_Match object; span=(0, 3), match='١١٣'>

## 2.11 stripping unwanted characters from string

In [544]:
s = "       hello      world  \n"
s

'       hello      world  \n'

In [545]:
s.strip()

'hello      world'

In [546]:
s.lstrip()

'hello      world  \n'

In [547]:
s.rstrip()

'       hello      world'

In [564]:
# to strip unwanted middle space
pattern = re.compile(r'\s+')
pattern.sub(" ", s.strip())

'hello world'

In [560]:
" ".join(re.findall(r"\w+", s))


'hello world'

## 2.12 Sanitizing and cleaning up text


In [566]:
s = 'python\fis\tawesome\r\n'
s

'python\x0cis\tawesome\r\n'

In [568]:
remap = {
    ord('\t'): ' ', 
    ord('\f'): ' ', 
    ord('\r'): None
}
s.translate(remap)

'python is awesome\n'

In [569]:
def clean_spaces(s):
    """Return s with every carriage return, tab and form feed replaced by a space."""
    for control_char in ("\r", "\t", "\f"):
        s = s.replace(control_char, " ")
    return s

In [570]:
clean_spaces(s)

'python is awesome \n'

# 2.13) Aligning text strings


In [572]:
text = "hello world"
text.ljust(20)

'hello world         '

In [574]:
text.rjust(20)

'         hello world'

In [575]:
text.center(20)

'    hello world     '

In [576]:
text.rjust(20, '=')



In [577]:
text.center(20, "*")

'****hello world*****'

In [578]:
format(text, '>20')

'         hello world'

In [579]:
format(text, '<20')

'hello world         '

In [580]:
format(text, '^20')

'    hello world     '

In [587]:
format(text, 'u>20')

'uuuuuuuuuhello world'

In [588]:
format(text, '*^20')

'****hello world*****'

In [591]:
'{:u>20} {:*>10s}'.format('hello', 'world')

'uuuuuuuuuuuuuuuhello *****world'

In [592]:
x = 1.234
format(x, '>10')

'     1.234'

In [593]:
format(x, '^20')

'       1.234        '

In [596]:
format(x, '^10.2f')

'   1.23   '

In [597]:
'%-20s' % text

'hello world         '

## 2.15 Interpolating variables in strings

In [599]:
s = '{name} has {n} messages'
s.format(name='Guido', n=44)

'Guido has 44 messages'

In [600]:
name = 'Guido'
n = 37
s.format_map(vars())

'Guido has 37 messages'

In [601]:
class Info:
    """Plain record holding a name and a count n, used via vars() for format_map."""

    def __init__(self, name, n):
        self.name, self.n = name, n
a = Info('guido', 56)
s.format_map(vars(a))

'guido has 56 messages'

In [602]:
s.format(name='guido')

KeyError: 'n'

In [603]:
class safesub(dict):
    """dict that answers a missing key with the key wrapped in braces,
    so str.format_map leaves unknown placeholders in the text."""

    def __missing__(self, key):
        return "".join(("{", key, "}"))
del n


In [604]:
s.format_map(safesub(vars()))

'Guido has {n} messages'

In [605]:
import sys
def sub(text):
    """Fill text's {placeholders} from the caller's local variables.

    Looks one frame up the call stack with sys._getframe(1); names the caller
    does not define are left in place as {name} by safesub.
    """
    caller_locals = sys._getframe(1).f_locals
    return text.format_map(safesub(caller_locals))
name = 'Guido'
n = 37
print(sub("Hello {name}"))

Hello Guido


In [606]:
print(sub("you have {n} messages"))

you have 37 messages


In [608]:
print(sub("your favorite color is {color}"))

your favorite color is {color}


In [609]:
print(sub("but  you have {n} birthdays"))

but  you have 37 birthdays


In [610]:
import string
s = string.Template('$name has $n messages')
s.substitute(vars())

'Guido has 37 messages'

## 2.18 tokenizing text

In [612]:
# suppose we have a string that you want to parse left to right into a stream of tokens

In [631]:
text = 'foo = 23 + 42 * 10'
tokens = [("NAME", 'foo'), ("EQ", '='), ("NUM", '23'), ("PLUS", '+'), 
         ('NUM', '42'), ('TIMES', '*'), ('NUM', '10')]

In [704]:
# One named group per token type; m.lastgroup later reports which one matched.
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'  # was \+, which duplicated PLUS and never matched '*'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'
# Alternatives are tried left to right, so order matters when patterns overlap.
master_pattern = re.compile("|".join([NAME, NUM, PLUS, TIMES, EQ, WS]))

In [691]:
scanner = master_pattern.scanner("foo = 42")

In [693]:
for m in iter(scanner.match, None):
    print(m)

<_sre.SRE_Match object; span=(0, 3), match='foo'>
<_sre.SRE_Match object; span=(3, 4), match=' '>
<_sre.SRE_Match object; span=(4, 5), match='='>
<_sre.SRE_Match object; span=(5, 6), match=' '>
<_sre.SRE_Match object; span=(6, 8), match='42'>


In [653]:
_.lastgroup, _.group()

('NAME', 'foo')

In [654]:
scanner.match()

<_sre.SRE_Match object; span=(3, 4), match=' '>

In [655]:
_.lastgroup, _.group()

('WS', ' ')

In [656]:
scanner.match()

<_sre.SRE_Match object; span=(4, 5), match='='>

In [657]:
_.lastgroup, _.group()

('EQ', '=')

In [658]:
scanner.match()

<_sre.SRE_Match object; span=(5, 6), match=' '>

In [659]:
_.lastgroup, _.group()

('WS', ' ')

In [660]:
scanner.match()

<_sre.SRE_Match object; span=(6, 8), match='42'>

In [661]:
_.lastgroup, _.group()

('NUM', '42')

In [675]:
from collections import namedtuple
Token = namedtuple("Token", ['type', 'value'])

def generate_tokens(pat, text):
    """Yield a Token(type, value) for each successive match of pat in text.

    type is the name of the group that matched (m.lastgroup) and value is the
    matched text. Scanning stops at the first position where pat does not match.
    """
    next_match = pat.scanner(text).match
    while True:
        m = next_match()
        if m is None:
            return
        yield Token(m.lastgroup, m.group())

for tok in generate_tokens(master_pattern, 'foo = 42 + 100'):
    print(tok)

Token(type='NAME', value='foo')
Token(type='WS', value=' ')
Token(type='EQ', value='=')
Token(type='WS', value=' ')
Token(type='NUM', value='42')
Token(type='WS', value=' ')
Token(type='PLUS', value='+')
Token(type='WS', value=' ')
Token(type='NUM', value='100')


In [711]:
toks = generate_tokens(master_pattern, 'foo = 42 + 100')
t = None
val = None
t,val = val, next(toks)

In [712]:
t,val

(None, Token(type='NAME', value='foo'))

In [None]:

NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
TIMES = r'(?P<TIMES>\*)'  # was \+, which duplicated PLUS and never matched '*'
EQ = r'(?P<EQ>=)'
WS = r'(?P<WS>\s+)'

# 2.19 Writing a Recursive Parser

In [769]:
# Token specification: one named group per token type
NUM = r'(?P<NUM>\d+)'
PLUS = r'(?P<PLUS>\+)'
MINUS = r'(?P<MINUS>-)'
TIMES = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS = r'(?P<WS>\s+)'
_TOKEN_SPECS = (NUM, PLUS, MINUS, TIMES, DIVIDE, LPAREN, RPAREN, WS)
master_pattern = re.compile("|".join(_TOKEN_SPECS))

# Tokenizer
Token = namedtuple("Token", ['type', 'value'])


def generate_tokens(pat, text):
    """Yield Token(type, value) for each match of pat in text, skipping whitespace.

    Scanning stops at the first position where pat does not match.
    """
    next_match = pat.scanner(text).match
    while True:
        m = next_match()
        if m is None:
            return
        tok = Token(m.lastgroup, m.group())
        if tok.type == 'WS':
            continue
        yield tok

# Parser
class ExpressionEvaluator:
    """Recursive-descent evaluator for integer arithmetic expressions.

    Grammar:
        expression ::= term { ('+' | '-') term }*
        term       ::= factor { ('*' | '/') factor }*
        factor     ::= NUM | '(' expression ')'

    One token of lookahead is kept: ``self.tok`` is the token most recently
    consumed and ``self.nexttok`` is the next one waiting to be consumed.
    """

    def __init__(self, pattern):
        """Remember the compiled token pattern and create the Token record type.

        pattern: compiled regex whose named groups are the token types this
        parser looks for (NUM, PLUS, MINUS, TIMES, DIVIDE, LPAREN, RPAREN, WS).
        """
        self.pat = pattern
        self.token = self.create_token()

    def parse(self, text):
        """Tokenize text, evaluate it as an expression and return the value.

        Raises SyntaxError when a number or parenthesis is expected but missing.
        """
        self.tokens = self.generate_tokens(text)
        self.tok = None
        self.nexttok = None
        self._advance()  # load the first token into nexttok
        return self.expr()

    def _advance(self):
        """Shift the lookahead: nexttok becomes tok, and the following token
        (or None once the stream is exhausted) becomes nexttok."""
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)

    def create_token(self):
        """Return the namedtuple class used for tokens (fields: type, value)."""
        return namedtuple("Token", ['type', 'value'])

    def generate_tokens(self, text):
        """Yield a token for each match of the pattern in text, dropping WS tokens.

        Scanning stops at the first position where the pattern does not match.
        """
        scanner = self.pat.scanner(text)
        for m in iter(scanner.match, None):
            t = self.token(m.lastgroup, m.group())
            if t.type != 'WS':
                yield t

    def _accept(self, toktype):
        """Consume the next token if it has type toktype; report whether it did."""
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        else:
            return False

    def _expect(self, toktype):
        """Consume the next token, raising SyntaxError unless it has type toktype."""
        if not self._accept(toktype):
            raise SyntaxError("Expected " + toktype)

    # grammar rules

    def expr(self):
        "expression ::= term { ('+'|'-') term}*"
        exprval = self.term()
        # Loop (not a single `if`) so chains such as 2 + 3 + 4 are consumed
        # completely and evaluated left to right.
        while self._accept("PLUS") or self._accept("MINUS"):
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval += right
            elif op == 'MINUS':
                exprval -= right
        return exprval

    def term(self):
        "term ::= factor {('*' | '/') factor}*"
        termval = self.factor()
        # Same repetition as in expr, for chains such as 2 * 3 * 4.
        while self._accept("TIMES") or self._accept("DIVIDE"):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval *= right
            elif op == 'DIVIDE':
                termval /= right  # true division: the result becomes a float
        return termval

    def factor(self):
        "factor ::= NUM | (expr)"
        if self._accept("NUM"):
            return int(self.tok.value)
        elif self._accept("LPAREN"):
            exprval = self.expr()
            self._expect("RPAREN")
            return exprval
        else:
            raise SyntaxError("Expected Number or LPAREN")
    
    

In [770]:
ee = ExpressionEvaluator(master_pattern)

In [771]:
ee.parse('2')

2

In [772]:
ee.parse("2 + 3")

5

In [773]:
 # testing
ee.tokens = ee.generate_tokens("2 + 3 * 4")

In [755]:
ee.tok, ee.nexttok = None, None

In [762]:
ee._advance()

In [763]:
ee.nexttok

Token(type='TIMES', value='*')

In [751]:
# testing accept function
if ee.nexttok and ee.nexttok.type == 'NUM':
    print(ee.nexttok.value)
# i see whats happening, beautifully written

2


In [752]:
ee.parse("2 + 3 + 4")

9

In [774]:
ee.parse("2 + 3 * 4")

14

In [775]:
ee.parse("2 + (3 + 4) * 5")

37

In [776]:
ee.parse("2 + (3 + * 4)") # this will result in an error

SyntaxError: Expected Number or LPAREN (<string>)

In [777]:
# performing text operations on byte string

In [778]:
data = b'Hello World'
data

b'Hello World'

In [779]:
data.startswith(b'Hello')

True

In [780]:
data.split()

[b'Hello', b'World']

In [781]:
data = bytearray(b'Hello World')

In [784]:
data.startswith(b'Hello')

True

In [785]:
data.split()

[bytearray(b'Hello'), bytearray(b'World')]

In [787]:
data.replace(b"Hello", b"Hello Cruel")

bytearray(b'Hello Cruel World')

In [788]:
data = b'FOO:BAR,SPAM'
re.split("[:,]", data)

TypeError: cannot use a string pattern on a bytes-like object

In [789]:
re.split(b'[:,]', data)

[b'FOO', b'BAR', b'SPAM']

In [791]:
data[4] # indexing in byte string produces integers

66

In [792]:
b = b'Hello world'
print(b) # observe the 'b'

b'Hello world'


In [793]:
print(b.decode("ascii"))

Hello world


In [803]:
'{:10s} {:10d} {:10.2f}'.format("ACME", 100, 490.1).encode("ascii")

b'ACME              100     490.10'

In [802]:
format("hello world", '=>20s')

