Regular expressions

Examples

Patterns

Match and search functions

In [6]:
import re
# Sample sentence with several words ending in "ing".
s = "Doing things, going home, staying awake, sleeping later"
# \w+ing\b: one or more word characters, then "ing" directly before a word
# boundary. "things" is not returned because its "ing" is followed by "s".
re.findall(r'\w+ing\b', s)

['Doing', 'going', 'staying', 'sleeping']

In [8]:
# [+-]?\d+: an optional sign followed by one or more digits.
# The stand-alone "+" and "=" are not returned because no digit follows them.
re.findall(r'[+-]?\d+', "23 + -24 = -1")

['23', '-24', '-1']

In [22]:
# A string containing two "if ..., then" clauses.
s = ("If I'm not in a hurry, then I should stay. " + 
     "On the other hand, if I leave, then I can sleep.")
# The greedy .* runs on to the LAST ", then", so a single long match is returned.
re.findall(r'[Ii]f (.*), then', s)    # tries to match as many characters as possible

["I'm not in a hurry, then I should stay. On the other hand, if I leave"]

In [24]:
# [^.]* cannot cross a full stop, so each match stays inside one sentence.
re.findall(r'[Ii]f ([^.]*), then', s)    # match everything but dot character

["I'm not in a hurry", 'I leave']

In [20]:
# .*? grows only as far as needed to reach the next ", then".
re.findall(r'[Ii]f (.*?), then', s)    # non-greedy version

["I'm not in a hurry", 'I leave']

Functions in the re module

In [29]:
import re
# The input is named `text` rather than `str` so the built-in `str` type is
# not shadowed for the rest of the kernel session.
text = "She goes where she wants to, she's a sheriff."
# \b...\b restricts the replacement to the whole word, so "sheriff" is untouched.
newstr = re.sub(r'\b[Ss]he\b', 'he', text)
print(newstr)

he goes where he wants to, he's a sheriff.


In [39]:
import re
# The input is named `text` rather than `str` so the built-in `str` type is
# not shadowed for the rest of the kernel session.
text = """He is a timelord.
He has a Tardis."""
# \1 re-inserts the matched word. count is passed by keyword: as a bare
# positional argument it is easy to confuse with flags, and recent Python
# versions deprecate passing it positionally.
newstr = re.sub(r'(\b[Hh]e\b)', r'\1 (The Doctor)', text, count=1)
print(newstr)

He (The Doctor) is a timelord.
He has a Tardis.


Match object

In [44]:
# Four runs of digits separated by single spaces; only the second and fourth
# are wrapped in parentheses and therefore captured.
mo = re.search(r'\d+ (\d+) \d+ (\d+)', 'first 123 45 67 890 last')

In [52]:
# Tuple holding only the captured (parenthesised) parts of the match.
mo.groups()

('45', '890')

In [54]:
# Group 0 is the entire matched text, including the uncaptured numbers.
mo.group(0)

'123 45 67 890'

In [80]:
# re.search gives back a match object on success and None otherwise,
# so the result can be used directly as a presence test.
mo = re.search(r'first', 'first 123 45 67 890 last')
if mo is not None:
    print("The string 'first' is present")

The string 'first' is present


Miscellaneous stuff

Exercise 2.1 (integers in brackets)

In [85]:
def integers_in_brackets(s):
    """Extract every integer that stands alone inside square brackets.

    A bracket pair counts only when it holds a single, optionally signed
    integer, possibly padded with whitespace. The values are returned as
    ints in order of appearance.
    """
    import re
    # \s* (rather than \s?) tolerates any amount of padding, and the capturing
    # group makes findall return just the number, without brackets or padding.
    bracketed = re.findall(r'\[\s*([+-]?\d+)\s*\]', s)
    return [int(number) for number in bracketed]

def main():
    """Run integers_in_brackets on a sample string and print the result."""
    sample = " afd [asd] [12 ] [a34] [ -43 ]tt [+12]xxx"
    print(integers_in_brackets(sample))

# Only run the demo when executed directly, not when imported.
if __name__ == "__main__":
    main()

[12, -43, 12]


Basic file processing

In [90]:
"ä".encode("utf-8")

b'\xc3\xa4'

In [94]:
# Iterating over a bytes object yields one integer per byte.
list("ä".encode("utf-8"))    # Show as a list of integers

[195, 164]

In [98]:
"a".encode("utf-8")

b'a'

Some common file object methods

In [116]:
# Let's open this notebook file, which is essentially a text file, so you can
# also open it in a text editor. The encoding is given explicitly because
# .ipynb files are UTF-8 JSON, whereas open() otherwise falls back to a
# platform-dependent default.
f = open("basics.ipynb", "r", encoding="utf-8")
try:
    for i in range(5):               # read the first five lines
        line = f.readline()
        print(f"Line {i}: {line}", end="")
finally:
    f.close()                        # runs even if reading or printing fails

Line 0: {
Line 1:  "cells": [
Line 2:   {
Line 3:    "cell_type": "markdown",
Line 4:    "id": "226311c3-351b-4382-8c6a-5bd8a269ea73",


In [118]:
# The file is closed automatically when the with block exits, even after an
# error. The encoding is given explicitly because .ipynb files are UTF-8 JSON,
# whereas open() otherwise falls back to a platform-dependent default.
with open("basics.ipynb", "r", encoding="utf-8") as f:
    for i in range(5):               # read the first five lines
        line = f.readline()
        print(f"Line {i}: {line}", end="")

Line 0: {
Line 1:  "cells": [
Line 2:   {
Line 3:    "cell_type": "markdown",
Line 4:    "id": "226311c3-351b-4382-8c6a-5bd8a269ea73",


In [120]:
# Iterating over the file object yields one line at a time (each length
# includes the trailing newline). default=0 keeps the result at 0 for an
# empty file. The encoding is explicit because .ipynb files are UTF-8 JSON,
# whereas open() otherwise falls back to a platform-dependent default.
with open("basics.ipynb", "r", encoding="utf-8") as f:
    max_len = max((len(line) for line in f), default=0)
print(f"The longest line in this file has length {max_len}")

The longest line in this file has length 49


Standard file objects

In [123]:
import sys
import random
# Draw an integer from -10..10 (both ends included). Negative draws are
# reported on standard error, everything else on standard output.
i = random.randint(-10, 10)
if i < 0:
    sys.stderr.write("Got a negative integer.\n")
else:
    sys.stdout.write("Got a positive integer or zero.\n")

Got a negative integer.


sys module

Exercise 2.2 (file listing)

My solution:

In [139]:
import re


def file_listing(filename="listing.txt"):
    """Parse a long-format directory listing into one tuple per entry.

    Each returned tuple is (size, month, day, hour, minute, filename), with
    size, day, hour and minute as ints and month and filename as strings.

    Lines that do not end in "<size> <Mon> <day> <hh>:<mm> <name>" (for
    example a "total ..." summary line) are skipped instead of raising.
    """
    # One pattern anchored on the tail of the line replaces the separate
    # findall calls with positional indexes. Digits or capitalised words in
    # the owner/group columns can no longer shift the fields, and a file
    # name containing spaces is kept whole.
    entry = re.compile(
        r'(\d+)\s+([A-Z][a-z]{2})\s+(\d{1,2})\s+(\d{1,2}):(\d{2})\s+(.+)$')
    result = []
    with open(filename, "r") as f:
        for line in f:
            m = entry.search(line.rstrip("\n"))
            if m is None:
                continue
            # Distinct local names: the `filename` parameter and the built-in
            # `tuple` are no longer overwritten inside the loop.
            size, month, day, hour, minute, name = m.groups()
            result.append(
                (int(size), month, int(day), int(hour), int(minute), name))
    return result

def main():
    """Print the entries parsed from the default listing file."""
    entries = file_listing()
    print(entries)

# Only run the demo when executed directly, not when imported.
if __name__ == "__main__":
    main()

[(2356, 'Dec', 11, 11, 50, 'add_colab_link.py'), (164519, 'Dec', 28, 17, 59, 'basics.ipynb'), (164477, 'Nov', 5, 19, 21, 'basics.ipynb.orig'), (115587, 'Dec', 11, 11, 50, 'bayes.ipynb'), (4096, 'Nov', 29, 13, 7, '_build'), (198820, 'Dec', 11, 11, 50, 'clustering.ipynb'), (6647, 'Dec', 11, 12, 20, 'conf.py'), (41828, 'Nov', 28, 13, 26, 'example_figure2.png'), (125079, 'Nov', 28, 13, 26, 'example_figure2.xcf'), (24139, 'Nov', 28, 12, 3, 'example_figure.png'), (650, 'Nov', 28, 12, 3, 'example_figure.py'), (25399, 'Nov', 2, 21, 25, 'exception_hierarchy.pdf'), (43632, 'Nov', 2, 22, 5, 'exception_hierarchy.png'), (24366, 'Nov', 2, 21, 26, 'exception_hierarchy.svg'), (72095, 'Oct', 3, 17, 36, 'extra.ipynb'), (1207075, 'Nov', 28, 16, 2, 'face.png'), (897, 'Nov', 12, 10, 59, 'generate_contents.py'), (4096, 'Dec', 27, 13, 55, '.git'), (19, 'Dec', 10, 10, 24, '.gitignore'), (890349, 'Dec', 11, 11, 50, 'image_processing.ipynb'), (689, 'Dec', 10, 10, 28, 'index.rst'), (5254, 'Nov', 3, 18, 46, 'inhe

Suggested solution:

In [153]:
import re
 
 
def file_listing(filename="listing.txt"):
    """Parse a directory listing into (size, month, day, hour, minute, name) tuples.

    size, day, hour and minute are converted to int; month and name stay
    strings. Lines that do not fit the expected layout are printed and
    left out of the result.
    """
    pattern = r".{10}\s+\d+\s+.+\s+.+\s+(\d+)\s+(...)\s+(\d+)\s+(\d\d):(\d\d)\s+(.+)"
    # Compiling once and calling .match on the compiled object is equivalent
    # to calling re.match(pattern, line) for every line.
    matcher = re.compile(pattern)
    with open(filename) as f:
        lines = f.readlines()
    result = []
    for line in lines:
        m = matcher.match(line)
        if m is None:
            print(line)
            continue
        size, month, day, hour, minute, name = m.groups()
        result.append((int(size), month, int(day), int(hour), int(minute), name))
    return result
 
def main():
    """Print each parsed listing entry on its own line."""
    for entry in file_listing():
        print(entry)

# Only run the demo when executed directly, not when imported.
if __name__ == "__main__":
    main()

(2356, 'Dec', 11, 11, 50, 'add_colab_link.py')
(164519, 'Dec', 28, 17, 59, 'basics.ipynb')
(164477, 'Nov', 5, 19, 21, 'basics.ipynb.orig')
(115587, 'Dec', 11, 11, 50, 'bayes.ipynb')
(4096, 'Nov', 29, 13, 7, '_build')
(198820, 'Dec', 11, 11, 50, 'clustering.ipynb')
(6647, 'Dec', 11, 12, 20, 'conf.py')
(41828, 'Nov', 28, 13, 26, 'example_figure2.png')
(125079, 'Nov', 28, 13, 26, 'example_figure2.xcf')
(24139, 'Nov', 28, 12, 3, 'example_figure.png')
(650, 'Nov', 28, 12, 3, 'example_figure.py')
(25399, 'Nov', 2, 21, 25, 'exception_hierarchy.pdf')
(43632, 'Nov', 2, 22, 5, 'exception_hierarchy.png')
(24366, 'Nov', 2, 21, 26, 'exception_hierarchy.svg')
(72095, 'Oct', 3, 17, 36, 'extra.ipynb')
(1207075, 'Nov', 28, 16, 2, 'face.png')
(897, 'Nov', 12, 10, 59, 'generate_contents.py')
(4096, 'Dec', 27, 13, 55, '.git')
(19, 'Dec', 10, 10, 24, '.gitignore')
(890349, 'Dec', 11, 11, 50, 'image_processing.ipynb')
(689, 'Dec', 10, 10, 28, 'index.rst')
(5254, 'Nov', 3, 18, 46, 'inheritance_hierarchy.pdf'

Exercise 2.3 (red green blue)

My solution:

In [143]:
import re

def red_green_blue(filename="rgb.txt"):
    """Return the colour table in `filename` as tab-separated strings.

    Every line of the form "<red> <green> <blue> <name>" yields one string
    in which the three numbers and the name are joined by single tab
    characters. Lines that do not start with three integers, such as a
    header line, are skipped.

    The file is only read. It is never rewritten, so calling the function
    repeatedly returns the same list and leaves the data intact.
    """
    entry = re.compile(r'\s*(\d+)\s+(\d+)\s+(\d+)\s+(.*)')
    result = []
    with open(filename, "r") as f:
        for line in f:
            m = entry.match(line)
            if m is None:
                # Header, blank or otherwise malformed line: skip it here
                # instead of deleting the first line from the file on disk.
                continue
            # The last group takes everything after the three numbers.
            result.append("\t".join(m.groups()))
    return result

def main():
    """Print the list of colour entries read from the default file."""
    colours = red_green_blue()
    print(colours)

# Only run the demo when executed directly, not when imported.
if __name__ == "__main__":
    main()

['255\t250\t250\tsnow', '248\t248\t255\tghost white', '248\t248\t255\tGhostWhite', '245\t245\t245\twhite smoke', '245\t245\t245\tWhiteSmoke', '220\t220\t220\tgainsboro', '255\t250\t240\tfloral white', '255\t250\t240\tFloralWhite', '253\t245\t230\told lace', '253\t245\t230\tOldLace', '250\t240\t230\tlinen', '250\t235\t215\tantique white', '250\t235\t215\tAntiqueWhite', '255\t239\t213\tpapaya whip', '255\t239\t213\tPapayaWhip', '255\t235\t205\tblanched almond', '255\t235\t205\tBlanchedAlmond', '255\t228\t196\tbisque', '255\t218\t185\tpeach puff', '255\t218\t185\tPeachPuff', '255\t222\t173\tnavajo white', '255\t222\t173\tNavajoWhite', '255\t228\t181\tmoccasin', '255\t248\t220\tcornsilk', '255\t255\t240\tivory', '255\t250\t205\tlemon chiffon', '255\t250\t205\tLemonChiffon', '255\t245\t238\tseashell', '240\t255\t240\thoneydew', '245\t255\t250\tmint cream', '245\t255\t250\tMintCream', '240\t255\t255\tazure', '240\t248\t255\talice blue', '240\t248\t255\tAliceBlue', '230\t230\t250\tlavender', 

Suggested solution:

In [148]:
import re
 
def red_green_blue(filename="rgb.txt"):
    """Return the colour table in `filename` as tab-separated strings.

    Every line of the form "<red> <green> <blue> <name>" yields one string
    in which the three numbers and the name are joined by single tab
    characters. Lines that do not start with three integers, such as a
    header line, produce no entry.
    """
    # MULTILINE lets ^ and $ anchor each match to a single line. Using $
    # instead of a literal newline keeps the last entry when the file has no
    # trailing newline, and [ \t] (rather than \s) stops a match from
    # spilling over into the following line.
    entry = re.compile(
        r"^[ \t]*(\d+)[ \t]+(\d+)[ \t]+(\d+)[ \t]+(.*)$", re.MULTILINE)
    with open(filename) as in_file:
        matches = entry.findall(in_file.read())
    # findall returns one (r, g, b, name) tuple of strings per matching line.
    return ["\t".join(fields) for fields in matches]
 
 
def main():
    """Print every colour entry on its own line."""
    for entry in red_green_blue():
        print(entry)

# Only run the demo when executed directly, not when imported.
if __name__ == "__main__":
    main()

255	250	250	snow
248	248	255	ghost white
248	248	255	GhostWhite
245	245	245	white smoke
245	245	245	WhiteSmoke
220	220	220	gainsboro
255	250	240	floral white
255	250	240	FloralWhite
253	245	230	old lace
253	245	230	OldLace
250	240	230	linen
250	235	215	antique white
250	235	215	AntiqueWhite
255	239	213	papaya whip
255	239	213	PapayaWhip
255	235	205	blanched almond
255	235	205	BlanchedAlmond
255	228	196	bisque
255	218	185	peach puff
255	218	185	PeachPuff
255	222	173	navajo white
255	222	173	NavajoWhite
255	228	181	moccasin
255	248	220	cornsilk
255	255	240	ivory
255	250	205	lemon chiffon
255	250	205	LemonChiffon
255	245	238	seashell
240	255	240	honeydew
245	255	250	mint cream
245	255	250	MintCream
240	255	255	azure
240	248	255	alice blue
240	248	255	AliceBlue
230	230	250	lavender
255	240	245	lavender blush
255	240	245	LavenderBlush
255	228	225	misty rose
255	228	225	MistyRose
255	255	255	white
0	0	0	black
47	79	79	dark slate gray
47	79	79	DarkSlateGray
47	79	79	dark slate grey
47	79	79	D