# Video Lectures

## Unicode

In [1]:
import unicodedata

In [4]:
def unicode_test(value):
    name = unicodedata.name(value)
    print("value=%s, name=%s" % (value, name))

In [5]:
unicode_test("B")

value=B, name=LATIN CAPITAL LETTER B


In [6]:
unicode_test('\u0042')  # unicode in hex

value=B, name=LATIN CAPITAL LETTER B


In [11]:
unicode_test('\N{LATIN CAPITAL LETTER B}')

value=B, name=LATIN CAPITAL LETTER B


In [8]:
print("This string has a \u000A newline character in the middle")

This string has a 
 newline character in the middle


In [9]:
print("This string has a \n newline character in the middle")

This string has a 
 newline character in the middle


In [15]:
place = 'caf\u00e9'
print(place)

café


In [17]:
len(place)  # for strings, counts the number of unicode characters

4

## Encoding with UTF-8

### Strings

In [18]:
raw_char = '\u2603'

In [20]:
print(raw_char)

☃


In [21]:
len(raw_char)

1

In [24]:
enconded_char = raw_char.encode('utf-8')
print(enconded_char)

b'\xe2\x98\x83'


Python stores in memory the the value of enconded_char as E29883

In [26]:
len(enconded_char)

3

In [27]:
type(enconded_char)

bytes

In [28]:
type(raw_char)

str

In [30]:
ascii_char = 'A'
print(ascii_char.encode('ascii'))

b'A'


In [31]:
type(ascii_char)

str

In [32]:
len(ascii_char)

1

'A' in ascii is \x41 in unicode

In [33]:
snowman_char = '\u2603'
print(snowman_char.encode('ascii'))

UnicodeEncodeError: 'ascii' codec can't encode character '\u2603' in position 0: ordinal not in range(128)

## Handling encoding errors

### Ignore characters that are not part of the encoding

In [34]:
snowman_char.encode('ascii', 'ignore')

b''

### Replace characters not part of the encoding with "?"

In [35]:
snowman_char.encode('ascii', 'replace')

b'?'

### Escape characters not part of the encoding

In [36]:
snowman_char.encode('ascii', 'backslashreplace')

b'\\u2603'

### Produce a XML-friendly string

In [37]:
snowman_char.encode('ascii', 'xmlcharrefreplace')

b'&#9731;'

## Decoding

In [38]:
my_nag = "I can't wait for more heat so that there are no more \u2603 \u2603"

In [40]:
print(my_nag)
type(my_nag)

I can't wait for more heat so that there are no more ☃ ☃


str

In [41]:
my_nag_bytes = my_nag.encode('utf-8')
print(my_nag_bytes)
type(my_nag_bytes)

b"I can't wait for more heat so that there are no more \xe2\x98\x83 \xe2\x98\x83"


bytes

In [42]:
my_nag_decoded = my_nag_bytes.decode('utf-8')
print(my_nag_decoded)
type(my_nag_decoded)

I can't wait for more heat so that there are no more ☃ ☃


str

In [43]:
my_nag_decoded_wrong = my_nag_bytes.decode('latin-1')
print(my_nag_decoded_wrong)

I can't wait for more heat so that there are no more â â


In [45]:
my_nag_decoded_wrong = my_nag_bytes.decode('ascii')
print(my_nag_decoded_wrong)

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 53: ordinal not in range(128)

## Formatting

In [46]:
import datetime

In [49]:
today = datetime.datetime.now().strftime('%A')
print(today)

Tuesday


### Old style with %

In [50]:
students = []
students.append({"name":"Max Powers", "email":"max@gmail.com", "midterm": 87.76, "final":88.65, "grade":"B+"})
students.append({"name":"Julie Thompson", "email":"julie@outlook.com", "midterm": 93.43, "final":90.45, "grade":"A-"})
students.append({"name":"Amber Francis", "email":"amber@gmail.com", "midterm": 85.23, "final":97.54, "grade":"A-"})
students.append({"name":"Andrew Smith", "email":"andrew@yahoo.com", "midterm": 87.43, "final":80.32, "grade":"B"})

In [55]:
for student in students:
    student_data = ("%s %s %f %f %s" %
                        (
                            student['name'],
                            student['email'],
                            student['midterm'],
                            student['final'],
                            student['grade'],
                        )
                    )

    print(student_data)

Max Powers max@gmail.com 87.760000 88.650000 B+
Julie Thompson julie@outlook.com 93.430000 90.450000 A-
Amber Francis amber@gmail.com 85.230000 97.540000 A-
Andrew Smith andrew@yahoo.com 87.430000 80.320000 B


In [56]:
for student in students:
    student_data = ("%20s %20s %10f %10f %5s" %
                        (
                            student['name'],
                            student['email'],
                            student['midterm'],
                            student['final'],
                            student['grade'],
                        )
                    )

    print(student_data)

          Max Powers        max@gmail.com  87.760000  88.650000    B+
      Julie Thompson    julie@outlook.com  93.430000  90.450000    A-
       Amber Francis      amber@gmail.com  85.230000  97.540000    A-
        Andrew Smith     andrew@yahoo.com  87.430000  80.320000     B


In [57]:
for student in students:
    student_data = ("%-20s %-20s %-10f %-10f %-5s" %
                        (
                            student['name'],
                            student['email'],
                            student['midterm'],
                            student['final'],
                            student['grade'],
                        )
                    )

    print(student_data)

Max Powers           max@gmail.com        87.760000  88.650000  B+   
Julie Thompson       julie@outlook.com    93.430000  90.450000  A-   
Amber Francis        amber@gmail.com      85.230000  97.540000  A-   
Andrew Smith         andrew@yahoo.com     87.430000  80.320000  B    


In [60]:
for student in students:
    student_data = ("%-20s %-20s %-5.1f %-5.1f %-5s" %
                        (
                            student['name'],
                            student['email'],
                            student['midterm'],
                            student['final'],
                            student['grade'],
                        )
                    )

    print(student_data)

Max Powers           max@gmail.com        87.8  88.7  B+   
Julie Thompson       julie@outlook.com    93.4  90.5  A-   
Amber Francis        amber@gmail.com      85.2  97.5  A-   
Andrew Smith         andrew@yahoo.com     87.4  80.3  B    


### New style with {} and format

In [61]:
for student in students:
    student_data = (
        "{} {} {} {} {}".format(
            student['name'],
            student['email'],
            student['midterm'],
            student['final'],
            student['grade'],
        )
    )
    print(student_data)

Max Powers max@gmail.com 87.76 88.65 B+
Julie Thompson julie@outlook.com 93.43 90.45 A-
Amber Francis amber@gmail.com 85.23 97.54 A-
Andrew Smith andrew@yahoo.com 87.43 80.32 B


In [63]:
for student in students:
    student_data = (
        "{name} {email} {midterm} {final} {grade}".format(
            midterm = student['midterm'],
            final = student['final'],
            name = student['name'],
            grade = student['grade'],
            email = student['email'],
        )
    )
    print(student_data)

Max Powers max@gmail.com 87.76 88.65 B+
Julie Thompson julie@outlook.com 93.43 90.45 A-
Amber Francis amber@gmail.com 85.23 97.54 A-
Andrew Smith andrew@yahoo.com 87.43 80.32 B


In [64]:
for student in students:
    student_data = (
        "{name:s} {email:s} {midterm:f} {final:f} {grade:s}".format(
            midterm = student['midterm'],
            final = student['final'],
            name = student['name'],
            grade = student['grade'],
            email = student['email'],
        )
    )
    print(student_data)

Max Powers max@gmail.com 87.760000 88.650000 B+
Julie Thompson julie@outlook.com 93.430000 90.450000 A-
Amber Francis amber@gmail.com 85.230000 97.540000 A-
Andrew Smith andrew@yahoo.com 87.430000 80.320000 B


In [65]:
for student in students:
    student_data = (
        "{name:<20s} {email:<20s} {midterm:<5f} {final:<5f} {grade:<5s}".format(
            midterm = student['midterm'],
            final = student['final'],
            name = student['name'],
            grade = student['grade'],
            email = student['email'],
        )
    )
    print(student_data)

Max Powers           max@gmail.com        87.760000 88.650000 B+   
Julie Thompson       julie@outlook.com    93.430000 90.450000 A-   
Amber Francis        amber@gmail.com      85.230000 97.540000 A-   
Andrew Smith         andrew@yahoo.com     87.430000 80.320000 B    


In [66]:
for student in students:
    student_data = (
        "{name:<20s} {email:<20s} {midterm:<5.1f} {final:<5.1f} {grade:<5s}".format(
            midterm = student['midterm'],
            final = student['final'],
            name = student['name'],
            grade = student['grade'],
            email = student['email'],
        )
    )
    print(student_data)

Max Powers           max@gmail.com        87.8  88.7  B+   
Julie Thompson       julie@outlook.com    93.4  90.5  A-   
Amber Francis        amber@gmail.com      85.2  97.5  A-   
Andrew Smith         andrew@yahoo.com     87.4  80.3  B    


## Regular Expressions

In [1]:
import re

In [3]:
source = "To be or not to be, that is the question."
pattern = 'To be'

In [4]:
result = re.match(pattern, source)
print(result)

<re.Match object; span=(0, 5), match='To be'>


In [9]:
# compiling a pattern for regexp use
compiled_pattern = re.compile('To be')
print(compiled_pattern)

re.compile('To be')


In [10]:
result_compiled = compiled_pattern.match(source)
print(result_compiled)

<re.Match object; span=(0, 5), match='To be'>


### Exact match with match()

In [11]:
m = compiled_pattern.match(source)

if m:
    print(m.group())

To be


In [14]:
middle_pattern = re.compile('that is')
m = middle_pattern.match(source)

if m:
    print(m.group())
else:
    print('Not found')

Not found


In [16]:
# match looks at the begging of the source

middle_pattern_with_wildcard = re.compile('.*that is')
m = middle_pattern_with_wildcard.match(source)

if m:
    print(m.group())
else:
    print('Not found')

To be or not to be, that is


### First match with search()

In [18]:
middle_pattern = re.compile('that is')
m = middle_pattern.search(source)

if m:
    print(m.group())
else:
    print('Not found')

that is


### All matches with findall()

In [21]:
n_pattern = re.compile('n')
m = n_pattern.findall(source)

if m:
    print('Found', len(m), 'matches.')
    print(m)
else:
    print('Not found')

Found 2 matches.
['n', 'n']


In [25]:
# find all n values that have any character after
n_and_char_pattern = re.compile('n.')
m = n_and_char_pattern.findall(source)
print('Found', len(m), 'matches.')
print(m)

Found 2 matches.
['no', 'n.']


In [27]:
# find all n values that have or not any character after
n_and_char_optional_pattern = re.compile('n.?')
m = n_and_char_optional_pattern.findall(source)
print('Found', len(m), 'matches.')
print(m)

Found 2 matches.
['no', 'n.']


### Split at matches with split()

In [28]:
n_pattern = re.compile('n')
m = n_pattern.split(source)
print(m)

['To be or ', 'ot to be, that is the questio', '.']


### Replace at matches with sub()

In [29]:
m = n_pattern.sub('?', source)
print(m)

To be or ?ot to be, that is the questio?.


### Defining patterns

In [32]:
import string
printable = string.printable
print(printable)

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	



In [34]:
re.findall('\d', printable)  # matches a single digit

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [35]:
re.findall('\w', printable)  # matches any alphanumerica characters and underscore

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '_']

In [36]:
# matche all white space
re.findall('\s', printable)

[' ', '\t', '\n', '\r', '\x0b', '\x0c']

### Using specifiers

Special characters that allow you to capture values using regexp.

In [37]:
large_source = """
Hi Bianca,
It was great to talk to you about regular expressions. I really understand
them more than I ever had before. Would you like to work on the next project
together? My number is 650-555-3948. Thanks and talk to you soon!

-Mary
"""

In [42]:
# find phone numbers in a text blob
# raw string literal avoids using python special characters
phone_number_pattern = re.compile(r'[0-9]{3}-[0-9]{3}-[0-9]{4}')
m = phone_number_pattern.findall(large_source)
print(m)

['650-555-3948']


In [43]:
phone_number_pattern = re.compile(r'\d{3}-\d{3}-\d{4}')
m = phone_number_pattern.findall(large_source)
print(m)

['650-555-3948']


### Specifying match output

In [68]:
phone_number_pattern = re.compile(r'(\d{3})-(\d{3}-\d{4})')
m = phone_number_pattern.search(large_source)

if m:
    print(m.group())
    print(m.groups())

650-555-3948
('650', '555-3948')


In [70]:
# naming groups
phone_number_pattern = re.compile(r'(?P<areacode>\d{3})-(?P<number>\d{3}-\d{4})')
m = phone_number_pattern.search(large_source)

if m:
    print(m.group('areacode'))
    print(m.group('number'))

650
555-3948


## Binary data

### Bytes and bytearray

Bytes are inmutable<br>Bytearrays are mutable

In [72]:
byte_value_list = [4, 3, 242]
byte_test = bytes(byte_value_list)
print(byte_test)

b'\x04\x03\xf2'


In [75]:
byte_test[1] = 129

TypeError: 'bytes' object does not support item assignment

Transforming hexa to decimal:<br>
\xf2 =>  (15 * 16) + 2 = 242

In [74]:
bytes_array_test = bytearray(byte_value_list)
print(bytes_array_test)

bytearray(b'\x04\x03\xf2')


In [76]:
bytes_array_test[1] = 127
print(bytes_array_test)

bytearray(b'\x04\x7f\xf2')


In [77]:
bytes_array_test[2] = 68
print(bytes_array_test)

bytearray(b'\x04\x7fD')


### Convert binary with struct

In [81]:
import struct

f = open('cal-image.png', 'rb')

try:
    data = f.read(24)
finally:
    f.close()
    
png_header = b'\x89PNG\r\n\x1a\n'

if data[0:8] == png_header:
    width, height = struct.unpack('>LL', data[16:24])
    print('Valid PNG, width', width, 'height', height)
else:
    print('Not a valid PNG')

Valid PNG, width 500 height 398


### Convert byte/strings with bianascii()

In [83]:
import binascii
png_header_in_hex = binascii.hexlify(png_header)
print(png_header_in_hex)

b'89504e470d0a1a0a'


In [84]:
print(binascii.unhexlify(png_header_in_hex))

b'\x89PNG\r\n\x1a\n'


## File Input and Output

file_handle = open(path_to_file, mode)

In [122]:
contents = '''To be or not to be, 
that is the question.'''

In [123]:
poem_file = open('shakespeare.txt', 'wt')
poem_file.write(contents)
poem_file.close()

In [119]:
!ls

8.1 - Encoding Text.pptx           Notepad - Wk 8.ipynb
8.10 - Structured Text Files.ipynb Week 8 Assignment.ipynb
8.3 - Encoding Text.pptx           binary
8.4 - Unicode Strings.ipynb        cal-image.png
8.5 - Encoding.ipynb               grades.csv
8.6 - Formatting.ipynb             grades.json
8.7 - Regular Expressions.ipynb    grades.xml
8.8 - Binary Data.ipynb            grades_python.json
8.9 - File Input and Output.ipynb  shakespeare.txt


In [124]:
!cat shakespeare.txt

To be or not to be, 
that is the question.

In [125]:
author = '\n --Written by Shakespeare'

In [126]:
# append more text
poem_file = open('shakespeare.txt', 'at')
poem_file.write(author)
poem_file.close()

In [127]:
!cat shakespeare.txt

To be or not to be, 
that is the question.
 --Written by Shakespeare

In [128]:
poem_file_read = open('shakespeare.txt', 'rt')
pr = poem_file_read.read()
poem_file_read.close()
print(pr)

To be or not to be, 
that is the question.
 --Written by Shakespeare


In [129]:
poem_file_read = open('shakespeare.txt', 'rt')
pr = poem_file_read.readlines()
poem_file_read.close()
print(pr)

['To be or not to be, \n', 'that is the question.\n', ' --Written by Shakespeare']


In [130]:
print(pr[0])

To be or not to be, 



In [131]:
for line in pr:
    print(line)

To be or not to be, 

that is the question.

 --Written by Shakespeare


In [132]:
# readlines reads the entire document, readline reads specific lines
poem_file_read = open('shakespeare.txt', 'rt')

while True:
    line = poem_file_read.readline()
    if not line:
        break
    print(line)

poem_file_read.close()

To be or not to be, 

that is the question.

 --Written by Shakespeare


In [133]:
poem_file_read = open('shakespeare.txt', 'rt')

for line in poem_file_read:
    print(line)
    
poem_file_read.close()

To be or not to be, 

that is the question.

 --Written by Shakespeare


In [134]:
with open('shakespeare.txt', 'rt') as f:
    for line in f:
        print(line)

To be or not to be, 

that is the question.

 --Written by Shakespeare


### Reading and writing binary files

In [135]:
with open('binary', 'rb') as binary_file_read:
    print('First 5 bytes of the file: ', binary_file_read.read(5))
    print('Second 5 bytes of the file: ', binary_file_read.read(5))

First 5 bytes of the file:  b'\x00\x01\x02\x03\x04'
Second 5 bytes of the file:  b'\x05\x06\x07\x08\t'


In [136]:
bin_file = open('binary', 'rb')
bin_file.tell()

0

In [137]:
bin_file.seek(65)

65

In [138]:
bin_file.tell()

65

In [139]:
bin_file.read(1)

b'A'

In [140]:
bin_file.seek(0)
bin_file.read(10)

b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t'

In [141]:
bin_file.tell()

10

In [146]:
bin_file.seek(0)

0

In [147]:
bin_file.seek(65,1 )
bin_file.read(1)

b'A'

In [153]:
bin_file.seek(19)
bin_file.read(1)

b'\x13'

In [154]:
bin_file.seek(45,1)
bin_file.read(1)

b'A'

## Structured Text Files

### Comma-Separated Value (CSV)

In [157]:
import csv

grades = [
    ['John', 88],
    ['Kate', 93],
    ['Harry', 83],
    ['Linda', 87],
    ['Harriet', 91],
]

grades_csv_write = open('grades.csv', 'wt')
csv_out = csv.writer(grades_csv_write)
csv_out.writerows(grades)
grades_csv_write.close()

In [158]:
!cat grades.csv

John,88
Kate,93
Harry,83
Linda,87
Harriet,91


In [161]:
grades_csv_read = open('grades.csv', 'rt')
csv_in = csv.reader(grades_csv_read)

for row in csv_in:
    print(row)
    
grades_csv_read.close()

['John', '88']
['Kate', '93']
['Harry', '83']
['Linda', '87']
['Harriet', '91']


### eXtensible Markup Language (XML)

In [162]:
xml_data = '''<?xml version="1.0"?>
<students>
	<student name="John">
		<grade value="88" />
	</student>
	<student name="Kate">
		<grade value="93" />
	</student>
	<student name="Harry">
		<grade value="93" />
	</student>
	<student name="Linda">
		<grade value="87" />
	</student>
	<student name="Harriet">
		<grade value="91" />
	</student>
</students>'''

xml_data_file = open('grades.xml', 'wt')
xml_data_file.write(xml_data)
xml_data_file.close()

In [163]:
!cat grades.xml

<?xml version="1.0"?>
<students>
	<student name="John">
		<grade value="88" />
	</student>
	<student name="Kate">
		<grade value="93" />
	</student>
	<student name="Harry">
		<grade value="93" />
	</student>
	<student name="Linda">
		<grade value="87" />
	</student>
	<student name="Harriet">
		<grade value="91" />
	</student>
</students>

In [165]:
from xml.etree import ElementTree

tree = ElementTree.ElementTree(file='grades.xml')
root = tree.getroot()
print(root.tag)

students


In [167]:
for child in root:
    print('tag:', child.tag, 'attribute:', child.attrib)

tag: student attribute: {'name': 'John'}
tag: student attribute: {'name': 'Kate'}
tag: student attribute: {'name': 'Harry'}
tag: student attribute: {'name': 'Linda'}
tag: student attribute: {'name': 'Harriet'}


In [168]:
for child in root:
    print('tag:', child.tag, 'attribute:', child.attrib)
    
    for grandchild in child:
        print('\ttag:', grandchild.tag, 'attribute:', grandchild.attrib)

tag: student attribute: {'name': 'John'}
	tag: grade attribute: {'value': '88'}
tag: student attribute: {'name': 'Kate'}
	tag: grade attribute: {'value': '93'}
tag: student attribute: {'name': 'Harry'}
	tag: grade attribute: {'value': '93'}
tag: student attribute: {'name': 'Linda'}
	tag: grade attribute: {'value': '87'}
tag: student attribute: {'name': 'Harriet'}
	tag: grade attribute: {'value': '91'}


### JS Object Notation (JSON)

In [170]:
json_data = '''{
	"students": {
		
		"John": {
			"grades": [88]
		},

		"Kate": {
			"grades": [93]
		},

		"Harry": {
			"grades": [93]
		},

		"Linda": {
			"grades": [87]
		},

		"Harriet": {
			"grades": [91]
		}
	}
}'''

json_data_file = open('grades.json', 'wt')
json_data_file.write(json_data)
json_data_file.close()

In [171]:
!cat grades.json

{
	"students": {
		
		"John": {
			"grades": [88]
		},

		"Kate": {
			"grades": [93]
		},

		"Harry": {
			"grades": [93]
		},

		"Linda": {
			"grades": [87]
		},

		"Harriet": {
			"grades": [91]
		}
	}
}

In [174]:
import json

json_data_file = open('grades.json', 'rt')
json_data = json.loads(json_data_file.read())
json_data_file.close()

print('root:', json_data)
print()
print("students:", json_data['students'])

root: {'students': {'John': {'grades': [88]}, 'Kate': {'grades': [93]}, 'Harry': {'grades': [93]}, 'Linda': {'grades': [87]}, 'Harriet': {'grades': [91]}}}

students: {'John': {'grades': [88]}, 'Kate': {'grades': [93]}, 'Harry': {'grades': [93]}, 'Linda': {'grades': [87]}, 'Harriet': {'grades': [91]}}


In [181]:
python_dict = {
    'students': {
        'Harriet': {'grade': 91},
        'John': {'grade': 88},
        'Kate': {'grade': 93},
        'Linda': {'grade': 87},
        'Harry': {'grade': 93},        
    }
}

In [182]:
python_dict_json = json.dumps(python_dict)

python_dict_json_file = open('grades_python.json', 'wt')
python_dict_json_file.write(python_dict_json)
python_dict_json_file.close()

In [183]:
!cat grades_python.json

{"students": {"Harriet": {"grade": 91}, "John": {"grade": 88}, "Kate": {"grade": 93}, "Linda": {"grade": 87}, "Harry": {"grade": 93}}}

# Book Exercises

7.1. Create a Unicode string called mystery and assign it the value '\U0001f4a9'. Print mystery. Look up the Unicode name for mystery.

In [190]:
import unicodedata

mystery = '\U0001f4a9'
print(mystery)

print(unicodedata.name(mystery))

💩
PILE OF POO


7.2. Encode mystery, this time using UTF-8, into the bytes variable pop_bytes. Print pop_bytes.

In [192]:
pop_bytes = mystery.encode('utf-8')
print(pop_bytes)

b'\xf0\x9f\x92\xa9'


7.3. Using UTF-8, decode pop_bytes into the string variable pop_string. Print pop_string. Is pop_string equal to mystery?

In [195]:
pop_string = pop_bytes.decode('utf-8')
print(pop_string)

pop_string == mystery

💩


True

7.4. Write the following poem by using old-style formatting. Substitute the strings 'roast beef', 'ham', 'head', and 'clam' into this string:

My kitty cat likes %s,
My kitty cat likes %s,
My kitty cat fell on his %s
And now thinks he's a %s.

In [204]:
poem ='''My kitty cat likes %s, 
My kitty cat likes %s, 
My kitty cat fell on his %s 
And now thinks he's a %s.'''
print(poem)
args = ('roast beef', 'ham', 'head', 'clam')
print(args)

print(poem % args)

My kitty cat likes %s, 
My kitty cat likes %s, 
My kitty cat fell on his %s 
And now thinks he's a %s.
('roast beef', 'ham', 'head', 'clam')
My kitty cat likes roast beef, 
My kitty cat likes ham, 
My kitty cat fell on his head 
And now thinks he's a clam.


7.5. Write a form letter by using new-style formatting. Save the following string as letter (you’ll use it in the next exercise):

In [205]:
letter = '''
Dear {salutation} {name},

Thank you for your letter. We are sorry that our {product} {verb} in your
{room}. Please note that it should never be used in a {room}, especially
near any {animals}.

Send us your receipt and {amount} for shipping and handling. We will send
you another {product} that, in our tests, is {percent}% less likely to
have {verbed}.

Thank you for your support.

Sincerely,
{spokesman}
{job_title}
'''

7.6. Make a dictionary called response with values for the string keys 'salutation', 'name', 'product', 'verbed' (past tense verb), 'room', 'animals', 'percent', 'spokesman', and 'job_title'. Print letter with the values from response.

In [214]:
responses_dc = {
    'salutation': 'Don', 
    'name': 'Juan', 
    'product': 'razor',
    'verb': 'exploding',
    'verbed': 'exploded' , 
    'room': 'bathroom', 
    'animals': 'cat', 
    'amount': '$666',
    'percent': '90', 
    'spokesman': 'Jose',
    'job_title': 'Chief Storyteller'
}

print(letter.format(**responses_dc))


Dear Don Juan,

Thank you for your letter. We are sorry that our razor exploding in your
bathroom. Please note that it should never be used in a bathroom, especially
near any cat.

Send us your receipt and $666 for shipping and handling. We will send
you another razor that, in our tests, is 90% less likely to
have exploded.

Thank you for your support.

Sincerely,
Jose
Chief Storyteller



7.7. When you’re working with text, regular expressions come in very handy. We’ll apply them in a number of ways to our featured text sample. It’s a poem titled “Ode on the Mammoth Cheese,” written by James McIntyre in 1866 in homage to a seven-thousand-pound cheese that was crafted in Ontario and sent on an international tour. If you’d rather not type all of it, use your favorite search engine and cut and paste the words into your Python program. Or, just grab it from Project Gutenberg. Call the text string mammoth.

In [218]:
mammoth = '''
We have seen thee, queen of cheese,
Lying quietly at your ease,
Gently fanned by evening breeze,
Thy fair form no flies dare seize.

All gaily dressed soon you'll go
To the great Provincial show,
To be admired by many a beau
In the city of Toronto.

Cows numerous as a swarm of bees,
Or as the leaves upon the trees,
It did require to make thee please,
And stand unrivalled, queen of cheese.

May you not receive a scar as
We have heard that Mr. Harris
Intends to send you off as far as
The great world's show at Paris.

Of the youth beware of these,
For some of them might rudely squeeze
And bite your cheek, then songs or glees
We could not sing, oh! queen of cheese.

We'rt thou suspended from balloon,
You'd cast a shade even at noon,
Folks would think it was the moon
About to fall and crush them soon.
'''

7.8 Import the re module to use Python’s regular expression functions. Use re.findall() to print all the words that begin with 'c'.
We’ll define the variable pat for the pattern and then search for it in mammoth:

In [226]:
import re

pat = re.compile(r'\bc\w+')
re.findall(pat, mammoth)

['cheese', 'city', 'cheese', 'cheek', 'could', 'cheese', 'cast', 'crush']

7.9 Find all four-letter words that begin with c.

In [233]:
pat = re.compile(r'\bc\w{3}\b')
re.findall(pat, mammoth)

['city', 'cast']

7.10. Find all the words that end with r.

In [237]:
pat = re.compile(r'\w+r\b')
re.findall(pat, mammoth)

['your', 'fair', 'Or', 'scar', 'Mr', 'far', 'For', 'your', 'or']

Find all words that end with l, dealing with apostrophes

In [238]:
pat = re.compile(r'\w+l\b')
re.findall(pat, mammoth)

['All', 'll', 'Provincial', 'fall']

In [241]:
pat = re.compile(r'[\w\']+l\b')
re.findall(pat, mammoth)

['All', "you'll", 'Provincial', 'fall']

7.11. Find all the words that contain exactly three vowels in a row.

In [257]:
# get characters before a
pat = re.compile(r'\w*[aA]')
re.findall(pat, mammoth)

['ha',
 'a',
 'ea',
 'fa',
 'fa',
 'da',
 'A',
 'ga',
 'grea',
 'Provincia',
 'a',
 'ma',
 'a',
 'bea',
 'a',
 'a',
 'swa',
 'a',
 'lea',
 'ma',
 'plea',
 'A',
 'sta',
 'unriva',
 'Ma',
 'a',
 'sca',
 'a',
 'ha',
 'hea',
 'tha',
 'Ha',
 'a',
 'fa',
 'a',
 'grea',
 'a',
 'Pa',
 'bewa',
 'A',
 'ba',
 'ca',
 'a',
 'sha',
 'a',
 'wa',
 'A',
 'fa',
 'a']

In [262]:
# match all characters before and after three consecutive vowels
# excluding words with more vowels ingnoring spaces
pat = re.compile(r'\w*[aeiouAEIOU]{3}[^aeiouAEIOU\s]*\w*')
re.findall(pat, mammoth)

['queen', 'quietly', 'beau', 'queen', 'squeeze', 'queen']

7.12. Use unhexlify() to convert this hex string (combined from two strings to fit on a page) to a bytes variable called gif:

In [267]:
hex_string = '47494638396101000100800000000000ffffff21f9' \
+ '0401000000002c000000000100010000020144003b'

In [268]:
import binascii
gif = binascii.unhexlify(hex_string)
print(hex_string)
print(gif)

47494638396101000100800000000000ffffff21f90401000000002c000000000100010000020144003b
b'GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff!\xf9\x04\x01\x00\x00\x00\x00,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x01D\x00;'


7.13. The bytes in gif define a one-pixel transparent GIF file, one of the most common graphics file formats. A legal GIF starts with the string GIF89a. Does gif match this?

In [276]:
gif[:6] == 'GIF89a'

False

In [281]:
gif_start = 'GIF89a'
gif_start_bytes = gif_start.encode('utf-8')

In [282]:
print(gif_start_bytes)
print(gif[:6])
gif_start_bytes == gif[:6]

b'GIF89a'
b'GIF89a'


True

7.14. The pixel width of a GIF is a 16-bit big-endian integer starting at byte offset 6, and the height is the same size, starting at offset 8. Extract and print these values for gif. Are they both 1?

In [287]:
print('width:', gif[8])
print('height:', gif[6])

width: 1
height: 1


8.1. Assign the string 'This is a test of the emergency text system' to the variable test1, and write test1 to a file called test.txt.

In [306]:
test1 =  'This is a test of the emergency text system'
out_file = open('test.txt', 'wt')
out_file.write(test1)
out_file.close()

In [307]:
!cat test.txt

This is a test of the emergency text system

8.2. Open the file test.txt and read its contents into the string test2. Are test1 and test2 the same?

In [308]:
in_file = open('test.txt', 'rt')
test2 = in_file.read()
in_file.close()
print(test2)

This is a test of the emergency text system


In [296]:
test1 == test2

True

8.3. Save these text lines to a file called test.csv. Notice that if the fields are separated by commas, you need to surround a field with quotes if it contains a comma.


In [309]:
text = '''author, book
J R R Tolkien, The Hobbit
Lynne Truss, "Eats, Shoots & Leaves"'''

In [310]:
print(text)

author, book
J R R Tolkien, The Hobbit
Lynne Truss, "Eats, Shoots & Leaves"


In [332]:
out_csv_file = open('test.csv', 'wt')
out_csv_file.write(text)
out_csv_file.close()

In [331]:
!cat test.csv

author, book
J R R Tolkien, The Hobbit
Lynne Truss, "Eats, Shoots & Leaves"

8.4. Use the csv module and its DictReader() method to read test.csv to the variable books. Print the values in books. Did DictReader() handle the quotes and commas in the second book’s title?

>>> import csv
>>> with open('names.csv', newline='') as csvfile:
...     reader = csv.DictReader(csvfile)
...     for row in reader:
...         print(row['first_name'], row['last_name'])
...
Eric Idle
John Cleese

>>> print(row)
{'first_name': 'John', 'last_name': 'Cleese'}

In [338]:
import csv
in_csv_file = open('test.csv', 'rt')
csv_reader = csv.DictReader(
                                in_csv_file, 
                                quoting=csv.QUOTE_ALL, 
                                skipinitialspace=True,
                                delimiter=','
                            )
for row in csv_reader:
    print(row)
in_csv_file.close()

OrderedDict([('author', 'J R R Tolkien'), ('book', 'The Hobbit')])
OrderedDict([('author', 'Lynne Truss'), ('book', 'Eats, Shoots & Leaves')])


8.5. Create a CSV file called books.csv by using these lines:

In [340]:
text = '''title,author,year
The Weirdstone of Brisingamen,Alan Garner,1960
Perdido Street Station,China Miéville,2000
Thud!,Terry Pratchett,2005
The Spellman Files,Lisa Lutz,2007
Small Gods,Terry Pratchett,1992
'''
print(text)

title,author,year
The Weirdstone of Brisingamen,Alan Garner,1960
Perdido Street Station,China Miéville,2000
Thud!,Terry Pratchett,2005
The Spellman Files,Lisa Lutz,2007
Small Gods,Terry Pratchett,1992



In [346]:
out_csv_file = open('books.csv', 'wt')
out_csv_file.write(text)
out_csv_file.close()

In [347]:
!cat books.csv

title,author,year
The Weirdstone of Brisingamen,Alan Garner,1960
Perdido Street Station,China Miéville,2000
Thud!,Terry Pratchett,2005
The Spellman Files,Lisa Lutz,2007
Small Gods,Terry Pratchett,1992
