# Class demos October 23, 2023

# Python and XML with ET

Reference: https://github.com/morskyjezek/networked-services-labs-2023/blob/main/xml-intro-basic-functions-ET.ipynb

In [3]:
import xml.etree.ElementTree as ET 

In [5]:
tree = ET.parse('../data/xml/superior-papers-demo.xml')

In [6]:
type(tree)

xml.etree.ElementTree.ElementTree

In [7]:
root = tree.getroot()

In [8]:
root.tag

'{http://ead3.archivists.org/schema/}ead'

In [9]:
ns = {
    'ead': 'http://ead3.archivists.org/schema/'
}

In [12]:
# Iterating an Element yields its direct children, so looping over root
# lists the top-level sections of the document.
for element in root:
    print(element.tag)

{http://ead3.archivists.org/schema/}control
{http://ead3.archivists.org/schema/}archdesc


In [13]:
for element in root.find('{http://ead3.archivists.org/schema/}control'):
    print(element.tag)

{http://ead3.archivists.org/schema/}recordid
{http://ead3.archivists.org/schema/}filedesc


In [14]:
for element in root.find('ead:control', ns):
    print(element.tag)

{http://ead3.archivists.org/schema/}recordid
{http://ead3.archivists.org/schema/}filedesc


In [16]:
control = root.find('ead:control', ns)
print(control.tag, control.attrib)

{http://ead3.archivists.org/schema/}control {'countryencoding': 'iso3166-1', 'dateencoding': 'iso8601', 'langencoding': 'iso639-2b'}


In [17]:
control.text 

'\n        '

In [21]:
for element in control:
    print(element.tag, element.attrib, element.text)

{http://ead3.archivists.org/schema/}recordid {'instanceurl': 'http://jajohnst.si.umich.edu/fake-ead.xml'} 1234
{http://ead3.archivists.org/schema/}filedesc {} 
            


In [23]:
for element in root.iter():
    print(element.tag)

{http://ead3.archivists.org/schema/}ead
{http://ead3.archivists.org/schema/}control
{http://ead3.archivists.org/schema/}recordid
{http://ead3.archivists.org/schema/}filedesc
{http://ead3.archivists.org/schema/}titlestmt
{http://ead3.archivists.org/schema/}titleproper
{http://ead3.archivists.org/schema/}publicationstmt
{http://ead3.archivists.org/schema/}publisher
{http://ead3.archivists.org/schema/}date
{http://ead3.archivists.org/schema/}archdesc
{http://ead3.archivists.org/schema/}did
{http://ead3.archivists.org/schema/}repository
{http://ead3.archivists.org/schema/}corpname
{http://ead3.archivists.org/schema/}part
{http://ead3.archivists.org/schema/}part
{http://ead3.archivists.org/schema/}unittitle
{http://ead3.archivists.org/schema/}title
{http://ead3.archivists.org/schema/}unitdate
{http://ead3.archivists.org/schema/}bioghist
{http://ead3.archivists.org/schema/}dsc
{http://ead3.archivists.org/schema/}c01
{http://ead3.archivists.org/schema/}c02
{http://ead3.archivists.org/schema/}

In [26]:
for element in root.iter('{http://ead3.archivists.org/schema/}part'):
    print(element.tag, element.attrib, element.text)

{http://ead3.archivists.org/schema/}part {} University of Michigan
{http://ead3.archivists.org/schema/}part {} School of Information


In [27]:
for element in root.iter('{http://ead3.archivists.org/schema/}unitdate'):
    print(element.tag, element.attrib, element.text)

{http://ead3.archivists.org/schema/}unitdate {'normal': '1850/1975', 'unitdatetype': 'inclusive'} 1850-1975


In [10]:
titleproper = root.find('ead:control/ead:filedesc/ead:titlestmt/ead:titleproper', ns)
print(titleproper.tag)

{http://ead3.archivists.org/schema/}titleproper


In [37]:
for title in root.iter('{http://ead3.archivists.org/schema/}titleproper'):
    print(title.tag, title.text)

{http://ead3.archivists.org/schema/}titleproper A Finding Aid for the Superior Papers


In [38]:
# find() with a bare tag only matches direct children of the element it is
# called on. titleproper sits below control/filedesc/titlestmt, not directly
# under root, so this lookup comes back as None.
titleproper = root.find('{http://ead3.archivists.org/schema/}titleproper')

print(titleproper)

None


## Writing out XML to a file

In [39]:
# establish a prefix
# register_namespace() sets the prefix ElementTree uses for a namespace URI
# when it serializes a tree.
ET.register_namespace('ead', 'http://ead3.archivists.org/schema/')
# NOTE(review): http://purl.org/dc/elements/1.1/ is conventionally bound to
# the prefix 'dc', while 'dcterms' usually means http://purl.org/dc/terms/.
# Confirm which namespace is intended here.
ET.register_namespace('dcterms', 'http://purl.org/dc/elements/1.1/')

In [40]:
tree.write('data/xml/superior-papers-revised.xml', xml_declaration=True, encoding='utf-8', method='xml')

In [41]:
root.tag

'{http://ead3.archivists.org/schema/}ead'

In [42]:
control = root.find('{http://ead3.archivists.org/schema/}control')

In [43]:
control.tag

'{http://ead3.archivists.org/schema/}control'

In [44]:
control.attrib

{'countryencoding': 'iso3166-1',
 'dateencoding': 'iso8601',
 'langencoding': 'iso639-2b'}

In [45]:
control.set('language', 'en-US')
control.attrib

{'countryencoding': 'iso3166-1',
 'dateencoding': 'iso8601',
 'langencoding': 'iso639-2b',
 'language': 'en-US'}

In [46]:
tree.write('data/xml/superior-papers-revised.xml', xml_declaration=True, encoding='utf-8', method='xml')
print('wrote the file')

wrote the file


# File management in Python (conclusion)

* getting datetime
* create checksum
* write out to CSV

Reference: https://github.com/morskyjezek/networked-services-labs-2023/blob/main/python-file-management-FULL.ipynb

In [11]:
import os 
from datetime import datetime 
import csv 

In [12]:
path_to_file = '../data/emails/mbox-short.txt'

In [13]:
os.path.isfile(path_to_file)

True

In [14]:
os.stat(path_to_file)

os.stat_result(st_mode=33188, st_ino=28064473, st_dev=16777229, st_nlink=1, st_uid=504, st_gid=20, st_size=94625, st_atime=1694411544, st_mtime=1663731214, st_ctime=1694411358)

In [15]:
stat_info = os.stat(path_to_file)

In [18]:
stat_info.st_mtime

1663731214.6876292

In [20]:
# Format the last-modified timestamp as an ISO 8601-style string.
# fromtimestamp() without a tz argument returns a naive local datetime, so
# the trailing %Z expands to an empty string.
modify_time = datetime.fromtimestamp(stat_info.st_mtime).strftime("%Y-%m-%dT%H:%M:%S%Z")

In [21]:
print(modify_time)

2022-09-20T23:33:34


In [24]:
# Format st_ctime the same way as the modification time above it.
# NOTE(review): st_ctime is the creation time only on Windows; on Unix it is
# the time of the most recent metadata change. Confirm that is the value
# wanted under the name create_time.
create_time = datetime.fromtimestamp(stat_info.st_ctime).strftime("%Y-%m-%dT%H:%M:%S%Z")

In [25]:
print(create_time)

2023-09-11T01:49:18


## create a checksum

In [26]:
import hashlib 

In [27]:
def get_checksum(filePath, checksum_type):
    """Return the hexadecimal checksum of a file.

    Args:
        filePath: Path of the file to hash; it is opened in binary mode.
        checksum_type: Name of a hashlib algorithm, matched
            case-insensitively (for example ``'md5'`` or ``'SHA256'``).

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        ValueError: If ``checksum_type`` is not a fixed-length algorithm
            that hashlib supports.
        OSError: If the file cannot be opened or read.
    """
    checksum_type = checksum_type.lower()

    # Validate the algorithm before touching the file so an unsupported name
    # fails with a clear error instead of an unbound local at return time.
    try:
        hasher = hashlib.new(checksum_type)
    except ValueError as err:
        raise ValueError(
            'unsupported checksum type: {!r}'.format(checksum_type)
        ) from err

    # Variable-length digests (the SHAKE family) report digest_size 0 and
    # need an explicit length for hexdigest(), which this interface lacks.
    if hasher.digest_size == 0:
        raise ValueError(
            'unsupported checksum type: {!r}'.format(checksum_type)
        )

    # Feed the file through in fixed-size chunks so large files are not
    # loaded into memory all at once.
    with open(filePath, 'rb') as f:
        for chunk in iter(lambda: f.read(65536), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

In [28]:
get_checksum(path_to_file, 'md5')

'b30c720633214d72186ccfdee629ea58'

In [29]:
file_hash_md5 = get_checksum(path_to_file, 'md5')

In [30]:
file_hash_md5

'b30c720633214d72186ccfdee629ea58'

In [31]:
headers = [
    'filename',
    'folder_path',
    'size',
    'modify_time',
    'checksum'
]

In [34]:
# One manifest record keyed by the column names listed in `headers`.
# The path is split so 'filename' holds only the file's own name and
# 'folder_path' holds the directory that contains it.
oneFile = {
    'filename': os.path.basename(path_to_file),
    'folder_path': os.path.dirname(path_to_file),
    'size': stat_info.st_size,
    'modify_time': modify_time,
    'checksum': get_checksum(path_to_file, 'md5')
}

In [36]:
# The same record as a plain list; values must appear in the same order as
# the column names in `headers` because csv.writer writes them positionally.
# NOTE(review): the first two entries (filename, folder_path) are both the
# full path. os.path.basename() / os.path.dirname() would separate them;
# confirm whether the duplication is intended.
oneFilelist = [
    path_to_file,
    path_to_file,
    stat_info.st_size,
    modify_time,
    get_checksum(path_to_file, 'md5')
]

In [37]:
oneFilelist

['../data/emails/mbox-short.txt',
 '../data/emails/mbox-short.txt',
 94625,
 '2022-09-20T23:33:34',
 'b30c720633214d72186ccfdee629ea58']

In [38]:
# Write the header row and the single file record to the manifest.
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write (e.g. Windows) get an extra blank
# line after every row.
with open('file-manifest.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    print('writing file manifest...')
    writer.writerow(headers)
    writer.writerow(oneFilelist)
    print('wrote file manifest')

writing file manifest...
wrote file manifest


Note that in the demo notebook, the file information is stored in a list of lists. You can either
make a list of lists, where each inner list holds the file information that you want, in the correct order;
or you can make a dictionary that connects each bit of file information to its fieldname.
Then, use that iterable object to create the CSV (i.e., go through it item by item and write
out each item as a new entry [line] in your CSV file).