# Class Demos for Oct 10

Basic XML handling in Python, using the standard library's `ElementTree` and `lxml`.

In [None]:
# Uncomment and run once if lxml is not installed in this kernel's environment:
# %pip install lxml

In [1]:
from pathlib import Path
# Prefer lxml when it is installed; otherwise fall back to the standard library's ElementTree.
# Either way the module is bound to the name `etree`, so the cells below use one name for both.
# NOTE(review): later cells pass `nsmap=` to etree.Element and `pretty_print=` to etree.tostring;
# those look lxml-specific, so the fallback may only cover the reading cells — confirm.
try:
    from lxml import etree
    print("running with lxml.etree")
except ImportError:
    import xml.etree.ElementTree as etree
    print("running with Python's xml.etree.ElementTree")

running with lxml.etree


## Basic Functions

For example: how to read an XML file in Python.

In [2]:
# Relative location of the demo finding aid, assembled one path segment at a time.
finding_aid_path = Path('..') / 'data' / 'xml' / 'superior-papers-demo.xml'

In [3]:
# Sanity check before parsing: report when the path points at an existing regular file.
# Nothing is printed when the check fails.
if finding_aid_path.is_file():
    print("It's a file!")

It's a file!


In [4]:
# Parse the XML file at finding_aid_path; the parsed document is bound to `tree`.
tree = etree.parse(finding_aid_path)

In [5]:
# Printing the parsed tree shows only the object's repr, not the XML content itself.
print(tree)

<lxml.etree._ElementTree object at 0x106c131c0>


In [6]:
# The root element is the entry point for walking the document.
root = tree.getroot()

# Slicing an element gives a list of its direct child elements (here: at most the first 250),
# not a slice of the XML text.
print(root[0:250])

[<Element {http://ead3.archivists.org/schema/}control at 0x106cae9c0>, <Element {http://ead3.archivists.org/schema/}archdesc at 0x106cafcc0>]


Now you can look through the XML almost like a list or dictionary:

In [7]:
# Iterating over an element visits its direct children, one per pass.
for element in root:
    print(str(element))

<Element {http://ead3.archivists.org/schema/}control at 0x106c94900>
<Element {http://ead3.archivists.org/schema/}archdesc at 0x106c97800>


In [8]:
# For each direct child of the root, show its Python type, its qualified tag and its text.
for element in root:
    print(f'{type(element)} {element.tag} {element.text}')

<class 'lxml.etree._Element'> {http://ead3.archivists.org/schema/}control 
        
<class 'lxml.etree._Element'> {http://ead3.archivists.org/schema/}archdesc 
        


In [9]:
# For each direct child of the root, show its qualified tag next to its XML attributes.
for element in root:
    print(f'{element.tag} {element.attrib}')

{http://ead3.archivists.org/schema/}control {'countryencoding': 'iso3166-1', 'dateencoding': 'iso8601', 'langencoding': 'iso639-2b'}
{http://ead3.archivists.org/schema/}archdesc {'level': 'collection', 'audience': 'external'}


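Other ways to look through the tree: `.iter()` walks every element in the tree — the root itself and all of its descendants — not just the direct children.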

In [10]:
# Walk the whole tree: the root comes first, followed by its descendants.
for element in root.iter():
    print(' '.join(map(str, (element.tag, element.attrib))))

{http://ead3.archivists.org/schema/}ead {'audience': 'external'}
{http://ead3.archivists.org/schema/}control {'countryencoding': 'iso3166-1', 'dateencoding': 'iso8601', 'langencoding': 'iso639-2b'}
{http://ead3.archivists.org/schema/}recordid {'instanceurl': 'http://jajohnst.si.umich.edu/fake-ead.xml'}
{http://ead3.archivists.org/schema/}filedesc {}
{http://ead3.archivists.org/schema/}titlestmt {}
{http://ead3.archivists.org/schema/}titleproper {}
{http://ead3.archivists.org/schema/}publicationstmt {}
{http://ead3.archivists.org/schema/}publisher {}
{http://ead3.archivists.org/schema/}date {'normal': '2022-09-01'}
{http://ead3.archivists.org/schema/}archdesc {'level': 'collection', 'audience': 'external'}
{http://ead3.archivists.org/schema/}did {}
{http://ead3.archivists.org/schema/}repository {}
{http://ead3.archivists.org/schema/}corpname {}
{http://ead3.archivists.org/schema/}part {}
{http://ead3.archivists.org/schema/}part {}
{http://ead3.archivists.org/schema/}unittitle {}
{http:/

In [12]:
# Look up the <control> element by its namespace-qualified tag, written "{namespace-uri}localname".
control = tree.find('{http://ead3.archivists.org/schema/}control')
# .attrib holds the element's XML attributes as key/value pairs.
print(control.attrib)

{'countryencoding': 'iso3166-1', 'dateencoding': 'iso8601', 'langencoding': 'iso639-2b'}


In [13]:
# Read a single attribute value from the <control> element by attribute name.
language = control.get('langencoding')
print(language)

iso639-2b


In [14]:
# Show the qualified tag and text of each direct child of <control>.
for element in control:
    print(f'{element.tag} {element.text}')

{http://ead3.archivists.org/schema/}recordid 1234
{http://ead3.archivists.org/schema/}filedesc 
            


## Working with Namespaces

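Every tag printed above starts with `{http://ead3.archivists.org/schema/}` — that is the element's namespace. Keeping the namespace URIs in a dictionary, keyed by a short prefix, saves retyping them. :)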

In [15]:
# Prefix -> namespace URI map, reused below for qualified tag names and for nsmap declarations.
ns = dict(
    ead='http://ead3.archivists.org/schema/',
    xsi='http://www.w3.org/2001/XMLSchema-instance',
    dcterms='http://purl.org/dc/terms/',
    xsd='http://www.w3.org/2001/XMLSchema#',
)

In [19]:
# Build the qualified tag "{namespace-uri}archdesc" from the prefix map instead of retyping the URI.
# The doubled braces "{{" and "}}" produce literal braces around the interpolated URI.
# Double quotes are used inside the single-quoted f-string because reusing the same quote
# character inside an f-string expression is a SyntaxError before Python 3.12.
archdesc = root.find(f'{{{ns["ead"]}}}archdesc')
print(archdesc.attrib)

{'level': 'collection', 'audience': 'external'}


## Writing some XML

Build new elements in Python, then save them :tada: with the `.write()` function.

In [20]:
# Create a new root <ead> element in the EAD namespace; nsmap declares the prefix -> URI
# mappings on that element. Double quotes inside the single-quoted f-string keep the line
# valid on Python versions before 3.12, where reusing the outer quote is a SyntaxError.
metadata = etree.Element(f'{{{ns["ead"]}}}ead', nsmap=ns)

In [21]:
# Printing an element shows its repr (qualified tag and memory address), not serialized XML.
print(metadata)

<Element {http://ead3.archivists.org/schema/}ead at 0x10704f7c0>


In [22]:
# build subelements
# NOTE: this rebinds `control`, which an earlier cell used for the <control> element of the
# parsed file; from here on it refers to the newly created element.
# Double quotes inside the single-quoted f-strings keep them valid on Python versions before
# 3.12, where reusing the outer quote inside the expression is a SyntaxError.
control = etree.SubElement(metadata, f'{{{ns["ead"]}}}control')
control.set('countryencoding', 'iso3166-1')

# <recordid> is nested under the new <control> and carries the record identifier as its text.
recordid = etree.SubElement(control, f'{{{ns["ead"]}}}recordid')
recordid.text = '1234'

In [23]:
# Serialize the element to bytes. As the cell's last expression the bytes repr is displayed,
# so line breaks show up as "\n" escapes rather than real new lines.
etree.tostring(metadata, pretty_print=True)

b'<ead:ead xmlns:ead="http://ead3.archivists.org/schema/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:xsd="http://www.w3.org/2001/XMLSchema#">\n  <ead:control countryencoding="iso3166-1">\n    <ead:recordid>1234</ead:recordid>\n  </ead:control>\n</ead:ead>\n'

In [27]:
# Destination path for the new XML document (not used within this cell).
fpath_new_xml = Path('..') / 'data' / 'xml' / 'new_xml_basic.xml'

# Decode the serialized bytes to text so the pretty-printed indentation shows as real line breaks.
print(etree.tostring(metadata, pretty_print=True).decode('utf-8'))

<ead:ead xmlns:ead="http://ead3.archivists.org/schema/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:xsd="http://www.w3.org/2001/XMLSchema#">
  <ead:control countryencoding="iso3166-1">
    <ead:recordid>1234</ead:recordid>
  </ead:control>
</ead:ead>

