# Class notes

November 11, 2024

Demos of XPath

## First, review ingesting XML

In [2]:
import xml.etree.ElementTree as ET

In [3]:
from lxml import etree

In [4]:
# Load the EAD finding aid from disk: `tree` wraps the whole document and
# `root` is its top-level element.
ead_source = '../data/xml/superior-papers-complete.xml'
tree = ET.parse(ead_source)
root = tree.getroot()

In [5]:
# Display the class of the object returned by tree.getroot().
type(root)

xml.etree.ElementTree.Element

In [6]:
# Display the root element's tag; the value includes the namespace URI in
# braces in front of the local name.
root.tag

'{http://ead3.archivists.org/schema/}ead'

In [7]:
# Prefix -> namespace URI mapping handed to find()/findall() so paths can be
# written as 'ead:tagname' instead of spelling out the full URI.
ns = dict(ead='http://ead3.archivists.org/schema/')

In [8]:
# Iterating an Element directly visits only its immediate children.
child_tags = [child.tag for child in root]
for tag in child_tags:
    print(tag)

{http://ead3.archivists.org/schema/}control
{http://ead3.archivists.org/schema/}archdesc


In [9]:
# root.iter() walks the whole subtree, beginning with root itself, so this
# prints one tag per element in the document.
for node in root.iter():
    print(node.tag)

{http://ead3.archivists.org/schema/}ead
{http://ead3.archivists.org/schema/}control
{http://ead3.archivists.org/schema/}recordid
{http://ead3.archivists.org/schema/}filedesc
{http://ead3.archivists.org/schema/}titlestmt
{http://ead3.archivists.org/schema/}titleproper
{http://ead3.archivists.org/schema/}titleproper
{http://ead3.archivists.org/schema/}author
{http://ead3.archivists.org/schema/}publicationstmt
{http://ead3.archivists.org/schema/}publisher
{http://ead3.archivists.org/schema/}address
{http://ead3.archivists.org/schema/}addressline
{http://ead3.archivists.org/schema/}addressline
{http://ead3.archivists.org/schema/}addressline
{http://ead3.archivists.org/schema/}date
{http://ead3.archivists.org/schema/}num
{http://ead3.archivists.org/schema/}p
{http://ead3.archivists.org/schema/}maintenancestatus
{http://ead3.archivists.org/schema/}maintenanceagency
{http://ead3.archivists.org/schema/}agencycode
{http://ead3.archivists.org/schema/}agencyname
{http://ead3.archivists.org/schema

In [10]:
# Display the class of the object returned by ET.parse().
type(tree)

xml.etree.ElementTree.ElementTree

In [11]:
# Display the class of root again, for comparison with type(tree).
type(root)

xml.etree.ElementTree.Element

In [12]:
# find() returns the first direct child of root matching the path; the 'ead'
# prefix is resolved through the ns mapping.
archdesc = root.find('ead:archdesc', namespaces=ns)

# List the tags of archdesc's immediate children.
for child in archdesc:
    print(child.tag)

{http://ead3.archivists.org/schema/}did
{http://ead3.archivists.org/schema/}controlaccess
{http://ead3.archivists.org/schema/}dsc


In [13]:
# Display the class of the object returned by root.find().
type(archdesc)

xml.etree.ElementTree.Element

## XPath Basics

Start with `*` (all child elements), `.` (the current element), and `//` (all descendants, at any depth):

In [14]:
# '*' on its own matches every direct child of the element being searched.
allEadTags = root.findall('*')

# Show each child's tag next to its attribute dictionary.
for child in allEadTags:
    print(f"{child.tag} {child.attrib}")

{http://ead3.archivists.org/schema/}control {'countryencoding': 'iso3166-1', 'dateencoding': 'iso8601', 'langencoding': 'iso639-2b', 'relatedencoding': 'marc', 'repositoryencoding': 'iso15511', 'scriptencoding': 'iso15924'}
{http://ead3.archivists.org/schema/}archdesc {'level': 'collection'}


In [17]:
# './/*' selects every descendant of root at any depth. The original path
# './/' only gave the same result because ElementTree treats a trailing '/'
# as an implicit '*'; the wildcard is spelled out here.
allTags = root.findall('.//*')

# Loop over the list built in this cell. The original looped over allEadTags,
# a variable assigned in a different cell, so the output depended on which
# cell had been run last.
for element in allTags:
    print(element.tag, element.attrib)

{http://ead3.archivists.org/schema/}control {'countryencoding': 'iso3166-1', 'dateencoding': 'iso8601', 'langencoding': 'iso639-2b', 'relatedencoding': 'marc', 'repositoryencoding': 'iso15511', 'scriptencoding': 'iso15924'}
{http://ead3.archivists.org/schema/}recordid {'instanceurl': 'Reading Room & website'}
{http://ead3.archivists.org/schema/}filedesc {}
{http://ead3.archivists.org/schema/}titlestmt {}
{http://ead3.archivists.org/schema/}titleproper {}
{http://ead3.archivists.org/schema/}titleproper {'localtype': 'filing'}
{http://ead3.archivists.org/schema/}author {}
{http://ead3.archivists.org/schema/}publicationstmt {}
{http://ead3.archivists.org/schema/}publisher {}
{http://ead3.archivists.org/schema/}address {}
{http://ead3.archivists.org/schema/}addressline {}
{http://ead3.archivists.org/schema/}addressline {}
{http://ead3.archivists.org/schema/}addressline {'localtype': 'email'}
{http://ead3.archivists.org/schema/}date {}
{http://ead3.archivists.org/schema/}num {}
{http://ead3

In [None]:
# Namespaced wildcard: every element at any depth below root whose tag is in
# the namespace bound to the 'ead' prefix.
allEadTags = root.findall('.//ead:*', namespaces=ns)

for node in allEadTags:
    print(node.tag)

{http://ead3.archivists.org/schema/}control
{http://ead3.archivists.org/schema/}recordid
{http://ead3.archivists.org/schema/}filedesc
{http://ead3.archivists.org/schema/}titlestmt
{http://ead3.archivists.org/schema/}titleproper
{http://ead3.archivists.org/schema/}titleproper
{http://ead3.archivists.org/schema/}author
{http://ead3.archivists.org/schema/}publicationstmt
{http://ead3.archivists.org/schema/}publisher
{http://ead3.archivists.org/schema/}address
{http://ead3.archivists.org/schema/}addressline
{http://ead3.archivists.org/schema/}addressline
{http://ead3.archivists.org/schema/}addressline
{http://ead3.archivists.org/schema/}date
{http://ead3.archivists.org/schema/}num
{http://ead3.archivists.org/schema/}p
{http://ead3.archivists.org/schema/}maintenancestatus
{http://ead3.archivists.org/schema/}maintenanceagency
{http://ead3.archivists.org/schema/}agencycode
{http://ead3.archivists.org/schema/}agencyname
{http://ead3.archivists.org/schema/}languagedeclaration
{http://ead3.archi

In [19]:
# The [@level] predicate keeps only the 'ead' elements that carry a level
# attribute, whatever its value.
allEadTagswithLevels = root.findall('.//ead:*[@level]', namespaces=ns)

for node in allEadTagswithLevels:
    print(f"{node.tag} {node.attrib}")

{http://ead3.archivists.org/schema/}archdesc {'level': 'collection'}
{http://ead3.archivists.org/schema/}c {'id': 'aspace_edab5fb678b5e37fcf34130da5836686', 'level': 'series'}
{http://ead3.archivists.org/schema/}c {'id': 'aspace_cf40ab6e8beda6e2fdb51e0a08d30e06', 'level': 'series'}
{http://ead3.archivists.org/schema/}c {'id': 'aspace_5f14153a5cfd7f41c0b74cb59f429490', 'level': 'series'}
{http://ead3.archivists.org/schema/}c {'id': 'aspace_0d77dcd4a4330f180ac372a192c51427', 'level': 'file'}


In [None]:
# Tighten the predicate: only 'ead' elements whose level attribute is exactly
# "series".
allEadTagswithLevelSeries = root.findall(
    './/ead:*[@level="series"]',
    namespaces=ns,
)

for node in allEadTagswithLevelSeries:
    print(f"{node.tag} {node.attrib}")

{http://ead3.archivists.org/schema/}c {'id': 'aspace_edab5fb678b5e37fcf34130da5836686', 'level': 'series'}
{http://ead3.archivists.org/schema/}c {'id': 'aspace_cf40ab6e8beda6e2fdb51e0a08d30e06', 'level': 'series'}
{http://ead3.archivists.org/schema/}c {'id': 'aspace_5f14153a5cfd7f41c0b74cb59f429490', 'level': 'series'}


In [None]:
# Two-step path: select <c> elements whose level is "series", then collect
# every <unitid> found at any depth beneath them.
allEadTagswithLevelSeriesUnitID = root.findall(
    './/ead:c[@level="series"]//ead:unitid',
    namespaces=ns,
)

for unitid in allEadTagswithLevelSeriesUnitID:
    print(f"{unitid.tag} {unitid.attrib}")

{http://ead3.archivists.org/schema/}unitid {'localtype': 'aspace_uri'}
{http://ead3.archivists.org/schema/}unitid {}
{http://ead3.archivists.org/schema/}unitid {'localtype': 'aspace_uri'}
{http://ead3.archivists.org/schema/}unitid {}
{http://ead3.archivists.org/schema/}unitid {'localtype': 'aspace_uri'}
{http://ead3.archivists.org/schema/}unitid {}
{http://ead3.archivists.org/schema/}unitid {'localtype': 'aspace_uri'}
{http://ead3.archivists.org/schema/}unitid {}


In [25]:
# Same series -> unitid path as before, narrowed by a second predicate so
# only <unitid> elements with localtype="aspace_uri" are returned.
allEadTagswithLevelSeriesUnitID = root.findall(
    './/ead:c[@level="series"]//ead:unitid[@localtype="aspace_uri"]',
    namespaces=ns,
)

# Print the element text as well, since that is where the URI value lives.
for unitid in allEadTagswithLevelSeriesUnitID:
    print(f"{unitid.tag} {unitid.attrib} {unitid.text}")

{http://ead3.archivists.org/schema/}unitid {'localtype': 'aspace_uri'} /repositories/2/archival_objects/3
{http://ead3.archivists.org/schema/}unitid {'localtype': 'aspace_uri'} /repositories/2/archival_objects/4
{http://ead3.archivists.org/schema/}unitid {'localtype': 'aspace_uri'} /repositories/2/archival_objects/1
{http://ead3.archivists.org/schema/}unitid {'localtype': 'aspace_uri'} /repositories/2/archival_objects/2
