In [1]:
import xml.etree.ElementTree as ET
import requests
from os.path import join

In [2]:
url = "https://oac.cdlib.org/mets/ark:/13030/tf238nb201/?brand=oac4"

In [3]:
r = requests.get(url)

r.status_code
r.headers

{'Server': 'CloudFront', 'Date': 'Mon, 25 Nov 2024 02:56:01 GMT', 'Content-Length': '0', 'Connection': 'keep-alive', 'x-amzn-waf-action': 'challenge', 'Cache-Control': 'no-store, max-age=0', 'Content-Type': 'text/html; charset=UTF-8', 'Access-Control-Allow-Origin': '*', 'Access-Control-Max-Age': '86400', 'Access-Control-Allow-Methods': 'OPTIONS,GET,POST', 'X-Cache': 'Error from cloudfront', 'Via': '1.1 b4fc286f293048277ad4f4edc41487aa.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'ORD58-P10', 'X-Amz-Cf-Id': 'byyAbyPtwktwPcGiwsbefQ_xnKvweSBbmzYxRzQIWono7fXsgEYd7w=='}

## Parse Sample METS

An OAC collection: https://oac.cdlib.org/findaid/ark:/13030/tf6b69p1kk/dsc/
From an OAC item: https://oac.cdlib.org/ark:/13030/tf238nb201/?brand=oac4 

Sample METS for the above item: https://oac.cdlib.org/mets/ark:/13030/tf238nb201/?brand=oac4 (also in course files, as linked below)

In [4]:
mets_path = join('..','data','xml','oac_tf238nb201_mets.xml')

In [5]:
mets_tree = ET.parse(mets_path).getroot()

In [6]:
type(mets_tree)

xml.etree.ElementTree.Element

In [7]:
mets_tree.tag

'{http://www.loc.gov/METS/}mets'

In [8]:
mets_tree.attrib

{'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://www.loc.gov/METS/  http://www.loc.gov/standards/mets/mets.xsd  http://www.loc.gov/standards/rights/METSRights.xsd  http://www.loc.gov/mods/v3/  http://www.loc.gov/standards/mods/v3/mods-3-4.xsd  http://www.loc.gov/mix/v10/  http://www.loc.gov/standards/mix/mix10/mix10.xsd  info:lc/xmlns/premis-v2/  http://www.loc.gov/standards/premis/premis.xsd',
 'OBJID': 'ark:/13030/tf238nb201',
 'LABEL': 'Cable spinning',
 'PROFILE': 'http://www.loc.gov/mets/profiles/00000013.xml',
 'TYPE': 'generic'}

Set up namespace dictionary

In [9]:
ns = {
    'mets':'http://www.loc.gov/METS/', 
    'mix':'http://www.loc.gov/mix/',
    'moa2':'http://sunsite.berkeley.edu/MOA2/',
    'cdl':'http://www.cdlib.org/',
    'xsi':'http://www.w3.org/2001/XMLSchema-instance',
    'premis':'http://www.loc.gov/standards/premis/v2',
    'xlink':'http://www.w3.org/1999/xlink',
    'rts':'http://cosimo.stanford.edu/sdr/metsrights/',
    'mods':'http://www.loc.gov/mods/v3'
}

## Explore the METS File

* Find `amdSec`, `dmdSec`, `fileSec`, `structMap`
* 

In [10]:
for element in mets_tree:
    print(element.tag, element.attrib)

{http://www.loc.gov/METS/}metsHdr {'CREATEDATE': '2009-05-26T02:15:02', 'LASTMODDATE': '2016-05-25T09:38:37'}
{http://www.loc.gov/METS/}dmdSec {'ID': 'DMR1'}
{http://www.loc.gov/METS/}dmdSec {'ID': 'DMR2'}
{http://www.loc.gov/METS/}dmdSec {'ID': 'DM1'}
{http://www.loc.gov/METS/}amdSec {}
{http://www.loc.gov/METS/}fileSec {}
{http://www.loc.gov/METS/}structMap {}


The `structMap` is the only required element. What's in it?

In [11]:
structMap = mets_tree.find('.//mets:structMap', ns)

for element in structMap:
    print(element.tag, element.attrib)

{http://www.loc.gov/METS/}div {'TYPE': 'item', 'LABEL': 'Cable spinning', 'ADMID': 'RMD1', 'DMDID': 'DMR1 DMR2 DM1'}


In [12]:
# display using `.tostring()`
print(ET.tostring(structMap))

b'<ns0:structMap xmlns:ns0="http://www.loc.gov/METS/" xmlns:ns1="http://www.cdlib.org/">\n <ns0:div TYPE="item" LABEL="Cable spinning" ADMID="RMD1" DMDID="DMR1 DMR2 DM1">\n   <ns0:div TYPE="image/master"><ns0:fptr FILEID="FID1" /></ns0:div>\n   <ns0:div TYPE="image/thumbnail"><ns0:fptr FILEID="thumbnail" ns1:X="192" ns1:Y="128" /></ns0:div>\n   <ns0:div TYPE="image/reference"><ns0:fptr FILEID="FID3" ns1:X="768" ns1:Y="512" /></ns0:div>\n   <ns0:div TYPE="image/reference"><ns0:fptr FILEID="FID4" ns1:X="1536" ns1:Y="1024" /></ns0:div>\n </ns0:div>\n</ns0:structMap>\n\n'


* look at the metsHdr

In [13]:
metsHdr = mets_tree.find('.//mets:metsHdr', ns)

print(ET.tostring(metsHdr))

b'<ns0:metsHdr xmlns:ns0="http://www.loc.gov/METS/" CREATEDATE="2009-05-26T02:15:02" LASTMODDATE="2016-05-25T09:38:37">\n  <ns0:agent ROLE="CREATOR" TYPE="ORGANIZATION">\n   <ns0:name>University of California, Berkeley::Bancroft Library</ns0:name>\n  </ns0:agent>\n<ns0:altRecordID>http://nma.berkeley.edu/ark:/13030/tf238nb201</ns0:altRecordID></ns0:metsHdr>\n\n'


* find all of the MODS fields

In [14]:
mods = mets_tree.findall('.//mods:*', namespaces=ns)

for element in mods:
    print(element.tag, element.attrib, '\n', element.text)

{http://www.loc.gov/mods/v3}mods {} 
 None
{http://www.loc.gov/mods/v3}titleInfo {} 
 
      
{http://www.loc.gov/mods/v3}title {} 
 Cable spinning
{http://www.loc.gov/mods/v3}typeOfResource {} 
 still image
{http://www.loc.gov/mods/v3}relatedItem {'displayLabel': 'Metacollection', 'type': 'host'} 
 
      
{http://www.loc.gov/mods/v3}titleInfo {} 
 
       
{http://www.loc.gov/mods/v3}title {} 
 California Heritage Collection
{http://www.loc.gov/mods/v3}identifier {'type': 'uri'} 
 http://bancroft.berkeley.edu/collections/calheritage.html
{http://www.loc.gov/mods/v3}relatedItem {'displayLabel': 'Collection', 'type': 'host'} 
 
      
{http://www.loc.gov/mods/v3}titleInfo {} 
 
       
{http://www.loc.gov/mods/v3}title {} 
 Construction Photographs of the Golden Gate Bridge
{http://www.loc.gov/mods/v3}identifier {'type': 'local'} 
 BANC PIC 1905.14251-.14284--PIC
{http://www.loc.gov/mods/v3}identifier {'type': 'uri'} 
 http://www.oac.cdlib.org/findaid/ark:/13030/tf6b69p1kk
{http://www.