In [65]:
import xml.etree.ElementTree as ET
import requests
from os.path import join

# if using lxml for string serialisation
from lxml import etree

In [2]:
# METS record for the sample OAC item (ark:/13030/tf238nb201)
url = 'https://oac.cdlib.org/mets/ark:/13030/tf238nb201/?brand=oac4'

In [3]:
# Fetch the METS document. requests has no default timeout, so set one to keep
# the cell from hanging indefinitely if the server never answers.
r = requests.get(url, timeout=30)

# Only the last expression of a cell is displayed, so print the status code
# explicitly and leave the headers as the cell's result.
# NOTE(review): the recorded headers show `x-amzn-waf-action: challenge` with
# `Content-Length: 0`, i.e. no METS body came back from this request -- the
# cells below parse a local copy of the file instead.
print(r.status_code)
r.headers

{'Server': 'CloudFront', 'Date': 'Mon, 25 Nov 2024 02:56:01 GMT', 'Content-Length': '0', 'Connection': 'keep-alive', 'x-amzn-waf-action': 'challenge', 'Cache-Control': 'no-store, max-age=0', 'Content-Type': 'text/html; charset=UTF-8', 'Access-Control-Allow-Origin': '*', 'Access-Control-Max-Age': '86400', 'Access-Control-Allow-Methods': 'OPTIONS,GET,POST', 'X-Cache': 'Error from cloudfront', 'Via': '1.1 b4fc286f293048277ad4f4edc41487aa.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'ORD58-P10', 'X-Amz-Cf-Id': 'byyAbyPtwktwPcGiwsbefQ_xnKvweSBbmzYxRzQIWono7fXsgEYd7w=='}

## Parse Sample METS

* An OAC collection: https://oac.cdlib.org/findaid/ark:/13030/tf6b69p1kk/dsc/
* An item from that collection: https://oac.cdlib.org/ark:/13030/tf238nb201/?brand=oac4

Sample METS for the above item: https://oac.cdlib.org/mets/ark:/13030/tf238nb201/?brand=oac4 (also in course files, as linked below)

In [4]:
# Relative location of the local copy of the sample METS file
mets_path = join(
    '..', 'data', 'xml',
    'oac_tf238nb201_mets.xml',
)

In [5]:
# Parse the local METS file with the stdlib parser. getroot() returns the root
# Element, so despite its name `mets_tree` is an Element, not an ElementTree.
mets_tree = ET.parse(mets_path).getroot()

In [6]:
# Check which kind of object the parse step produced
type(mets_tree)

xml.etree.ElementTree.Element

In [7]:
# The tag is reported as {namespace URI}localname
mets_tree.tag

'{http://www.loc.gov/METS/}mets'

In [8]:
# Attributes of the root <mets> element, as a dict
mets_tree.attrib

{'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://www.loc.gov/METS/  http://www.loc.gov/standards/mets/mets.xsd  http://www.loc.gov/standards/rights/METSRights.xsd  http://www.loc.gov/mods/v3/  http://www.loc.gov/standards/mods/v3/mods-3-4.xsd  http://www.loc.gov/mix/v10/  http://www.loc.gov/standards/mix/mix10/mix10.xsd  info:lc/xmlns/premis-v2/  http://www.loc.gov/standards/premis/premis.xsd',
 'OBJID': 'ark:/13030/tf238nb201',
 'LABEL': 'Cable spinning',
 'PROFILE': 'http://www.loc.gov/mets/profiles/00000013.xml',
 'TYPE': 'generic'}

Set up namespace dictionary

In [9]:
# Prefix -> namespace URI map handed to find()/findall() so that path
# expressions can use short prefixes such as `mets:` or `mods:`.
ns = {
    'mets': 'http://www.loc.gov/METS/',
    'mix': 'http://www.loc.gov/mix/',
    # The <mix:mix> blocks inside amdSec redeclare the `mix` prefix as
    # http://www.loc.gov/mix/v10, so the `mix` entry above (the URI declared
    # on the root element) does not match them. Use `mix10:` for those.
    'mix10': 'http://www.loc.gov/mix/v10',
    'moa2': 'http://sunsite.berkeley.edu/MOA2/',
    'cdl': 'http://www.cdlib.org/',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
    'premis': 'http://www.loc.gov/standards/premis/v2',
    'xlink': 'http://www.w3.org/1999/xlink',
    'rts': 'http://cosimo.stanford.edu/sdr/metsrights/',
    'mods': 'http://www.loc.gov/mods/v3',
}

## Explore the METS File

* Find `amdSec`, `dmdSec`, `fileSec`, `structMap`

In [10]:
# One line per direct child of the root: its qualified tag and its attributes
for element in mets_tree:
    print(f"{element.tag} {element.attrib}")

{http://www.loc.gov/METS/}metsHdr {'CREATEDATE': '2009-05-26T02:15:02', 'LASTMODDATE': '2016-05-25T09:38:37'}
{http://www.loc.gov/METS/}dmdSec {'ID': 'DMR1'}
{http://www.loc.gov/METS/}dmdSec {'ID': 'DMR2'}
{http://www.loc.gov/METS/}dmdSec {'ID': 'DM1'}
{http://www.loc.gov/METS/}amdSec {}
{http://www.loc.gov/METS/}fileSec {}
{http://www.loc.gov/METS/}structMap {}


In [73]:
# lxml's etree.tostring() only serialises lxml elements, so the file is parsed
# a second time with lxml rather than reusing the stdlib `mets_tree`.
mets_tree_lxml = etree.parse(mets_path).getroot()

# The root tag matches the one reported by the stdlib parse
print(mets_tree_lxml.tag)

{http://www.loc.gov/METS/}mets


The `structMap` is the only required element. What's in it?

In [74]:
# First structMap found anywhere below the root
structMap = mets_tree_lxml.find('.//mets:structMap', namespaces=ns)

# Show each direct child of the structMap: tag and attributes
for element in structMap:
    print(f"{element.tag} {element.attrib}")

{http://www.loc.gov/METS/}div {'TYPE': 'item', 'LABEL': 'Cable spinning', 'ADMID': 'RMD1', 'DMDID': 'DMR1 DMR2 DM1'}


In [75]:
# Serialise the structMap subtree with lxml's `tostring()` and show it indented;
# encoding='unicode' returns a str directly, so no decode step is needed.
print(etree.tostring(structMap, pretty_print=True, encoding='unicode'))

<mets:structMap xmlns:mets="http://www.loc.gov/METS/" xmlns="http://www.loc.gov/METS/" xmlns:mix="http://www.loc.gov/mix/" xmlns:moa2="http://sunsite.berkeley.edu/MOA2/" xmlns:cdl="http://www.cdlib.org/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:premis="http://www.loc.gov/standards/premis/v2" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:rts="http://cosimo.stanford.edu/sdr/metsrights/" xmlns:mods="http://www.loc.gov/mods/v3">
 <mets:div TYPE="item" LABEL="Cable spinning" ADMID="RMD1" DMDID="DMR1 DMR2 DM1">
   <mets:div TYPE="image/master"><mets:fptr FILEID="FID1"/></mets:div>
   <mets:div TYPE="image/thumbnail"><mets:fptr FILEID="thumbnail" cdl:X="192" cdl:Y="128"/></mets:div>
   <mets:div TYPE="image/reference"><mets:fptr FILEID="FID3" cdl:X="768" cdl:Y="512"/></mets:div>
   <mets:div TYPE="image/reference"><mets:fptr FILEID="FID4" cdl:X="1536" cdl:Y="1024"/></mets:div>
 </mets:div>
</mets:structMap>





* look at the `metsHdr`

In [77]:
# First metsHdr found anywhere below the root
metsHdr = mets_tree_lxml.find('.//mets:metsHdr', namespaces=ns)

# Show it as indented XML (encoding='unicode' yields a str, no decode needed)
print(etree.tostring(metsHdr, pretty_print=True, encoding='unicode'))

<mets:metsHdr xmlns:mets="http://www.loc.gov/METS/" xmlns="http://www.loc.gov/METS/" xmlns:mix="http://www.loc.gov/mix/" xmlns:moa2="http://sunsite.berkeley.edu/MOA2/" xmlns:cdl="http://www.cdlib.org/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:premis="http://www.loc.gov/standards/premis/v2" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:rts="http://cosimo.stanford.edu/sdr/metsrights/" xmlns:mods="http://www.loc.gov/mods/v3" CREATEDATE="2009-05-26T02:15:02" LASTMODDATE="2016-05-25T09:38:37">
  <mets:agent ROLE="CREATOR" TYPE="ORGANIZATION">
   <mets:name>University of California, Berkeley::Bancroft Library</mets:name>
  </mets:agent>
<altRecordID>http://nma.berkeley.edu/ark:/13030/tf238nb201</altRecordID></mets:metsHdr>





* look at `dmdSec` (note there may be multiple)

In [78]:
# Collect every descriptive metadata section in the document
dmdSecs = mets_tree_lxml.findall('.//mets:dmdSec', namespaces=ns)

print(f"found {len(dmdSecs)} dmdSecs \n")

found 3 dmdSecs 



In [79]:
# First dmdSec, indented for reading
print(etree.tostring(dmdSecs[0], pretty_print=True, encoding='unicode'))

<mets:dmdSec xmlns:mets="http://www.loc.gov/METS/" xmlns="http://www.loc.gov/METS/" xmlns:mix="http://www.loc.gov/mix/" xmlns:moa2="http://sunsite.berkeley.edu/MOA2/" xmlns:cdl="http://www.cdlib.org/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:premis="http://www.loc.gov/standards/premis/v2" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:rts="http://cosimo.stanford.edu/sdr/metsrights/" xmlns:mods="http://www.loc.gov/mods/v3" ID="DMR1">
<mets:mdRef xlink:href="http://oskicat.berkeley.edu/record=b10707847" LOCTYPE="URL" MDTYPE="MARC" LABEL="Catalog Record"/>
</mets:dmdSec>





In [80]:
# Second dmdSec, serialised as-is (no pretty-printing requested)
print(etree.tostring(dmdSecs[1], encoding='unicode'))

<mets:dmdSec xmlns:mets="http://www.loc.gov/METS/" xmlns="http://www.loc.gov/METS/" xmlns:mix="http://www.loc.gov/mix/" xmlns:moa2="http://sunsite.berkeley.edu/MOA2/" xmlns:cdl="http://www.cdlib.org/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:premis="http://www.loc.gov/standards/premis/v2" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:rts="http://cosimo.stanford.edu/sdr/metsrights/" xmlns:mods="http://www.loc.gov/mods/v3" ID="DMR2">
<mets:mdRef xlink:href="http://www.oac.cdlib.org/findaid/ark:/13030/tf6b69p1kk" XPTR="xpointer(id('m252266534'))" LOCTYPE="URL" MDTYPE="EAD" LABEL="Construction Photographs of the Golden Gate Bridge"/>
</mets:dmdSec>




In [81]:
# Third dmdSec (the one wrapping the MODS record), indented for reading
print(etree.tostring(dmdSecs[2], pretty_print=True, encoding='unicode'))

<mets:dmdSec xmlns:mets="http://www.loc.gov/METS/" xmlns="http://www.loc.gov/METS/" xmlns:mix="http://www.loc.gov/mix/" xmlns:moa2="http://sunsite.berkeley.edu/MOA2/" xmlns:cdl="http://www.cdlib.org/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:premis="http://www.loc.gov/standards/premis/v2" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:rts="http://cosimo.stanford.edu/sdr/metsrights/" xmlns:mods="http://www.loc.gov/mods/v3" ID="DM1">
 <mets:mdWrap MDTYPE="MODS" LABEL="Cable spinning">
  <mets:xmlData>
    <mods:mods><mods:titleInfo>
      <mods:title>Cable spinning</mods:title>
     </mods:titleInfo><mods:typeOfResource>still image</mods:typeOfResource><mods:relatedItem displayLabel="Metacollection" type="host">
      <mods:titleInfo>
       <mods:title>California Heritage Collection</mods:title>
      </mods:titleInfo>
      <mods:identifier type="uri">http://bancroft.berkeley.edu/collections/calheritage.html</mods:identifier>
     </mods:relatedItem><mods:relatedIt

In [82]:
# Pretty-print every descriptive metadata section in document order.
# Iterating over the list directly replaces the range(len(...)) indexing, and
# encoding='unicode' returns a str so no encode/decode round trip is needed.
for dmd_sec in dmdSecs:
    print(etree.tostring(dmd_sec, encoding='unicode', pretty_print=True))

<mets:dmdSec xmlns:mets="http://www.loc.gov/METS/" xmlns="http://www.loc.gov/METS/" xmlns:mix="http://www.loc.gov/mix/" xmlns:moa2="http://sunsite.berkeley.edu/MOA2/" xmlns:cdl="http://www.cdlib.org/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:premis="http://www.loc.gov/standards/premis/v2" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:rts="http://cosimo.stanford.edu/sdr/metsrights/" xmlns:mods="http://www.loc.gov/mods/v3" ID="DMR1">
<mets:mdRef xlink:href="http://oskicat.berkeley.edu/record=b10707847" LOCTYPE="URL" MDTYPE="MARC" LABEL="Catalog Record"/>
</mets:dmdSec>



<mets:dmdSec xmlns:mets="http://www.loc.gov/METS/" xmlns="http://www.loc.gov/METS/" xmlns:mix="http://www.loc.gov/mix/" xmlns:moa2="http://sunsite.berkeley.edu/MOA2/" xmlns:cdl="http://www.cdlib.org/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:premis="http://www.loc.gov/standards/premis/v2" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:rts="http://cosimo.stanford.edu/sdr/metsrig

In [83]:
# Collect every administrative metadata section and report how many there are
amdSecs = mets_tree_lxml.findall('.//mets:amdSec', namespaces=ns)

print(len(amdSecs))

1


In [84]:
# Pretty-print every administrative metadata section in document order.
# Iterating over the list directly replaces the range(len(...)) indexing, and
# encoding='unicode' returns a str so no encode/decode round trip is needed.
for amd_sec in amdSecs:
    print(etree.tostring(amd_sec, encoding='unicode', pretty_print=True))

<mets:amdSec xmlns:mets="http://www.loc.gov/METS/" xmlns="http://www.loc.gov/METS/" xmlns:mix="http://www.loc.gov/mix/" xmlns:moa2="http://sunsite.berkeley.edu/MOA2/" xmlns:cdl="http://www.cdlib.org/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:premis="http://www.loc.gov/standards/premis/v2" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:rts="http://cosimo.stanford.edu/sdr/metsrights/" xmlns:mods="http://www.loc.gov/mods/v3">

 <mets:techMD ID="ADM1">
  <mets:mdWrap MDTYPE="NISOIMG">
   <mets:xmlData>
    <mix:mix xmlns:mix="http://www.loc.gov/mix/v10">

     <mix:BasicDigitalObjectInformation>
       <mix:FormatDesignation>
   <mix:formatName>image/tiff</mix:formatName>
       </mix:FormatDesignation>
     </mix:BasicDigitalObjectInformation>

    </mix:mix>
   </mets:xmlData>
  </mets:mdWrap>
 </mets:techMD>

 <mets:techMD ID="ADM2">
  <mets:mdWrap MDTYPE="NISOIMG">
   <mets:xmlData>
    <mix:mix xmlns:mix="http://www.loc.gov/mix/v10">

     <mix:BasicDigitalObjectI

* exploring `fileSec`

In [45]:
# Collect every fileSec (this cell uses the stdlib-parsed tree, not the lxml one)
fileSecs = mets_tree.findall('.//mets:fileSec', namespaces=ns)

print(len(fileSecs))

1


In [46]:
# Serialise with the stdlib ElementTree; it does not keep the file's original
# prefixes, which is why the output uses generated ones (ns0, ns1).
print(ET.tostring(fileSecs[0], encoding='unicode'))

<ns0:fileSec xmlns:ns0="http://www.loc.gov/METS/" xmlns:ns1="http://www.w3.org/1999/xlink">

 <ns0:fileGrp USE="image/master">
   <ns0:file ID="FID1" MIMETYPE="image/tiff" SEQ="1" CREATED="2009-05-06T15:54:51.77" ADMID="ADM1 ADM4" GROUPID="GID1">
    <ns0:FLocat ns1:href="http://nma.berkeley.edu/ark:/28722/bk000556f23" LOCTYPE="URL" />
   </ns0:file>
 </ns0:fileGrp>

 <ns0:fileGrp USE="image/thumbnail">
   <ns0:file ID="thumbnail" MIMETYPE="image/gif" SEQ="1" CREATED="2009-05-06T15:54:51.77" ADMID="ADM2" GROUPID="GID1" SIZE="19839" CHECKSUM="69d9fcbe6345176db779b7ff7b5ecd6e5bcd8bc8" CHECKSUMTYPE="SHA-256">
    <ns0:FLocat ns1:href="http://content.cdlib.org/dynaxml/data/13030/01/tf238nb201/files/tf238nb201-FID2.gif" /><ns0:FLocat ns1:href="http://nma.berkeley.edu/ark:/28722/bk000556f3n" LOCTYPE="URL" />
   </ns0:file>
 </ns0:fileGrp>

 <ns0:fileGrp USE="image/reference">
   <ns0:file ID="FID3" MIMETYPE="image/jpeg" SEQ="1" CREATED="2009-05-06T15:54:51.787" ADMID="ADM3" GROUPID="GID1" SI

* find everything with ID "FID1"

In [64]:
# Every element, at any depth, whose ID attribute equals "FID1"
fileID1 = mets_tree.findall('.//*[@ID="FID1"]', namespaces=ns)

for ref in fileID1:
    print(f"{ref.tag} {ref.attrib}")

{http://www.loc.gov/METS/}file {'ID': 'FID1', 'MIMETYPE': 'image/tiff', 'SEQ': '1', 'CREATED': '2009-05-06T15:54:51.77', 'ADMID': 'ADM1 ADM4', 'GROUPID': 'GID1'}


* find all of the MODS fields

In [14]:
# Every element in the MODS namespace, at any depth
mods = mets_tree.findall('.//mods:*', ns)

# Tag and attributes on one line, the element's text on the next
for element in mods:
    print(f"{element.tag} {element.attrib} \n {element.text}")

{http://www.loc.gov/mods/v3}mods {} 
 None
{http://www.loc.gov/mods/v3}titleInfo {} 
 
      
{http://www.loc.gov/mods/v3}title {} 
 Cable spinning
{http://www.loc.gov/mods/v3}typeOfResource {} 
 still image
{http://www.loc.gov/mods/v3}relatedItem {'displayLabel': 'Metacollection', 'type': 'host'} 
 
      
{http://www.loc.gov/mods/v3}titleInfo {} 
 
       
{http://www.loc.gov/mods/v3}title {} 
 California Heritage Collection
{http://www.loc.gov/mods/v3}identifier {'type': 'uri'} 
 http://bancroft.berkeley.edu/collections/calheritage.html
{http://www.loc.gov/mods/v3}relatedItem {'displayLabel': 'Collection', 'type': 'host'} 
 
      
{http://www.loc.gov/mods/v3}titleInfo {} 
 
       
{http://www.loc.gov/mods/v3}title {} 
 Construction Photographs of the Golden Gate Bridge
{http://www.loc.gov/mods/v3}identifier {'type': 'local'} 
 BANC PIC 1905.14251-.14284--PIC
{http://www.loc.gov/mods/v3}identifier {'type': 'uri'} 
 http://www.oac.cdlib.org/findaid/ark:/13030/tf6b69p1kk
{http://www.