# Sample - retrieving DC using OAI-PMH

In [1]:
import requests

In [2]:
rencen_item_url_ex = 'https://cdm17409.contentdm.oclc.org/digital/collection/rencen/id/315/rec/3'
collection_url = 'https://cdm17409.contentdm.oclc.org/digital'
item_set = 'rencen'
item_id = '3'
oai_endpoint = 'https://cdm17409.contentdm.oclc.org/oai/oai.php'

### Identify the repository

In [3]:
parameters = {
    'verb':'Identify'
}

In [4]:
r = requests.get(oai_endpoint, params=parameters)
r.url

'https://cdm17409.contentdm.oclc.org/oai/oai.php?verb=Identify'

In [5]:
print(r.text)

<?xml version="1.0" encoding="UTF-8"?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2024-10-17T21:14:43Z</responseDate><request verb="Identify">http://cdm17409.contentdm.oclc.org/oai/oai.php</request><Identify>
      <repositoryName>CONTENTdm Server Repository</repositoryName>
      <baseURL>http://cdm17409.contentdm.oclc.org/oai/oai.php</baseURL>
      <protocolVersion>2.0</protocolVersion>
      <adminEmail>digitalcollections@wayne.edu</adminEmail>
      <earliestDatestamp>2022-05-04</earliestDatestamp>
      <deletedRecord>transient</deletedRecord>
      <granularity>YYYY-MM-DD</granularity>
   </Identify>
  </OAI-PMH>


### Get a specific record

https://cdm17409.contentdm.oclc.org/oai/oai.php?verb=GetRecord&identifier=oai:cdm17409.contentdm.oclc.org:rencen/3&metadataPrefix=oai_dc

In [6]:
parameters = {
    'verb':'GetRecord',
    'identifier':'oai:cdm17409.contentdm.oclc.org:rencen/3',
    'metadataPrefix':'oai_dc'
}

In [7]:
r = requests.get(oai_endpoint, params=parameters)
r.url

'https://cdm17409.contentdm.oclc.org/oai/oai.php?verb=GetRecord&identifier=oai%3Acdm17409.contentdm.oclc.org%3Arencen%2F3&metadataPrefix=oai_dc'

In [8]:
print(r.text)

<?xml version="1.0" encoding="UTF-8"?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2024-10-17T21:14:43Z</responseDate><request verb="GetRecord" identifier="oai:cdm17409.contentdm.oclc.org:rencen/3" metadataPrefix="oai_dc">http://cdm17409.contentdm.oclc.org/oai/oai.php</request><GetRecord><record><header><identifier>oai:cdm17409.contentdm.oclc.org:rencen/3</identifier><datestamp>2023-03-20</datestamp><setSpec>rencen</setSpec></header>
<metadata>
<oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:title>View of the atrium in the completed Renaissance Center.</dc:title>
<dc:descri

#### Parse markup with beautful soup 

In [9]:
from bs4 import BeautifulSoup

In [10]:
dc_example_parsed = BeautifulSoup(r.text, 'xml')

In [11]:
for tag in dc_example_parsed:
    print(tag.name)

OAI-PMH


In [12]:
dc_example_parsed.find_all('dc:coverage')

[<dc:coverage>Detroit, Michigan</dc:coverage>,
 <dc:coverage>1970s</dc:coverage>]

In [13]:
for tag in dc_example_parsed.find('metadata'):
    print(tag)



<oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:title>View of the atrium in the completed Renaissance Center.</dc:title>
<dc:description>Image of the atrium within the Renaissance Center, looking up at a large concrete structure. Trees, plants, and people can be seen in this picture of a restaurant.</dc:description>
<dc:identifier>rencen_03e</dc:identifier>
<dc:date>1977</dc:date>
<dc:type>still image</dc:type>
<dc:format>photographs</dc:format>
<dc:subject>Renaissance Center (Detroit, Mich.)</dc:subject>
<dc:coverage>Detroit, Michigan</dc:coverage>
<dc:coverage>1970s</dc:coverage>
<dc:relation>Building the Detroit Renaissance Center</dc:relation>
<dc:rights>Users can cite and link to these materials without obtaining permission. Users can also use th

## Transform that data into something you can use

For example, do you need your data in a CSV? You can use the above to parse HTML.
Or, as demonstrated below, use a dedicated XML library like `lxml`.

In [14]:
import csv
import xml.etree.ElementTree as ET

Here's a function that transforms OAI PMH XML data into a CSV:

In [15]:
def oai_pmh_to_csv(xml_data, csv_filename):
    # Parse the XML data
    root = ET.fromstring(xml_data)

    # Define the namespaces to search within the XML
    ns = {
        'oai_pmh': 'http://www.openarchives.org/OAI/2.0/',
        'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
        'dc': 'http://purl.org/dc/elements/1.1/'
    }

    # List to hold CSV rows
    csv_data = []

    # Extract the metadata
    for record in root.findall('.//oai_pmh:record', ns):
        row = {}
        metadata = record.find('.//oai_dc:dc', ns)
        if metadata is not None:
            for child in metadata:
                tag = child.tag.split('}')[1]  # Removing the namespace
                if tag in row:
                    row[tag] += '; ' + child.text
                else:
                    row[tag] = child.text
            csv_data.append(row)

    # Define CSV headers based on the first row keys
    headers = set(k for d in csv_data for k in d.keys())

    # Write to CSV
    with open(csv_filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()
        for row in csv_data:
            writer.writerow(row)

Now, define an output file name, retrieve the XML data and save it to a variable,
and run the above function

In [16]:
# define an output file
csv_filename = 'transformed-oai-pmh-to-dc-in-csv.csv'

# save the OAI-PMH DublinCore into a variable
oa_pmh_dc_in_xml = r.text

Run the function:

In [17]:
oai_pmh_to_csv(oa_pmh_dc_in_xml, csv_filename)

Check the file!

In [18]:
!cat transformed-oai-pmh-to-dc-in-csv.csv

format,rights,date,relation,identifier,title,subject,description,coverage,type
photographs,"Users can cite and link to these materials without obtaining permission. Users can also use the materials for non-commercial educational and research purposes in accordance with fair use. For other uses or to obtain high resolution images, please contact the copyright holder.",1977; 1977,Building the Detroit Renaissance Center,rencen_03e; http://cdm17409.contentdm.oclc.org/cdm/ref/collection/rencen/id/3,View of the atrium in the completed Renaissance Center.,"Renaissance Center (Detroit, Mich.)","Image of the atrium within the Renaissance Center, looking up at a large concrete structure. Trees, plants, and people can be seen in this picture of a restaurant.","Detroit, Michigan; 1970s",still image
