In [1]:
# Pull data from the EFG OAI-PMH endpoint.

import datetime
import pathlib
import pydash
import requests
import time
import urllib.parse
import xmltodict

def requester(url, retries, delay, timeout=120):

    ''' Request web resource with timed retries in case of error.

    url: address to fetch with an HTTP GET.
    retries: maximum number of attempts.
    delay: seconds to sleep before every attempt (including the first).
    timeout: seconds requests may wait on the server before the attempt
        counts as failed; pass None to wait indefinitely.

    Returns the requests response of the first attempt that completes.
    The HTTP status is not inspected here.

    Raises Exception when no attempt succeeded; the last requests error,
    if any, is attached as the cause. '''

    last_error = None

    for _ in range(retries):
        # sleeping ahead of each attempt also spaces out successive calls.
        time.sleep(delay)
        try:
            return requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as error:
            # only network-level failures are retried; KeyboardInterrupt
            # and programming errors are allowed to propagate.
            last_error = error

    raise Exception(f'Could not connect to url {url}') from last_error

def resumption_token(data):

    ''' Extract resumption token.

    data: XML text of a ListRecords response.

    Returns the token string, or None when the response carries no
    resumptionToken element or the element is empty. '''

    parsed = xmltodict.parse(data)
    node = pydash.get(parsed, 'oai:OAI-PMH.oai:ListRecords.oai:resumptionToken')

    # xmltodict represents an element with attributes as a dict holding
    # the text under '#text', and one without attributes as a bare string.
    if isinstance(node, dict):
        node = node.get('#text')

    # an empty element means there is nothing left to request.
    return node or None

def xml_save(xml, flavour):

    ''' Save XML.

    xml: text to write.
    flavour: dict with the keys 'set', 'timestamp' and 'token'; the file
        is written to data/<set>/<timestamp>/<token>.xml below the
        current working directory.

    Missing directories are created. The file name is printed once the
    file has been written. '''

    path = pathlib.Path.cwd() / 'data'
    path = path / flavour['set'] / flavour['timestamp']
    path = path / f"{flavour['token']}.xml"
    path.parent.mkdir(parents=True, exist_ok=True)

    # write UTF-8 explicitly so the result does not depend on the locale.
    with open(path, 'w', encoding='utf-8') as export:
        export.write(xml)

    print(path.name)

retry = 4  # global retry: attempts per request.
delay = 10  # global delay: seconds slept before each attempt.

# address every OAI-PMH verb below is sent to.
endpoint = 'https://dnet-prod.efg.d4science.org/efg/mvc/oai/oai.do'

# ask the endpoint which sets it offers.
r = requester(f'{endpoint}?verb=ListSets', retry, delay)
set_list = xmltodict.parse(r.text)
set_list = pydash.get(set_list, 'OAI-PMH.ListSets.set')
if isinstance(set_list, dict):
    # xmltodict yields a bare dict, not a list, for a single <set> element.
    set_list = [set_list]
set_list = [x['setSpec'] for x in set_list]

# test sample
set_list = [x for x in set_list if x in ['barch']]

for x in set_list:

    # one timestamped folder per set and run; the first page is named after the set.
    timestamp = datetime.datetime.now().strftime('%y%m%d%H%M%S')
    config = {'set': x, 'token': x, 'timestamp': timestamp}
    url = f'{endpoint}?verb=ListRecords&metadataPrefix=efg&set={x}'
    payload = requester(url, retry, delay).text

    xml_save(payload, config)
    token = resumption_token(payload)

    # keep requesting pages for as long as the response hands back a token.
    while token:
        # escape every reserved character (including '/') so the token is
        # safe both as a query value and as a single file name.
        quoted = urllib.parse.quote(token, safe='')
        config['token'] = quoted
        r = requester(
            f'{endpoint}?verb=ListRecords&resumptionToken={quoted}', retry, delay)
        payload = r.text
        xml_save(payload, config)
        token = resumption_token(payload)

print('all done.')

barch.xml
254%7Cefg%7C%2A%20AND%20%28set%20%3D%20%22barch%22%29%20%7C100%7C62ed2a6f7222126baaa9fb07%7Cfalse%7Cbarch.xml
254%7Cefg%7C%2A%20AND%20%28set%20%3D%20%22barch%22%29%20%7C200%7C62ed2a707222126baaa9fb6b%7Cfalse%7Cbarch.xml
all done.
