In [16]:
# Cf. https://www.openarchives.org/OAI/openarchivesprotocol.html#ListRecords
import urllib.request as rq
from bs4 import BeautifulSoup as bs
import os
from datetime import datetime

                
# MAIN PARAMETERS
output_dir = os.path.join(os.getcwd(), 'output')
baseurl = 'https://collections.britishart.yale.edu/oaicatmuseum/OAIHandler'
setspec = ['ycba:pd', 'ycba:ps', 'ycba:frames']
metaf = 'lido'


def timestamp():
    return datetime.utcnow().isoformat()


def create_logfile(log_path):
    """
    Creates a (txt) log file
    """
    if os.path.isfile(log_path):
        os.remove(log_path)
    log_stream = open(log_path, 'w+')
    return log_stream


def write_to_log(log_path, message):
    """
    Writes to (txt) log file
    """
    with open(log_path, 'a') as log:

        if isinstance(message, str):
            mes_str = '\n' + '/!\\ ' + message
            log.write(mes_str)

        elif isinstance(message, list):
            mes_main = '\n' + '/!\\ ' + message[0]
            log.write(mes_main)
            for p in message[1:]:
                mes_p = '\n' + '\t' + ' - ' + p
                log.write(mes_p)

                
def oaipmh_harvest(baseurl, set_list, data_format, output_dir):
    # main variables/directories and log
    run_dir = os.path.join(output_dir, timestamp())
    data_dir = os.path.join(run_dir, 'data')
    log_dir = os.path.join(run_dir, 'log')
    log_path = log_dir + '/log.txt'
    for f in [run_dir, data_dir, log_dir]:
        os.mkdir(f)
    
    # init log stream
    log_stream = create_logfile(log_path)
    
    # loop to fetch files
    for s in set_list:
        set_dir = os.path.join(data_dir, s)
        os.mkdir(set_dir)
        i = 0 
        write_to_log(log_path, "-------- logs : setspec '" + s +"' --------")
        # try:
        while i >= 0:
        # while i == 0:
            # rt = str()
            filepath = os.path.join(set_dir, 'page_' + str(i) + '.xml')
            full_url = baseurl + '?verb=ListRecords&metadataPrefix=' + data_format + '&set=' + s
 
            # print(full_url)
            # logs
            mes_1 = timestamp() + ' --> Downloading page ' + str(i) + ' ... '
            print(mes_1)
            mes_11 ='from ' + full_url
            mes_111 = 'to   :' + filepath
            write_to_log(log_path, [mes_1, mes_11, mes_111])
            
            # fetch url to file
            if i > 0:
                full_url += '&resumptionToken=' + rt
            print(full_url)
            rq.urlretrieve(full_url, filepath)
            i += 1
            
            # get Resumption token
            # with open(test_file) as fp: soup = bs(fp, 'xml')
            with open(filepath) as fp: soup = bs(fp, 'xml')
            rt_data = soup.find_all('resumptionToken')
            if len(rt_data) > 0 and i > 0:
                rt = rt_data[0].get_text()
                mes_2 = '--> Resumption token: ' + rt
                write_to_log(log_path, mes_2)
                print(mes_2)

            # check response if issue
            else:
                print('Finished or issue ...')
                error = soup.find_all('error')
                if len(error) > 0:
                    mes = 'Error code: ' + error[0]['code'] + '\n--> ' + error[0].get_text()
                    write_to_log(log_path, mes)
                    print(mes)

                break
        # except:
  #   print('Finished or issue ...')
    log_stream.close()

# start harvest
oaipmh_harvest(baseurl, setspec, metaf, output_dir)


2019-11-15T20:51:34.809025 --> Downloading page 0 ... 
https://collections.britishart.yale.edu/oaicatmuseum/OAIHandler?verb=ListRecords&metadataPrefix=lido&set=ycba:pd
--> Resumption token: 0001-01-01:9999-12-31:ycba%3Apd:500:lido
2019-11-15T20:51:56.078661 --> Downloading page 1 ... 
https://collections.britishart.yale.edu/oaicatmuseum/OAIHandler?verb=ListRecords&metadataPrefix=lido&set=ycba:pd&resumptionToken=0001-01-01:9999-12-31:ycba%3Apd:500:lido
Finished or issue ...
Error code: badArgument
--> The request includes illegal arguments, is missing required arguments, includes a repeated argument, or values for arguments have an illegal syntax.
2019-11-15T20:51:56.481334 --> Downloading page 0 ... 
https://collections.britishart.yale.edu/oaicatmuseum/OAIHandler?verb=ListRecords&metadataPrefix=lido&set=ycba%3Aframes
--> Resumption token: 0001-01-01:9999-12-31:ycba%3Aframes:500:lido
2019-11-15T20:52:15.947381 --> Downloading page 1 ... 
https://collections.britishart.yale.edu/oaicatmus