In [6]:
import pandas
from lxml import etree
from pymarc import map_records, map_xml, XMLWriter, Field

### Input and output file control
The directory `input/cuc-export` is actually a symlink to another directory. Note that on macOS you can't use Finder
aliases for such links; you have to create actual symlinks with `ln -s` for this to work.

In [167]:
# --- Inputs: St. Clair record exports (one file per campus) ---
chathamxml = '../stclair-duplicates/input/chatham-records-20210928-fixed.xml'
windsorxml = '../stclair-duplicates/input/windsor-records-20210928-fixed.xml'

# --- Inputs: the CUC bibliographic export, split across five files ---
cucfiles = [
    'input/cuc-export/CUC_bib_01_final.xml',
    'input/cuc-export/CUC_bib_02_final.xml',
    'input/cuc-export/CUC_bib_03_final.xml',
    'input/cuc-export/CUC_bib_04_final.xml',
    'input/cuc-export/CUC_bib_05_final.xml',
]

# A single CUC file, handy for quick experiments against a smaller tree.
cucfile = 'input/cuc-export/CUC_bib_05_final.xml'

# --- Outputs: St. Clair records enriched with CUC 035 values ---
chathamoutxml = 'output/chatham-records-cuc035-20220319.xml'
windsoroutxml = 'output/windsor-records-cuc035-20220319.xml'

In [170]:
# One parser shared by all CUC files; entity resolution is switched off.
cucparser = etree.XMLParser(encoding='utf-8', resolve_entities=False)

# For a quick test against one export only, parse `cucfile` instead:
#   etree.parse(cucfile, cucparser)

# Parse every CUC export into its own in-memory tree.
# This probably takes a good deal of memory...
cuctrees = [etree.parse(infile, cucparser) for infile in cucfiles]

### ISBN Helper functions
Adapted from [ISBN-13 converter (Python recipe)](https://code.activestate.com/recipes/498104-isbn-13-converter/)

In [120]:
def check_digit_10(isbn):
    """Return the ISBN-10 check character for a nine-digit ISBN body.

    Each digit is weighted by its 1-based position; the weighted total
    modulo 11 is the check value, with 10 written as 'X'.

    isbn -- string of exactly nine digits (asserted).
    Returns a one-character string: '0'-'9' or 'X'.
    """
    assert len(isbn) == 9
    weighted_total = 0
    for position, digit in enumerate(isbn, start=1):
        weighted_total += position * int(digit)
    remainder = weighted_total % 11
    return 'X' if remainder == 10 else str(remainder)

def check_digit_13(isbn):
    """Return the ISBN-13 check digit for a twelve-digit ISBN body.

    Digits at even 0-based positions are weighted 1 and those at odd
    positions are weighted 3; the check digit brings the weighted total
    up to the next multiple of 10.

    isbn -- string of exactly twelve digits (asserted).
    Returns a one-character string '0'-'9'.
    """
    assert len(isbn) == 12
    weighted_total = sum(
        int(digit) * (3 if index % 2 else 1)
        for index, digit in enumerate(isbn)
    )
    # The outer modulo folds the "10" case back to 0.
    return str((10 - weighted_total % 10) % 10)

def convert_10_to_13(isbn):
    """Convert a ten-character ISBN-10 into its ISBN-13 form.

    The old check character is dropped, '978' is prepended, and a new
    ISBN-13 check digit is computed with check_digit_13().

    isbn -- string of exactly ten characters (asserted).
    Returns the thirteen-character ISBN-13 string.
    """
    assert len(isbn) == 10
    body = '978' + isbn[:9]
    return body + check_digit_13(body)

def convert_13_to_10(isbn):
    """Convert a thirteen-character ISBN-13 into an ISBN-10 string.

    The first three characters and the old check digit are dropped, and
    a new ISBN-10 check character is computed with check_digit_10().
    The three-character prefix is discarded without being inspected, so
    callers must decide for themselves whether the conversion is meaningful.

    isbn -- string of exactly thirteen characters (asserted).
    Returns the ten-character ISBN-10 string.
    """
    assert len(isbn) == 13
    body = isbn[3:12]
    return body + check_digit_10(body)

### Define mapping function
This function is run on each record in the St. Clair dataset: it looks up the record's ISBN in the CUC exports and copies any matching 035 values into the record.

In [171]:
def _isbn_variants(isbnraw):
    """Return the ISBN spellings to search for, or None if isbnraw is not ISBN-shaped.

    isbnraw -- first whitespace-separated token of an 020 field.

    The result is ordered [ISBN-13, ISBN-10, dashed ISBN-13, dashed ISBN-10].
    The ISBN-10 entries are left out when the ISBN-13 does not start with
    '978', because only that prefix has an ISBN-10 equivalent.
    """
    isbn_nodash = isbnraw.replace('-', '')
    try:
        if len(isbn_nodash) == 10:
            isbn_13 = convert_10_to_13(isbn_nodash)
            isbn_10 = isbn_nodash
        elif len(isbn_nodash) == 13:
            isbn_13 = isbn_nodash
            # Dropping any other prefix (e.g. 979) would yield an ISBN-10
            # that identifies a different book and could cause a false match.
            if isbn_nodash.startswith('978'):
                isbn_10 = convert_13_to_10(isbn_nodash)
            else:
                isbn_10 = None
        else:
            # Neither 10 nor 13 characters long: likely not an ISBN.
            return None
    except ValueError:
        # The check-digit helpers hit a non-digit character: not an ISBN.
        return None

    # Dashes are inserted at fixed positions (3-1-3-5-1 and 1-3-5-1).
    isbn_13_dashed = "-".join([isbn_13[0:3], isbn_13[3], isbn_13[4:7], isbn_13[7:12], isbn_13[12]])
    variants = [isbn_13]
    if isbn_10 is not None:
        variants.append(isbn_10)
    variants.append(isbn_13_dashed)
    if isbn_10 is not None:
        variants.append("-".join([isbn_10[0], isbn_10[1:4], isbn_10[4:9], isbn_10[9]]))
    return variants


def stclaircuc_map(record):
    """Copy matching CUC 035 values into one St. Clair record and write it out.

    Intended as the callback for pymarc's map_xml(). Relies on two
    module-level names: `cuctrees` (the parsed CUC exports) and `writer`
    (the open XMLWriter for the output file).

    The first token of the record's first 020 field is expanded into ISBN
    variants. The CUC trees are searched in order for a record whose 020 has
    a subfield equal to one of the variants; the search stops at the first
    tree with a hit. Every non-blank text node under that record's 035 fields
    is added to `record` as a new 035 $a. The record is always written,
    whether or not anything was added.

    record -- the pymarc record handed over by map_xml(); a falsy value is
              reported on stdout and skipped.
    """
    if not record:
        print('Could not read record')
        return

    # bibid is only used for log output, so a missing or non-numeric 001
    # must not abort the whole run.
    control_fields = record.get_fields('001')
    bibid = control_fields[0].value() if control_fields else None
    try:
        bibid = int(bibid)
    except (TypeError, ValueError):
        pass

    # Extract only the first part of the 020 string (before first space separator)
    ohtwooh = record.get_fields('020')
    tokens = ohtwooh[0].value().split() if ohtwooh else []
    isbn_variants = _isbn_variants(tokens[0]) if tokens else None
    if not isbn_variants:
        # No usable ISBN: keep this record as-is and go to the next.
        writer.write(record)
        return

    # Pass the ISBNs as XPath variables rather than splicing them into the
    # expression, so a quote character in the 020 data cannot break the query.
    predicate = " or ".join(
        "marc:subfield=$isbn%d" % index for index in range(len(isbn_variants))
    )
    xquery = "//marc:datafield[@tag='020'][" + predicate + "]" \
             "/../marc:datafield[@tag='035']//text()"
    variables = {"isbn%d" % index: value for index, value in enumerate(isbn_variants)}
    namespaces = {"marc": "http://www.loc.gov/MARC21/slim"}

    # Try to find a CUC record with one of those ISBN variants
    find035 = []
    for cuctree in cuctrees:
        hits = cuctree.xpath(xquery, namespaces=namespaces, **variables)
        # Drop whitespace-only text nodes (indentation in pretty-printed XML).
        find035 = [text for text in hits if text.strip()]
        if find035:
            print(bibid, ': ', isbn_variants)
            print(bibid, ': ', find035)
            break

    for ohthreefive in find035:
        record.add_field(
            Field(
                tag = '035',
                indicators = ['',''],
                subfields = [
                    'a', ohthreefive
                ]
            )
        )
    writer.write(record)

In [172]:
# Run the mapping over the Chatham records. stclaircuc_map() writes every
# record through the module-level `writer`.
writer = XMLWriter(open(chathamoutxml,'wb'))
try:
    map_xml(stclaircuc_map, chathamxml)
finally:
    # Always close the writer, even when the run fails or is interrupted
    # (this long-running cell has been stopped with KeyboardInterrupt before).
    # close() finishes the XML document and releases the file handle.
    writer.close()

73 :  ['9780525076452', '052507645X', '978-0-525-07645-2', '0-525-07645-X']
73 :  ['(Sirsi) AEJ-1110']
343 :  ['9781895579345', '1895579341', '978-1-895-57934-5', '1-895-57934-1']
343 :  ['(Sirsi) AEE-3382']
649 :  ['9780710063403', '0710063407', '978-0-710-06340-3', '0-710-06340-7']
649 :  ['(Sirsi) ABY-5118']
669 :  ['9780134930176', '013493017X', '978-0-134-93017-6', '0-134-93017-X']
669 :  ['(Sirsi) ACH-9008']
688 :  ['9780030615054', '0030615054', '978-0-030-61505-4', '0-030-61505-4']
688 :  ['(Sirsi) AEJ-2778']
730 :  ['9780674360334', '0674360338', '978-0-674-36033-4', '0-674-36033-8']
730 :  ['(Sirsi) AEJ-2867']
733 :  ['9780060134549', '0060134542', '978-0-060-13454-9', '0-060-13454-2']
733 :  ['(VGER)31049', '(CA-OTSC)31049-senlcdb-Voyager']
837 :  ['9781550210767', '1550210769', '978-1-550-21076-7', '1-550-21076-9']
837 :  ['(Sirsi) AEE-4345']
933 :  ['9780070651296', '0070651299', '978-0-070-65129-6', '0-070-65129-9']
933 :  ['(Sirsi) AEE-4950']
941 :  ['9780888790309', '08

5979 :  ['9780871920195', '0871920190', '978-0-871-92019-5', '0-871-92019-0']
5979 :  ['(Sirsi) AEG-4354']
5987 :  ['9780395205488', '0395205484', '978-0-395-20548-8', '0-395-20548-4']
5987 :  ['(Sirsi) AEM-3539']
5994 :  ['9780823002207', '0823002209', '978-0-823-00220-7', '0-823-00220-9']
5994 :  ['(Sirsi) AEG-4397']
5995 :  ['9780486200828', '0486200825', '978-0-486-20082-8', '0-486-20082-5']
5995 :  ['(Sirsi) AHM-6178']
6021 :  ['9780772005533', '0772005532', '978-0-772-00553-3', '0-772-00553-2']
6021 :  ['(Sirsi) AEM-3949']
6022 :  ['9780771090295', '0771090293', '978-0-771-09029-5', '0-771-09029-3']
6022 :  ['(Sirsi) AEM-3956']
6023 :  ['9780887680250', '0887680259', '978-0-887-68025-0', '0-887-68025-9']
6023 :  ['(Sirsi) AEM-3959']
6024 :  ['9780771058158', '0771058152', '978-0-771-05815-8', '0-771-05815-2']
6024 :  ['(Sirsi) AAD-2231']
6026 :  ['9780772002198', '0772002193', '978-0-772-00219-8', '0-772-00219-3']
6026 :  ['(Sirsi) AEM-3992']
6027 :  ['9780772005670', '0772005672

6597 :  ['9780195401752', '0195401751', '978-0-195-40175-2', '0-195-40175-1']
6597 :  ['(Sirsi) AEG-8988']
6610 :  ['9780771022128', '0771022123', '978-0-771-02212-8', '0-771-02212-3']
6610 :  ['(Sirsi) AEN-3481']
6614 :  ['9780771022159', '0771022158', '978-0-771-02215-9', '0-771-02215-8']
6614 :  ['(Sirsi) AEN-3489']
6617 :  ['9780772004437', '0772004439', '978-0-772-00443-7', '0-772-00443-9']
6617 :  ['(Sirsi) AEG-9042']
6619 :  ['9780771091506', '0771091508', '978-0-771-09150-6', '0-771-09150-8']
6619 :  ['(Sirsi) AEN-3223']
6621 :  ['9780887500770', '0887500773', '978-0-887-50077-0', '0-887-50077-3']
6621 :  ['(Sirsi) AEG-9081']
6622 :  ['9780442229184', '0442229185', '978-0-442-22918-4', '0-442-22918-5']
6622 :  ['(Sirsi) AEN-3606']
6627 :  ['9780771091704', '0771091702', '978-0-771-09170-4', '0-771-09170-2']
6627 :  ['(Sirsi) AEN-3690']
6649 :  ['9780887706004', '0887706002', '978-0-887-70600-4', '0-887-70600-2']
6649 :  ['(Sirsi) AEN-3890']
6654 :  ['9780771074967', '0771074964

KeyboardInterrupt: 