In [67]:
import pandas
import re
from lxml import etree
from pymarc import map_records, map_xml, XMLWriter, Field

### Input and output file control
The directory `input/cuc-export` is actually a symlink to another directory. Note that on macOS you can't use Finder
aliases for such links; you have to create actual symlinks with `ln -s` for this to work.

#### File preparation
The `cuclookup` file listed below needs to be a CSV (e.g. generated in MarcEdit) containing the
`020$a` and `035$a` values exported from the CUC file. The columns must be named exactly `020$a` and `035$a`.

In [68]:
# St. Clair source exports (MARCXML) that will be enriched
chathamxml = '../stclair-duplicates/input/chatham-records-20210928-fixed.xml'
windsorxml = '../stclair-duplicates/input/windsor-records-20210928-fixed.xml'

# The CUC export is split into five numbered parts (01..05)
cucfiles = [
    'input/cuc-export/CUC_bib_{:02d}_final.xml'.format(part)
    for part in range(1, 6)
]

# CSV with the 020$a / 035$a columns extracted from the CUC export
cuclookup = 'input/CUC-020-035-lookup.csv'

# Destination files for the enriched St. Clair records
chathamoutxml = 'output/chatham-records-cuc035-20220319.xml'
windsoroutxml = 'output/windsor-records-cuc035-20220319.xml'

### Build CUC lookup dictionary
This will be used to look up any 020s found in the St. Clair files and get the corresponding 035s if found.
If more than one 020 exists for a particular record, an additional dictionary entry is created. Records where 035 is `nan` are excluded.

A single 035 cell can also hold multiple values separated by `;`; these are handled individually in the mapping function.

In [69]:
# Load the CUC lookup table from the CSV at `cuclookup`;
# construct_dict reads its '020$a' and '035$a' columns.
cuc_df = pandas.read_csv(cuclookup)
# Maps an ISBN string to the raw 035$a cell text of the CUC row it came from.
# Filled in by construct_dict and read by the mapping function.
cuc_dict = {}
def construct_dict(line):
    """Register every ISBN of one lookup row in the module-level ``cuc_dict``.

    Parameters
    ----------
    line : mapping (e.g. a DataFrame row)
        Must provide the keys ``'020$a'`` (one or more ISBNs separated by
        ``;``) and ``'035$a'`` (the 035 text to associate with them).

    Returns
    -------
    None
        The function works purely by side effect on ``cuc_dict``.

    Rows where either cell is missing are skipped. ISBNs are stripped of
    surrounding whitespace and empty fragments (e.g. from a trailing ``;``)
    are ignored, so that ``"a; b"`` registers ``"b"`` rather than ``" b"``.
    A later row with the same ISBN overwrites an earlier one.
    """
    ohtwooh = str(line['020$a'])
    ohthreefive = str(line['035$a'])
    # str() turns a missing (NaN) cell into the literal string 'nan'
    if ohthreefive == 'nan' or ohtwooh == 'nan':
        return
    for isbn in ohtwooh.split(';'):
        isbn = isbn.strip()
        if isbn:
            cuc_dict[isbn] = ohthreefive

# Run construct_dict on every row (axis=1) to fill cuc_dict; the Series of
# None values that apply() returns is not used.
cuc_df.apply(construct_dict, axis=1)

0         None
1         None
2         None
3         None
4         None
          ... 
395315    None
395316    None
395317    None
395318    None
395319    None
Length: 395320, dtype: object

### ISBN Helper functions
Adapted from [ISBN-13 converter (Python recipe)](https://code.activestate.com/recipes/498104-isbn-13-converter/)

In [70]:
def check_digit_10(isbn):
    """Compute the ISBN-10 check character for a 9-character digit string.

    Each digit is multiplied by its 1-based position, the products are
    summed and the total is reduced modulo 11. A remainder of 10 is
    written as 'X'; any other remainder is returned as its decimal string.

    Raises AssertionError if ``isbn`` is not exactly 9 characters long and
    ValueError if a character is not a digit.
    """
    assert len(isbn) == 9
    weighted_total = 0
    for position, char in enumerate(isbn, start=1):
        weighted_total += position * int(char)
    remainder = weighted_total % 11
    return 'X' if remainder == 10 else str(remainder)

def check_digit_13(isbn):
    """Compute the ISBN-13 check digit for a 12-character digit string.

    Digits at even 0-based positions are weighted 1 and those at odd
    positions are weighted 3. The check digit is whatever must be added to
    the weighted total to reach a multiple of 10, returned as a one-character
    string.

    Raises AssertionError if ``isbn`` is not exactly 12 characters long and
    ValueError if a character is not a digit.
    """
    assert len(isbn) == 12
    weighted_total = 0
    for index, char in enumerate(isbn):
        weight = 3 if index % 2 else 1
        weighted_total += weight * int(char)
    # The outer modulo folds the "10" case back to 0
    return str((10 - weighted_total % 10) % 10)

def convert_10_to_13(isbn):
    """Convert a 10-character ISBN into its 13-digit form.

    The old check character (last position) is dropped, '978' is put in
    front of the remaining nine characters, and a fresh ISBN-13 check digit
    from ``check_digit_13`` is appended.

    Raises AssertionError if ``isbn`` is not exactly 10 characters long.
    """
    assert len(isbn) == 10
    body = ''.join(('978', isbn[:-1]))
    return body + check_digit_13(body)

def convert_13_to_10(isbn):
    """Convert a 13-character ISBN into a 10-character one.

    The first three characters and the last character are discarded, and
    the ISBN-10 check character from ``check_digit_10`` is appended to the
    remaining nine. Note that the discarded three-character prefix is not
    inspected here; callers that care whether it is '978' must check it
    themselves.

    Raises AssertionError if ``isbn`` is not exactly 13 characters long.
    """
    assert len(isbn) == 13
    core = isbn[3:-1]
    return core + check_digit_10(core)

### Define mapping function
This function will be run on each record in the St. Clair dataset.

In [89]:
def fieldmatch(array):
    """Pick the preferred 035 value out of a list of candidate strings.

    Candidates are checked for the prefixes ``(OCoLC)``, ``(Sirsi)`` and
    ``(Local)``, in that order of preference. All candidates are tried for
    the first prefix before moving on to the next, so an ``(OCoLC)`` value
    wins regardless of where it sits in the list.

    Parameters
    ----------
    array : iterable of str
        Candidate 035 values, typically the result of splitting a lookup
        cell on ``;``. Surrounding whitespace left over from that split is
        ignored.

    Returns
    -------
    str or None
        The first matching candidate (whitespace-stripped), or None if no
        candidate starts with one of the prefixes.
    """
    # Raw strings: '\(' is not a valid escape in an ordinary string literal
    patterns = [r'\(OCoLC\).*', r'\(Sirsi\).*', r'\(Local\).*']
    for pattern in patterns:
        regex_object = re.compile(pattern)
        for string in array:
            # match() anchors at the start, so strip padding such as the
            # space in "(Sirsi)a1; (OCoLC)123" before testing
            result = regex_object.match(string.strip())
            if result is not None:
                return result.group()
    return None


# Helper for stclaircuc_map: derive the ISBN spellings to look up in cuc_dict.
def _isbn_variants(isbnraw):
    """Return the lookup spellings for one raw ISBN, or None if it is unusable.

    Hyphens are removed and the text is upper-cased (so a trailing 'x'
    check character is accepted). A 10-character ISBN yields its ISBN-13
    equivalent as well. A 13-digit ISBN yields an ISBN-10 equivalent only
    when it starts with '978', because other prefixes have no ISBN-10 form
    and stripping them would produce the ISBN-10 of a different book.

    The returned list keeps the order plain ISBN-13, plain ISBN-10, dashed
    ISBN-13, dashed ISBN-10 (ISBN-10 entries omitted when not applicable).
    The dashed forms use fixed 3-1-3-5-1 and 1-3-5-1 groupings.
    """
    isbn_nodash = isbnraw.replace('-', '').upper()
    if re.fullmatch(r'\d{9}[\dX]', isbn_nodash):
        isbn_13 = convert_10_to_13(isbn_nodash)
        isbn_10 = isbn_nodash
    elif re.fullmatch(r'\d{13}', isbn_nodash):
        isbn_13 = isbn_nodash
        isbn_10 = convert_13_to_10(isbn_nodash) if isbn_nodash.startswith('978') else None
    else:
        # Wrong length or non-digit characters: likely not an ISBN at all
        return None

    isbn_13_dashed = "-".join([isbn_13[0:3], isbn_13[3], isbn_13[4:7], isbn_13[7:12], isbn_13[12]])
    variants = [isbn_13]
    if isbn_10 is not None:
        variants.append(isbn_10)
    variants.append(isbn_13_dashed)
    if isbn_10 is not None:
        variants.append("-".join([isbn_10[0], isbn_10[1:4], isbn_10[4:9], isbn_10[9]]))
    return variants


# This is the mapping function that is run on each source record
def stclaircuc_map(record):
    """Add a CUC-derived 035 to ``record`` when its ISBN is known, then write it.

    The first whitespace-separated token of the record's first 020 field is
    treated as the ISBN. Its spellings are looked up in ``cuc_dict``; for
    the first spelling found, ``fieldmatch`` picks the preferred 035 value,
    which is added as a new ``035 $a`` unless an existing 035 already has
    that value as its first ``$a``.

    Every readable record is written exactly once to the module-level
    ``writer``, enriched or not. The module-level ``counter`` is incremented
    for each record that receives a new 035. A falsy ``record`` is reported
    on stdout and skipped.

    Parameters
    ----------
    record : pymarc Record or None
        The record handed over by ``map_xml``.
    """
    global counter
    if not record:
        print('Could not read record')
        return

    # get_fields() returns an empty list when there is no 020, so no
    # None/KeyError special-casing is needed for records without an ISBN.
    isbn_variants = None
    ohtwooh_fields = record.get_fields('020')
    if ohtwooh_fields:
        # Keep only the first token: anything after it is a qualifier
        tokens = ohtwooh_fields[0].value().split()
        if tokens:
            isbn_variants = _isbn_variants(tokens[0])

    # Try to find a CUC record with one of those ISBN variants
    for isbn in isbn_variants or []:
        find035 = cuc_dict.get(isbn)
        if find035 is None:
            continue
        # Check if one of the 035s for that record is among the ones we're looking for
        ohthreefive = fieldmatch(find035.split(';'))
        if ohthreefive is not None:
            # Skip the addition if that 035 exists in the record already
            already_present = False
            for existing in record.get_fields('035'):
                subfields = existing.get_subfields('a')
                if subfields and subfields[0] == ohthreefive:
                    already_present = True
                    break
            if not already_present:
                record.add_field(
                    Field(
                        tag = '035',
                        indicators = ['',''],
                        subfields = [
                            'a', ohthreefive
                        ]
                    )
                )
                counter = counter + 1
        # Stop at the first variant CUC knows about, whether or not it
        # produced a usable 035.
        break

    writer.write(record)

### Run the mapping function
This is where the magic happens. The mapping function is run on the Chatham and Windsor files.

In [92]:
# stclaircuc_map reads `writer` and updates `counter` as module-level globals,
# so both must be (re)initialised before each run.
writer = XMLWriter(open(chathamoutxml,'wb'))
counter = 0
try:
    map_xml(stclaircuc_map, chathamxml)
finally:
    # Close the writer even if a record fails part-way through, so the
    # output file handle is not left open.
    writer.close()
print("Enriched ", counter, " Chatham records!")

Enriched  1240  Chatham records!


In [93]:
# stclaircuc_map reads `writer` and updates `counter` as module-level globals,
# so both must be (re)initialised before each run.
writer = XMLWriter(open(windsoroutxml,'wb'))
counter = 0
try:
    map_xml(stclaircuc_map, windsorxml)
finally:
    # Close the writer even if a record fails part-way through, so the
    # output file handle is not left open.
    writer.close()
print("Enriched ", counter, " Windsor records!")

Enriched  2978  Windsor records!


### Old, inefficient way
This was a terrible way to solve the problem: XPath lookups are very slow. Stored here for posterity.

In [None]:
# One shared parser for all CUC files; entity resolution is switched off
cucparser = etree.XMLParser(resolve_entities=False, encoding='utf-8')

# Every CUC file is parsed into its own in-memory tree, so this
# probably takes a good deal of memory...
cuctrees = [etree.parse(infile, cucparser) for infile in cucfiles]

# For reference, this is the old function that used a live xpath lookup in the CUC.
# It is kept unchanged for posterity and is not called anywhere in the cells shown here.
def stclaircuc_map_old(record):
    """Legacy mapping function: find 035s by running an XPath query on the CUC trees.

    For a record with an 020 field, the first whitespace-separated token of
    that field is expanded into four ISBN spellings. One XPath expression is
    built from them and evaluated against each tree in the module-level
    ``cuctrees`` until a tree returns a non-empty result. Every returned
    text node is then added to the record as a new ``035 $a``, with no check
    for 035s already present. The record is written to the module-level
    ``writer`` in all cases where it could be read.

    Parameters
    ----------
    record : pymarc Record or None
        A falsy value is reported on stdout and skipped.
    """
    if record:
        # Only used by the diagnostic print() calls below
        bibid = int(record['001'].value())
        ohtwooh = record['020']
        if (ohtwooh != None):
            # Extract only the first part of the 020 string (before first space separator)
            isbnraw = record['020'].value().split( )[0]
            # Convert it into other ISBN variants
            isbn_nodash = isbnraw.replace('-','')
            if len(isbn_nodash) == 10:
                isbn_13 = convert_10_to_13(isbn_nodash)
                isbn_10 = isbn_nodash
            elif len(isbn_nodash) == 13:
                isbn_13 = isbn_nodash
                isbn_10 = convert_13_to_10(isbn_nodash)
            else:
                # If the 020 string is neither 10 nor 13 characters long, it's likely not an ISBN,
                # keep this record as-is and go to the next.
                writer.write(record)
                return
            # Dashed spellings use fixed 3-1-3-5-1 and 1-3-5-1 groupings
            isbn_13_dashed = "-".join([isbn_13[0:3], isbn_13[3], isbn_13[4:7], isbn_13[7:12], isbn_13[12]])
            isbn_10_dashed = "-".join([isbn_10[0], isbn_10[1:4], isbn_10[4:9], isbn_10[9]])
            
            isbn_variants = [isbn_13, isbn_10, isbn_13_dashed, isbn_10_dashed]
            #print(bibid, ': ', isbn_variants)
            
            # Try to find a CUC record with one of those ISBN variants.
            # The expression selects 020 datafields having a subfield equal to any
            # variant, steps up to the parent, and collects all text nodes under
            # that parent's 035 datafields. The variants are spliced into the
            # expression as quoted literals without escaping.
            xquery = "//marc:datafield[@tag='020'][marc:subfield='" \
                     + "' or marc:subfield='".join(isbn_variants) \
                     +"']/../marc:datafield[@tag='035']//text()"
            #print(bibid, ': ', xquery)
            # Stop at the first tree that yields any result.
            # NOTE: find035 is only assigned inside this loop, so with an empty
            # cuctrees list the loop after it would raise NameError.
            for cuctree in cuctrees:
                find035 = cuctree.xpath(xquery,namespaces = {"marc": "http://www.loc.gov/MARC21/slim"})
                if len(find035) > 0:
                    print(bibid, ': ', isbn_variants)
                    print(bibid, ': ', find035)
                    break
            
            # Add one 035 $a per returned text node (empty list: nothing is added)
            for ohthreefive in find035:
                record.add_field(
                    Field(
                        tag = '035',
                        indicators = ['',''],
                        subfields = [
                            'a', ohthreefive
                        ]
                    )
                )
        writer.write(record)
    else:
        print('Could not read record')