In [None]:




import re
import xml.etree.ElementTree as ET

# Source XML file and the two destination files (print vs. online records).
input_file = r"C:/Users/mchung94/Downloads/file_0.xml"
output_file_regular = r"C:/Users/mchung94/Downloads/OCLCNumbers_Nomatch_print.xml"
output_file_online = r"C:/Users/mchung94/Downloads/OCLCNumbers_Nomatch_online.xml"

# Load the whole document and keep a handle on its root element.
tree = ET.parse(input_file)
root = tree.getroot()

# Captures the run of digits that immediately follows an "(OCoLC)" prefix.
pattern = re.compile(r"\(OCoLC\)(\d+)")

# Accumulators, one per output file.
oclc_regular = []
oclc_online = []

for record in root.iter("record"):
    # A record is treated as online when any of its direct-child 006 control
    # fields, after trimming surrounding whitespace, begins with "m".
    is_online = False
    for cf in record.findall("controlfield"):
        if cf.get("tag") == "006" and (cf.text or "").strip().startswith("m"):
            is_online = True
            break

    # Search every code="a" subfield anywhere under the record and keep the
    # first "(OCoLC)<digits>" hit found in each one.
    hits = (
        pattern.search(sf.text or "")
        for sf in record.iter("subfield")
        if sf.get("code") == "a"
    )
    record_numbers = [hit.group(1) for hit in hits if hit]

    # Records that yielded no number contribute nothing to either list.
    if record_numbers:
        destination = oclc_online if is_online else oclc_regular
        destination.extend(record_numbers)

# Function to write plain text XML list
def write_xml(filepath, numbers):
    """Write *numbers* as one-per-line text inside an <OCLCNumbers> root.

    The file is opened in text mode as UTF-8 and overwritten if it exists.
    Each entry is converted with ``str()`` and XML-escaped, so an entry
    containing ``&``, ``<`` or ``>`` cannot produce a malformed document.
    Digit-only entries are written exactly as given.

    Args:
        filepath: Destination path of the XML file.
        numbers: Iterable of values to write, one per line.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Function-scope import so the helper does not rely on file-level imports.
    from xml.sax.saxutils import escape

    with open(filepath, "w", encoding="utf-8") as f:
        f.write('<?xml version="1.0" encoding="utf-8"?>\n')
        f.write("<OCLCNumbers>\n")
        f.writelines(f"{escape(str(num))}\n" for num in numbers)
        f.write("</OCLCNumbers>\n")

# Write each non-empty list to its own file; a category with no numbers
# produces no file at all.
if oclc_regular:
    write_xml(output_file_regular, oclc_regular)
if oclc_online:
    write_xml(output_file_online, oclc_online)

# Report only what was actually written. The success message used to be
# printed unconditionally, which was misleading when no numbers were found
# and therefore nothing had been saved.
if oclc_regular or oclc_online:
    print("✅ OCLC numbers have been extracted and saved.")
    if oclc_regular:
        print(f" - Regular records: {output_file_regular}")
    if oclc_online:
        print(f" - Online records: {output_file_online}")
else:
    print("No OCLC numbers were found; no output files were written.")


✅ OCLC numbers have been extracted and saved.
 - Regular records: C:/Users/mchung94/Downloads/OCLCNumbers.xml
 - Online records: C:/Users/mchung94/Downloads/OCLCNumbers_NotImported_online.xml
