Harvesting the data as required from AMS Acta

In [None]:
import requests
import xml.etree.ElementTree as ET
import re
import json
from sickle import Sickle
import os

# --- Configuration ---
OAI_ENDPOINT = 'https://amsacta.unibo.it/cgi/oai2'
EXPORT_URL_TEMPLATE = (
    'https://amsacta.unibo.it/cgi/export/eprint/{eid}/JSON/amsacta-eprint-{eid}.json'
)
OAI_IDENTIFIER_PATH = './/{http://www.openarchives.org/OAI/2.0/}identifier'
TRAILING_DIGITS = re.compile(r'(\d+)$')
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given


def extract_eprint_id(raw_xml):
    """Return the trailing numeric ID of a record's OAI identifier.

    Args:
        raw_xml: XML text of a single OAI-PMH record.

    Returns:
        The run of digits that ends the first ``identifier`` element found
        (as a string), or ``None`` when the XML cannot be parsed, has no
        identifier element, the element is empty, or it does not end in
        digits.
    """
    try:
        root = ET.fromstring(raw_xml)
    except ET.ParseError:
        return None

    identifier = root.find(OAI_IDENTIFIER_PATH)
    if identifier is None or not identifier.text:
        return None

    # Strip first: trailing whitespace would otherwise defeat the `$` anchor.
    match = TRAILING_DIGITS.search(identifier.text.strip())
    return match.group(1) if match else None


def unwrap_export_payload(data):
    """Return the record carried by a decoded JSON export payload.

    A list payload yields its first element, a dict payload is returned
    unchanged, and anything else (including an empty list) yields ``None``.
    """
    if isinstance(data, list):
        return data[0] if data else None
    if isinstance(data, dict):
        return data
    return None


def fetch_eprint_metadata(eid, session=None, timeout=REQUEST_TIMEOUT):
    """Download and decode the JSON export of one ePrint.

    Args:
        eid: ePrint ID used to fill in the export URL.
        session: Optional ``requests.Session`` reused across calls; the
            module-level ``requests`` API is used when omitted.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The record from the export payload, or ``None`` when the request
        fails, the status is not 200, or the payload is empty or of an
        unexpected type. Every skip reason is printed.
    """
    http = requests if session is None else session
    url = EXPORT_URL_TEMPLATE.format(eid=eid)
    try:
        res = http.get(url, timeout=timeout)
        if res.status_code != 200:
            print(f"⚠️ HTTP {res.status_code} for ePrint {eid}, skipping...")
            return None

        content = res.text.strip()
        print(f"---\n[ePrint {eid}] Response snippet:\n{content[:500]}")  # print first 500 chars

        data = res.json()
    except (requests.RequestException, ValueError) as e:
        # Network errors, timeouts and undecodable JSON skip this record only.
        print(f"❌ Failed to fetch ePrint {eid}: {e}")
        return None

    if not data:
        print(f"⚠️ Empty JSON for ePrint {eid}, skipping...")
        return None

    record = unwrap_export_payload(data)
    if record is None:
        print(f"⚠️ Unexpected JSON type for ePrint {eid}, skipping...")
    return record


# The guard is true both in a notebook kernel and when run as a script, so the
# pipeline still executes there and assigns the same global names; it only
# keeps the network harvest from starting when the helpers are imported.
if __name__ == "__main__":
    # --- Step 1: Harvest ePrint IDs using OAI-PMH ---
    sickle = Sickle(OAI_ENDPOINT)
    records = sickle.ListRecords(metadataPrefix='oai_dc')

    eprint_ids = set()

    print("🔍 Extracting ePrint IDs from OAI-PMH records...")
    for record in records:
        if record.deleted:
            continue
        eprint_id = extract_eprint_id(record.raw)
        if eprint_id is not None:
            eprint_ids.add(eprint_id)

    # IDs are digit strings; sort numerically so '10' does not precede '2'.
    ordered_ids = sorted(eprint_ids, key=int)

    print(f"✅ Found {len(eprint_ids)} ePrint IDs.")
    print("Sample ePrint IDs:", ordered_ids[:10])

    # --- Step 2: Download JSON metadata for each ePrint ID ---
    exported_metadata = []

    print("\n📥 Downloading JSON metadata for harvested ePrint IDs...")
    # One session reuses the HTTP connection across all requests.
    with requests.Session() as session:
        for count, eid in enumerate(ordered_ids, start=1):
            metadata = fetch_eprint_metadata(eid, session)
            if metadata is not None:
                exported_metadata.append(metadata)

            # Report on every 50th ID, even when that ID itself was skipped,
            # and show how many records were actually downloaded.
            if count % 50 == 0:
                print(
                    f"✅ Processed {count} IDs, downloaded metadata for "
                    f"{len(exported_metadata)} records so far..."
                )

    # --- Step 3: Save all valid metadata to JSON file ---
    output_file = 'amsacta_cleaned_metadata.json'
    print("Saving JSON to:", os.path.abspath(output_file))

    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(exported_metadata, f, ensure_ascii=False, indent=2)
        print(f"\n📦 Saved {len(exported_metadata)} valid records to '{output_file}'")
    except (OSError, TypeError, ValueError) as e:
        print(f"❌ Failed to save JSON file: {e}")


In [None]:
import json
import csv

# === CONFIGURATION ===
input_file = 'amsacta_cleaned_metadata.json'    # Your JSON file path
csv_file = '/Users/martinapensalfini/Downloads/POSTPROCESS-iris-data-2025-05-27/ODS_L1_IR_ITEM_CON_PERSON.csv'                       # Your CSV file path (with ORCID column)
output_file = '/Users/martinapensalfini/Desktop/filtered_affiliation_or_orcid.json'  # Output JSON

# Lower-case substrings that mark a creator affiliation as a match.
AFFILIATION_KEYWORDS = ('bologna', 'unibo')


def normalize_orcid(value):
    """Return an ORCID in a comparable form, or '' when it is missing or blank.

    The value is stringified, stripped and lower-cased. Only the text after
    the last '/' is kept, so a URL-style value and a bare iD compare equal.
    """
    if not value:
        return ''
    text = str(value).strip().lower().rstrip('/')
    return text.rsplit('/', 1)[-1]


def load_valid_orcids(path):
    """Read the 'ORCID' column of the CSV at *path* into a set.

    Values are passed through ``normalize_orcid``; blank cells are dropped so
    the set never contains the empty string.
    """
    # 'utf-8-sig' drops a leading BOM that would otherwise be glued to the
    # first header name; newline='' is what the csv module expects.
    with open(path, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        orcids = {normalize_orcid(row.get('ORCID')) for row in reader}
    orcids.discard('')
    return orcids


def record_matches(record, valid_orcids):
    """Tell whether any creator of *record* meets the selection criteria.

    A creator matches when its 'affiliation' string contains one of
    ``AFFILIATION_KEYWORDS`` (case-insensitive), or when its normalised
    'orcid' is non-empty and present in *valid_orcids*.

    Args:
        record: Metadata dict; its 'creators' entry may be missing or null.
        valid_orcids: Set of normalised ORCID strings.

    Returns:
        True on the first matching creator, False when none matches.
    """
    for creator in record.get('creators') or []:
        if not isinstance(creator, dict):
            continue

        affiliation = creator.get('affiliation', '')
        affil = affiliation.lower() if isinstance(affiliation, str) else ''
        if any(keyword in affil for keyword in AFFILIATION_KEYWORDS):
            return True

        # An absent ORCID normalises to ''; it must never count as a match,
        # even if the lookup set happens to contain an empty entry.
        orcid = normalize_orcid(creator.get('orcid'))
        if orcid and orcid in valid_orcids:
            return True
    return False


# The guard is true both in a notebook kernel and when run as a script, so the
# steps below still execute there and assign the same global names; it only
# keeps the file I/O from running when the helpers are imported.
if __name__ == "__main__":
    # === STEP 1: Load ORCIDs from CSV ===
    valid_orcids = load_valid_orcids(csv_file)

    # === STEP 2: Load JSON records ===
    with open(input_file, 'r', encoding='utf-8') as f:
        records = json.load(f)

    # === STEP 3: Filter records where affiliation contains "bologna" or "unibo" OR ORCID matches ===
    # Each record is tested once, so it is kept at most once.
    filtered_records = [
        record for record in records if record_matches(record, valid_orcids)
    ]

    # === STEP 4: Save filtered records ===
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(filtered_records, f, ensure_ascii=False, indent=2)

    print(f"✅ Found {len(filtered_records)} records matching affiliation or ORCID criteria.")
    print(f"📦 Saved to '{output_file}'")




