Filter the extracted records, keeping those in which at least one creator has an affiliation mentioning "Bologna"/"unibo" (University of Bologna) or an ORCID listed in the reference CSV

In [None]:
import json
import csv

# === CONFIGURATION ===
# Path of the JSON file holding the records to filter.
input_file = 'amsacta_cleaned_metadata.json'

# Path of the CSV file that provides the ORCID column.
csv_file = 'POSTPROCESS-iris-data-2025-05-27/ODS_L1_IR_ITEM_CON_PERSON.csv'

# Path of the JSON file that receives the filtered records.
output_file = 'filtered_affiliation_or_orcid.json'

# === HELPERS ===
# Substrings (lower-case) that mark an affiliation as University of Bologna.
AFFILIATION_KEYWORDS = ('bologna', 'unibo')


def normalize_orcid(value):
    """Return *value* as a stripped, lower-cased string.

    Falsy values (``None``, ``''``, ...) are mapped to ``''`` so that callers
    can treat "no ORCID" uniformly.
    """
    if not value:
        return ''
    return str(value).strip().lower()


def load_valid_orcids(path, column='ORCID'):
    """Read the set of normalized ORCIDs found in one column of a CSV file.

    Args:
        path: Path of the CSV file (must have a header row).
        column: Header name of the column holding the ORCIDs.

    Returns:
        A set of stripped, lower-cased, non-empty ORCID strings.
    """
    # 'utf-8-sig' drops a leading BOM if present (otherwise it behaves like
    # 'utf-8'), so the first header is never read as '\ufeffORCID'.
    # newline='' is what the csv module documentation requires for readers.
    with open(path, 'r', encoding='utf-8-sig', newline='') as f:
        orcids = {normalize_orcid(row.get(column)) for row in csv.DictReader(f)}

    # Blank / whitespace-only cells normalize to ''. Keeping '' in the set
    # would make every creator *without* an ORCID look like a match.
    orcids.discard('')
    return orcids


def creator_matches(creator, valid_orcids):
    """Tell whether one creator entry satisfies the selection criteria.

    A creator matches when its ``affiliation`` string contains one of
    ``AFFILIATION_KEYWORDS`` (case-insensitive) or when its non-empty
    ``orcid`` is in *valid_orcids*.

    Args:
        creator: One element of a record's ``creators`` list. Non-dict
            entries never match.
        valid_orcids: Collection of normalized ORCIDs to accept.

    Returns:
        ``True`` if the creator matches, ``False`` otherwise.
    """
    if not isinstance(creator, dict):
        return False

    affiliation = creator.get('affiliation', '')
    # Only plain-string affiliations are inspected; other shapes are ignored.
    affil = affiliation.lower() if isinstance(affiliation, str) else ''
    if any(keyword in affil for keyword in AFFILIATION_KEYWORDS):
        return True

    orcid = normalize_orcid(creator.get('orcid'))
    # Require a non-empty ORCID so a missing value can never match.
    return bool(orcid) and orcid in valid_orcids


def filter_records(records, valid_orcids):
    """Keep the records in which at least one creator matches.

    Each record appears at most once in the result and the input order is
    preserved. Records whose ``creators`` key is missing or null are skipped.

    Args:
        records: Iterable of record dicts.
        valid_orcids: Collection of normalized ORCIDs to accept.

    Returns:
        A list with the matching records.
    """
    return [
        record
        for record in records
        if any(
            creator_matches(creator, valid_orcids)
            for creator in (record.get('creators') or [])
        )
    ]


# The steps below run when this code is executed as a notebook cell or as a
# script (``__name__`` is "__main__" in both cases); the names they assign
# remain module-level globals. The guard only prevents file I/O on import.
if __name__ == "__main__":
    # === STEP 1: Load ORCIDs from CSV ===
    valid_orcids = load_valid_orcids(csv_file)

    # === STEP 2: Load JSON records ===
    with open(input_file, 'r', encoding='utf-8') as f:
        records = json.load(f)

    # === STEP 3: Filter records where affiliation contains "bologna" or "unibo" OR ORCID matches ===
    filtered_records = filter_records(records, valid_orcids)

    # === STEP 4: Save filtered records ===
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(filtered_records, f, ensure_ascii=False, indent=2)

    print(f"✅ Found {len(filtered_records)} records matching affiliation or ORCID criteria.")
    print(f"📦 Saved to '{output_file}'")




