Extraction of data from Zenodo for affiliation and ORCID

In [None]:
import requests
import json
import time
import pandas as pd
from pathlib import Path

# Read the IRIS person export and keep the distinct, non-null ORCID values;
# these drive the ORCID-based search further down.
Iris_df = pd.read_csv(
    r"C:\Users\pietr\Desktop\OpenScience\ODS_L1_IR_ITEM_CON_PERSON.csv",
    encoding="utf-8",
)
unibo_orcids = (
    Iris_df["ORCID"]
    .dropna()
    .unique()
    .tolist()
)

# Zenodo records search endpoint
base_url = "https://zenodo.org/api/records"

# Spellings of the university name to look for, and the record fields in
# which an affiliation can appear.
unibo_names = (
    "University of Bologna",
    "Università di Bologna",
    "UNIBO",
    "Alma Mater Studiorum",
)
affiliation_fields = ("contributors.affiliation", "creators.affiliation")

# Search parameters: one `field:"name"` term per (field, spelling) pair,
# OR-ed together, fetched 100 records per page starting from page 1.
params = dict(
    q=" OR ".join(
        f'{field}:"{name}"'
        for field in affiliation_fields
        for name in unibo_names
    ),
    size=100,
    page=1,
)

# Accumulators and pagination state used by the download loop
all_records, orcid_records = [], []
total_records = processed_records = 0
continue_pagination = True

# --- Download settings ------------------------------------------------------
REQUEST_TIMEOUT = 60   # seconds; without a timeout requests.get() may block forever
PAGE_DELAY = 1         # seconds to wait between two result pages
BATCH_DELAY = 2        # seconds to wait between two ORCID batches
ORCID_BATCH_SIZE = 10  # how many ORCIDs to query at once

# Matches every UNIBO affiliation spelling. It is negated in the ORCID search
# so that search only returns records the affiliation search did not match.
UNIBO_AFFILIATION_CLAUSE = (
    'contributors.affiliation:"University of Bologna" OR contributors.affiliation:"Università di Bologna" OR '
    'contributors.affiliation:"UNIBO" OR contributors.affiliation:"Alma Mater Studiorum" OR '
    'creators.affiliation:"University of Bologna" OR creators.affiliation:"Università di Bologna" OR '
    'creators.affiliation:"UNIBO" OR creators.affiliation:"Alma Mater Studiorum"'
)


def _fetch_all_pages(query_params, total_message, error_prefix=""):
    """Download every result page of one search against ``base_url``.

    Args:
        query_params: dict with the search parameters ('q', 'size', 'page').
            It is copied, so the caller's dict is not modified.
        total_message: text printed after the first page; ``{total}`` is
            replaced with the total reported in the response.
        error_prefix: text printed in front of ``Error: ...`` on failure.

    Returns:
        A ``(records, complete)`` tuple. ``records`` holds the hits of every
        page fetched successfully. ``complete`` is False when a request or
        response failed, in which case ``records`` is only partial.
    """
    page_params = dict(query_params)
    page_params.setdefault('page', 1)
    records = []
    expected_total = 0
    first_request = True

    while True:
        try:
            response = requests.get(base_url, params=page_params, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()  # Raise exception for HTTP errors
            hits_section = response.json().get('hits', {})
            page_hits = hits_section.get('hits', [])

            # The total is only read (and reported) from the first page
            if first_request:
                expected_total = hits_section.get('total', 0)
                print(total_message.format(total=expected_total))
                first_request = False

            records.extend(page_hits)
            has_more = bool(page_hits) and len(records) < expected_total
        except Exception as e:
            # Deliberately broad: a failing page ends this search but must not
            # throw away the pages that were already downloaded.
            print(f"{error_prefix}Error: {e}")
            return records, False

        if not has_more:
            return records, True

        page_params['page'] += 1
        time.sleep(PAGE_DELAY)


def _new_records(candidates, seen_ids):
    """Return the candidates whose 'id' is not in ``seen_ids`` yet.

    Ids of the returned records are added to ``seen_ids``. Entries without an
    'id' cannot be compared and are always kept.
    """
    fresh = []
    for record in candidates:
        record_id = record.get('id') if isinstance(record, dict) else None
        if record_id is not None:
            if record_id in seen_ids:
                continue
            seen_ids.add(record_id)
        fresh.append(record)
    return fresh


try:
    seen_ids = set()  # record ids already stored, shared by both searches

    # Get records with Bologna in the affiliation
    affiliation_hits, affiliation_complete = _fetch_all_pages(
        params,
        "Total records found with UNIBO affiliation: {total}",
    )
    all_records.extend(_new_records(affiliation_hits, seen_ids))
    if not affiliation_complete:
        print("Warning: the affiliation search stopped early; its results are incomplete.")

    print(f"Found {len(all_records)} records with UNIBO affiliation.")

    # Normalise the ORCIDs once: make them strings, strip whitespace, drop
    # blanks and drop duplicates (dict.fromkeys keeps the original order).
    stripped_orcids = (str(orcid).strip() for orcid in unibo_orcids if pd.notna(orcid))
    clean_orcids = list(dict.fromkeys(orcid for orcid in stripped_orcids if orcid))

    # Searching for records with UNIBO ORCID IDs
    if clean_orcids:
        print(f"Searching {len(clean_orcids)} ORCID IDs ")

        for batch_start in range(0, len(clean_orcids), ORCID_BATCH_SIZE):
            batch_orcids = clean_orcids[batch_start:batch_start + ORCID_BATCH_SIZE]

            # One creators/contributors term pair per ORCID in this batch
            orcid_query = " OR ".join(
                f'creators.orcid:"{orcid}" OR contributors.orcid:"{orcid}"'
                for orcid in batch_orcids
            )

            # Exclude UNIBO affiliations
            orcid_params = {
                'q': f'({orcid_query}) AND NOT ({UNIBO_AFFILIATION_CLAUSE})',
                'size': 100,
                'page': 1,
            }

            # A failed batch is reported by the helper; the remaining batches still run
            batch_hits, _ = _fetch_all_pages(
                orcid_params,
                " Records found for this ORCID batch: {total}",
                error_prefix=" ",
            )

            # The same record can match ORCIDs from different batches
            orcid_records.extend(_new_records(batch_hits, seen_ids))

            time.sleep(BATCH_DELAY)

        print(f"Extracted {len(orcid_records)} additional records with ORCIDs.")

        all_records.extend(orcid_records)

    output_file = Path("ZenodoData2.json")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_records, f, indent=2, ensure_ascii=False)

    print(f"\nData extraction complete {len(all_records)} total records saved to {output_file}")

except Exception as e:
    # Top-level boundary of the script: report the problem instead of raising
    print(f"Error: {e}")