In [1]:
import requests
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm
import time

#############################################
# UTILITY FUNCTIONS
#############################################

def batch_list(lst, batch_size):
    """
    Lazily split ``lst`` into consecutive slices of at most ``batch_size`` items.

    Parameters:
      - lst: any sliceable sequence supporting ``len()`` (list, str, tuple, ...)
      - batch_size: maximum length of each slice; used as the ``range`` step

    Yields:
      - ``lst[start:start + batch_size]`` for start = 0, batch_size, 2*batch_size, ...
        The final slice may be shorter than ``batch_size``.
    """
    # The start offsets are computed when iteration begins, not when the
    # generator object is created.
    starts = range(0, len(lst), batch_size)
    yield from (lst[begin:begin + batch_size] for begin in starts)

#############################################
# API FETCH FUNCTIONS
#############################################

def fetch_works_for_batch(author_ids, filter_params="", per_page=200, sleep_time=0.2,
                          timeout=30, max_retries=3):
    """
    Fetch works from OpenAlex for a batch of author IDs, following cursor pagination.

    Parameters:
      - author_ids: iterable of author OpenAlex IDs (e.g. "https://openalex.org/A123456789"
        or the short form "A123456789"); they are OR-ed together with "|"
      - filter_params: additional filter string appended after a comma
        (e.g. "cited_by_count:>10,publication_year:>2015")
      - per_page: number of works per page (max 200)
      - sleep_time: pause between pages (to respect rate limits); also the base
        of the exponential back-off used between retries
      - timeout: seconds to wait for each HTTP request before giving up on it
      - max_retries: how many extra attempts to make for a page when the request
        raises a network error or returns a transient status (429 / 5xx)

    Returns:
      - List of work dictionaries. If a page ultimately fails with a non-200
        status, an error is printed and the works collected so far are returned
        (i.e. the result may be partial).

    Raises:
      - requests.RequestException: if a network-level failure persists after
        all retries for a page are exhausted.
    """
    author_ids = list(author_ids)
    if not author_ids:
        # An empty OR-list would yield the malformed filter "authorships.author.id:".
        return []

    works_endpoint = "https://api.openalex.org/works"
    transient_statuses = {429, 500, 502, 503, 504}
    attempts = max(0, max_retries) + 1

    # Build filter for author IDs (using the OR operator "|")
    author_filter = "authorships.author.id:" + "|".join(author_ids)
    full_filter = author_filter + ("," + filter_params if filter_params else "")

    all_works = []
    cursor = "*"
    while True:
        params = {
            "filter": full_filter,
            "per_page": per_page,
            "cursor": cursor,
            "select": "id,publication_year,cited_by_count,authorships,abstract_inverted_index,title,concepts"
        }

        response = None
        for attempt in range(attempts):
            last_attempt = attempt == attempts - 1
            try:
                # Without a timeout a stalled connection would block this worker forever.
                response = requests.get(works_endpoint, params=params, timeout=timeout)
            except requests.RequestException:
                if last_attempt:
                    raise
            else:
                if response.status_code not in transient_statuses or last_attempt:
                    break
            # Exponential back-off before retrying the same page.
            time.sleep(sleep_time * 2 ** (attempt + 1))

        if response.status_code != 200:
            print(f"Error fetching works for authors {author_ids}: {response.status_code}")
            break

        data = response.json()
        all_works.extend(data.get("results", []))

        # A missing/empty next_cursor marks the last page.
        next_cursor = data.get("meta", {}).get("next_cursor")
        if not next_cursor:
            break
        cursor = next_cursor
        time.sleep(sleep_time)
    return all_works

def fetch_works_for_authors(author_ids, filter_params="", batch_size=25, n_jobs=5,
                            deduplicate=True):
    """
    Given a list of author IDs, split them into batches and fetch works in parallel.

    Parameters:
      - author_ids: sequence of author OpenAlex IDs
      - filter_params: additional OpenAlex filter string forwarded to every batch request
      - batch_size: number of author IDs OR-ed together per request
      - n_jobs: number of concurrent workers
      - deduplicate: when True (default), keep only the first occurrence of each
        work "id". A work co-authored by authors that land in different batches
        is otherwise returned once per batch.

    Returns:
      - List of work dictionaries, in batch order.
    """
    batches = list(batch_list(author_ids, batch_size))
    # The workers spend their time waiting on HTTP responses, so threads are
    # preferred over processes: results do not need to be pickled back.
    results = Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(fetch_works_for_batch)(batch, filter_params)
        for batch in tqdm(batches, desc="Fetching works for authors")
    )
    # Flatten list of lists
    all_works = [work for sublist in results for work in sublist]
    if not deduplicate:
        return all_works

    seen_ids = set()
    unique_works = []
    for work in all_works:
        work_id = work.get("id")
        if work_id is not None:
            if work_id in seen_ids:
                continue
            seen_ids.add(work_id)
        # Works without an "id" cannot be compared and are always kept.
        unique_works.append(work)
    return unique_works

In [3]:
# Load the IC2S2 co-author table and keep only authors whose "Works Count"
# lies in the inclusive range [5, 5000].
ic2s2_authors_df = pd.read_csv("files/ic2s2_coauthors.csv").loc[
    lambda frame: frame["Works Count"].between(5, 5000)
]

# Reduce each ID to the part after "org/" (e.g. ".../A123456789" -> "A123456789").
ic2s2_author_ids = [
    openalex_url.split("org/")[1]
    for openalex_url in ic2s2_authors_df["ID"].tolist()
]

# Fetch step, currently disabled:
# all_works = fetch_works_for_authors(ic2s2_author_ids, filter_params="cited_by_count:>10", batch_size=25, n_jobs=5)
# print(f"Total works fetched for IC2S2 authors: {len(all_works)}")
len(ic2s2_author_ids)

10479