In [7]:
import pandas as pd

# Read the researchers table from the CSV export.
authors_df = pd.read_csv('files/researchers_data_2024.csv')
# Keep only authors whose works count lies in the inclusive range [5, 5000].
authors_works_count = authors_df['Works Count']
filtered_authors = authors_df[authors_works_count.between(5, 5000)]

In [29]:
import requests
from joblib import Parallel, delayed
from tqdm import tqdm
import time

#############################################
# UTILITY FUNCTIONS
#############################################

def batch_list(lst, batch_size):
    """Lazily yield consecutive slices of lst, each holding at most batch_size items."""
    chunk_starts = range(0, len(lst), batch_size)
    yield from (lst[start:start + batch_size] for start in chunk_starts)

#############################################
# API FETCH FUNCTIONS
#############################################

def fetch_works_for_batch(author_ids, per_page=200, sleep_time=0.2, max_retries=3, timeout=30):
    """
    Fetch works from OpenAlex for a batch of author IDs.

    The request filter is fixed: works by any of the given authors, with
    cited_by_count > 10, authors_count < 10 and at least one of seven
    hard-coded concept IDs. Pages are followed with cursor pagination until
    the API no longer returns a next cursor.

    Parameters:
      - author_ids: list of author OpenAlex IDs (e.g. "https://openalex.org/A123456789").
        An empty list returns [] without calling the API.
      - per_page: number of works per page (max 200)
      - sleep_time: pause between pages (to respect rate limits); also the base
        delay of the exponential back-off between retries
      - max_retries: extra attempts per page after a network error, HTTP 429 or HTTP 5xx
      - timeout: seconds passed to requests.get so a stalled connection cannot hang the job

    Returns:
      - List of work dictionaries. If a page still fails after the retries, an
        error is printed and the works collected so far are returned.
    """
    if not author_ids:
        # An empty OR-list would produce a malformed "authorships.author.id:" filter.
        return []

    works_endpoint = "https://api.openalex.org/works"
    # Concept IDs the works must match (any of them).
    # NOTE(review): presumably Sociology, Psychology, Economics, Political science,
    # Mathematics, Physics and Computer science, in that order - verify against OpenAlex.
    concept_ids = [
        "https://openalex.org/C144024400",
        "https://openalex.org/C15744967",
        "https://openalex.org/C162324750",
        "https://openalex.org/C17744445",
        "https://openalex.org/C33923547",
        "https://openalex.org/C121332964",
        "https://openalex.org/C41008148",
    ]
    # Comma-separated clauses are combined by the API; "|" is the OR operator inside a clause.
    filter_query = "authorships.author.id:" + "|".join(author_ids)
    filter_query += ",cited_by_count:>10,authors_count:<10"
    filter_query += ",concepts.id:" + "|".join(concept_ids)

    all_works = []
    cursor = "*"
    while True:
        params = {
            "filter": filter_query,
            "per_page": per_page,
            "cursor": cursor,
            "select": "id,publication_year,cited_by_count,authorships,abstract_inverted_index,title,concepts"
        }

        data = None
        last_error = None
        for attempt in range(max_retries + 1):
            try:
                response = requests.get(works_endpoint, params=params, timeout=timeout)
            except requests.RequestException as exc:
                # Network-level failure (timeout, connection reset, ...): retry.
                last_error = exc
            else:
                if response.status_code == 200:
                    data = response.json()
                    break
                last_error = response.status_code
                if response.status_code != 429 and response.status_code < 500:
                    # Client errors other than rate limiting will not succeed on retry.
                    break
            if attempt < max_retries:
                time.sleep(sleep_time * 2 ** attempt)

        if data is None:
            print(f"Error fetching works for authors {author_ids}: {last_error}")
            break

        all_works.extend(data.get("results", []))
        next_cursor = data.get("meta", {}).get("next_cursor")
        if not next_cursor:
            break
        cursor = next_cursor
        time.sleep(sleep_time)
    return all_works

def fetch_works_for_authors(author_ids, batch_size=25, n_jobs=5):
    """
    Given a list of author IDs, split them into batches and fetch works in parallel.

    Parameters:
      - author_ids: list of author OpenAlex IDs
      - batch_size: number of author IDs sent in one fetch_works_for_batch call
      - n_jobs: number of concurrent workers

    Returns:
      - List of work dictionaries (the per-batch lists concatenated in batch order).
        The same work can appear more than once when its authors fall in different batches.
    """
    batches = list(batch_list(author_ids, batch_size))
    # The workers spend their time waiting on HTTP responses, so threads are
    # sufficient. They also avoid pickling every fetched work back from a
    # subprocess and keep the workers' print() diagnostics in this process.
    # Note: tqdm wraps the batch iterator, so the bar tracks dispatch, not completion.
    results = Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(fetch_works_for_batch)(batch) for batch in tqdm(batches, desc="Fetching works for authors")
    )
    # Flatten list of lists
    all_works = [work for sublist in results for work in sublist]
    return all_works

In [None]:
# Collect the "ID" column of the filtered authors as a plain Python list;
# the bare last expression displays how many IDs there are.
filtered_authors_ids = filtered_authors["ID"].tolist()
len(filtered_authors_ids)

996

In [35]:
# Fetch the works of every filtered author (batched, in parallel) and report the total.
# The list may contain the same work more than once if its authors were in different batches.
all_works = fetch_works_for_authors(filtered_authors_ids)
print(f"Total works fetched for IC2S2 authors: {len(all_works)}")

Fetching works for authors: 100%|██████████| 40/40 [00:35<00:00,  1.14it/s]


Total works fetched for IC2S2 authors: 33548


#### Example query
https://api.openalex.org/works?filter=authorships.author.id:A5082130337|A5014647140,cited_by_count:>10,authors_count:<10,concepts.id:https://openalex.org/C144024400|https://openalex.org/C15744967

In [36]:
def filter_works_by_concepts(works, css_concepts, quantitative_concepts):
    """
    Filter works to include only those that have at least one level-0 concept
    from each of the following groups:
      - Computational Social Science: e.g. Sociology, Psychology, Economics, Political Science
      - Quantitative disciplines: e.g. Mathematics, Physics, Computer Science

    Parameters:
      - works: iterable of work dictionaries; each may carry a "concepts" list of
        dictionaries with "level" and "display_name" keys
      - css_concepts: concept names of the first group (compared case-insensitively)
      - quantitative_concepts: concept names of the second group (compared case-insensitively)

    Returns:
      - List of the works (same objects, original order) that match both groups.
        Works whose "concepts" is missing or None never match.
    """
    # Sets give O(1) membership tests; lower-casing makes the comparison case-insensitive.
    css_names = {c.lower() for c in css_concepts}
    quantitative_names = {q.lower() for q in quantitative_concepts}

    filtered = []
    for work in works:
        # `or` guards against explicit nulls (key present but value None),
        # which a .get() default alone does not cover.
        level0_names = {
            (concept.get("display_name") or "").lower()
            for concept in (work.get("concepts") or [])
            if concept.get("level") == 0
        }
        has_css = not level0_names.isdisjoint(css_names)
        has_quantitative = not level0_names.isdisjoint(quantitative_names)
        if has_css and has_quantitative:
            filtered.append(work)
    return filtered

# Names of the two concept groups; a work must have a level-0 concept from each group to be kept.
css_concepts = ["Sociology", "Psychology", "Economics", "Political Science"]
quantitative_concepts = ["Mathematics", "Physics", "Computer Science"]
# Apply the two-group filter to the fetched works and report how many remain.
filtered_works = filter_works_by_concepts(all_works, css_concepts, quantitative_concepts)
print(f"Total works after concept filtering: {len(filtered_works)}")

Total works after concept filtering: 12746


In [38]:
authors_papers = []
authors_abstracts = []

for work in filtered_works:
    work_id = work.get("id")

    # Gather the ID of every listed author, skipping authorships that carry none.
    authors = []
    for authorship in work.get("authorships", []):
        listed_author_id = authorship.get("author", {}).get("id")
        if listed_author_id:
            authors.append(listed_author_id)

    # One record with the paper metadata ...
    paper_record = {
        "id": work_id,
        "publication_year": work.get("publication_year"),
        "cited_by_count": work.get("cited_by_count"),
        "author_ids": authors,
    }
    authors_papers.append(paper_record)

    # ... and a parallel record with the textual content.
    abstract_record = {
        "id": work_id,
        "title": work.get("title"),
        "abstract_inverted_index": work.get("abstract_inverted_index"),
    }
    authors_abstracts.append(abstract_record)

print(len(authors_papers))

# Tabulate both record lists and persist them next to the notebook.
authors_papers_df = pd.DataFrame(authors_papers)
authors_abstracts_df = pd.DataFrame(authors_abstracts)

authors_papers_df.to_csv("authors_papers.csv", index=False)
authors_abstracts_df.to_csv("authors_abstracts.csv", index=False)

12746


# EXERCISE 2: Collecting Data from IC2S2 Co-Authors

In [39]:
import ast

# Optional: reload the papers table from disk instead of using the in-memory frame.
#authors_papers_df = pd.read_csv("authors_papers.csv")
authors_papers = authors_papers_df.to_dict(orient='records')
for paper in authors_papers:
    raw_author_ids = paper['author_ids']
    # After a CSV round-trip the list is stored as its string repr; turn it back into a list.
    if isinstance(raw_author_ids, str):
        paper['author_ids'] = ast.literal_eval(raw_author_ids)

In [40]:
# Every distinct author ID that appears on any of the collected papers.
all_author_ids_in_works = {
    aid
    for paper in authors_papers
    for aid in paper["author_ids"]
}

# Co-authors are the IDs on those papers that are not IC2S2 authors themselves.
authors_ids_set = set(filtered_authors_ids)
coauthor_ids = list(all_author_ids_in_works.difference(authors_ids_set))
print(f"Total unique co-author IDs: {len(coauthor_ids)}")

Total unique co-author IDs: 16912


# STEP 3: Fetch Co-Author Details (one request per co-author)

In [None]:
URL = "https://api.openalex.org/authors"
coauthors_data = []

# Only these fields are requested from the API; the value is the same for every call.
params = {
    "select": "id,display_name,works_api_url,summary_stats,works_count,last_known_institutions"
}

# Use a session to reuse connections
with requests.Session() as session:
    # `coauthor_id` rather than `id`, so the builtin id() is not shadowed in later cells.
    for coauthor_id in coauthor_ids:
        try:
            response = session.get(f"{URL}/{coauthor_id}", params=params, timeout=30)
        except requests.RequestException as exc:
            # A single network failure should not discard the records collected so far.
            print(f"Error fetching data for {coauthor_id}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Error fetching data for {coauthor_id}: {response.status_code}")
            continue

        result = response.json()
        if not result:
            print(f"Error fetching data for {coauthor_id}")
            continue

        # `or` covers both a missing key and an explicit null, so the chained
        # lookups below always operate on a dict / list.
        summary_stats = result.get('summary_stats') or {}
        institutions = result.get('last_known_institutions') or []
        # Country of the first listed institution, or '' when none is known.
        country_code = institutions[0].get('country_code', '') if institutions else ''

        # Append to list
        coauthors_data.append({
            "ID": result.get('id', ''),
            "Name": result.get('display_name', ''),
            "Works API URL": result.get('works_api_url', ''),
            "Works Count": result.get('works_count', 0),
            "H-Index": summary_stats.get('h_index', 0),
            "Country Code": country_code
        })

# Persist the result: the next cell loads "coauthors.csv", so it must be written here
# for the notebook to run from a fresh kernel.
pd.DataFrame(
    coauthors_data,
    columns=["ID", "Name", "Works API URL", "Works Count", "H-Index", "Country Code"],
).to_csv("coauthors.csv", index=False)

In [41]:
# Load the co-author details saved earlier and show how many rows there are.
coauthors_df = pd.read_csv("coauthors.csv")
print(len(coauthors_df))
# Same inclusive [5, 5000] works-count window that was applied to the IC2S2 authors.
coauthor_works_count = coauthors_df["Works Count"]
filtered_coauthors_df = coauthors_df[coauthor_works_count.between(5, 5000)]
# Note: this rebinds coauthor_ids to the filtered subset.
coauthor_ids = filtered_coauthors_df["ID"].tolist()
len(coauthor_ids)

16843


15442

In [42]:
# Unique IDs of each group, and their union: everyone allowed in the final network.
author_ids_set = set(filtered_authors_ids)
coauthor_ids_set = set(coauthor_ids)
allowed_authors = author_ids_set | coauthor_ids_set
len(coauthor_ids_set), len(author_ids_set), len(allowed_authors)

(15442, 994, 16436)

In [43]:
# Split the co-author IDs into three consecutive slices (first 5000, next 5000, remainder),
# presumably so that each slice can be fetched in its own cell - TODO confirm.
coauthor_ids_1 = coauthor_ids[:5000]
coauthor_ids_2 = coauthor_ids[5000:10000]
coauthor_ids_3 = coauthor_ids[10000:]

In [44]:
# Fetch works for the first slice of co-author IDs; the label names co-authors
# (not IC2S2 authors) so the printed count is attributed to the right group.
coauthor_works_1 = fetch_works_for_authors(coauthor_ids_1)
print(f"Total works fetched for co-authors (part 1): {len(coauthor_works_1)}")

Fetching works for authors: 100%|██████████| 200/200 [07:37<00:00,  2.29s/it]


Total works fetched for IC2S2 authors: 222380


In [45]:
# Fetch works for the second slice of co-author IDs; the label names co-authors
# (not IC2S2 authors) so the printed count is attributed to the right group.
coauthor_works_2 = fetch_works_for_authors(coauthor_ids_2)
print(f"Total works fetched for co-authors (part 2): {len(coauthor_works_2)}")

Fetching works for authors: 100%|██████████| 200/200 [09:48<00:00,  2.94s/it]


Total works fetched for IC2S2 authors: 226306


In [49]:
# Fetch works for the last slice of co-author IDs; the label names co-authors
# (not IC2S2 authors) so the printed count is attributed to the right group.
coauthor_works_3 = fetch_works_for_authors(coauthor_ids_3)
print(f"Total works fetched for co-authors (part 3): {len(coauthor_works_3)}")


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Fetching works for authors:  72%|███████▎  | 29/40 [56:58<21:36, 117.87s/it]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Fetching works for authors: 100%|██████████| 218/218 [10:07<00:00,  2.79s/it]


Total works fetched for IC2S2 authors: 243152


In [50]:
# Concatenate the three fetched slices, in order, into one list and display its length.
all_coauthor_works = [*coauthor_works_1, *coauthor_works_2, *coauthor_works_3]
len(all_coauthor_works)

691838

In [51]:
# Apply the same two-group level-0 concept filter used for the IC2S2 authors' works
# to the co-authors' works, then report how many remain.
filtered_coauthor_works = filter_works_by_concepts(all_coauthor_works, css_concepts, quantitative_concepts)
print(f"Total works after concept filtering: {len(filtered_coauthor_works)}")

Total works after concept filtering: 264199


In [62]:
# Materialize the allowed-author set as a list.
# NOTE(review): `in` checks against a list scan every element; use a set wherever this
# collection is only needed for membership tests.
allowed_coauthors = list(allowed_authors)

def filter_coauthor_ids(author_ids, allowed_set):
    """
    Return the entries of author_ids that are members of allowed_set.

    IDs are compared exactly as given (e.g. the full
    "https://openalex.org/A123456789" string; no prefix is stripped).
    Input order and duplicates are preserved.
    """
    return [candidate for candidate in author_ids if candidate in allowed_set]

coauthors_papers = []
coauthors_abstracts = []

# allowed_coauthors is a list, so every `in` test against it is a linear scan.
# Build a set once so each lookup in the loop below is O(1).
allowed_lookup = set(allowed_coauthors)

for work in filtered_coauthor_works:
    # Extract author IDs from the "authorships" field
    authors = [auth.get("author", {}).get("id") for auth in work.get("authorships", []) if auth.get("author", {}).get("id")]

    # Keep only authors that belong to the allowed (IC2S2 author + co-author) population.
    filtered_ids = filter_coauthor_ids(authors, allowed_lookup)

    # A paper is kept only if at least two allowed authors remain on it.
    if len(filtered_ids) > 1:
        coauthors_papers.append({
            "id": work.get("id"),
            "publication_year": work.get("publication_year"),
            "cited_by_count": work.get("cited_by_count"),
            "author_ids": filtered_ids
        })
        coauthors_abstracts.append({
            "id": work.get("id"),
            "title": work.get("title"),
            "abstract_inverted_index": work.get("abstract_inverted_index")
        })

In [63]:
# Display how many paper records and abstract records were kept
# (both lists are appended to together, so the counts should match).
len(coauthors_papers), len(coauthors_abstracts)

(101488, 101488)

In [64]:
# Tabulate the co-author paper and abstract records.
coauthors_papers_df = pd.DataFrame(coauthors_papers)
coauthors_abstracts_df = pd.DataFrame(coauthors_abstracts)

# Persist both tables. Note that the "author_ids" lists are written to CSV as their
# string representation and must be parsed back into lists after reloading.
coauthors_papers_df.to_csv("coauthors_papers.csv", index=False)
coauthors_abstracts_df.to_csv("coauthors_abstracts.csv", index=False)

# Combining authors dataset

In [65]:
# Report (rows, columns) of the four tables that are combined in the cells below.
print("Filtered authors papers:", authors_papers_df.shape)
print("authors abstracts:", authors_abstracts_df.shape)
print("Filtered co-authors papers:", coauthors_papers_df.shape)
print("Co-authors abstracts:", coauthors_abstracts_df.shape)

Filtered authors papers: (12746, 4)
authors abstracts: (12746, 3)
Filtered co-authors papers: (101488, 4)
Co-authors abstracts: (101488, 3)


### (A) Combine authors: concatenate IC2S2 authors and co-authors dataframes and remove duplicates.

In [66]:
# Stack both author tables and keep the first row seen for each author ID.
combined_authors_df = pd.concat(
    [filtered_authors, filtered_coauthors_df], ignore_index=True
).drop_duplicates(subset=["ID"])
combined_authors_df.to_csv("authors_combined.csv", index=False)
print("Combined authors dataset shape:", combined_authors_df.shape)

Combined authors dataset shape: (16436, 6)


### (B) Combine papers: concatenate IC2S2 papers and co-authors papers, drop duplicate works,

In [67]:
import ast


#     and remove papers with only one author.
# Note: If your "author_ids" column is stored as a string (e.g., as a list repr), convert it back to list.
def parse_author_ids(author_ids):
    """
    Convert a stringified list of author IDs back into a list.

    Parameters:
      - author_ids: either a string holding a Python literal (as produced when a
        list column is written to CSV) or an already-parsed value

    Returns:
      - For a string: the parsed list (a parsed tuple is converted to a list).
        A string that cannot be parsed, or that parses to anything other than a
        list/tuple (e.g. "5" or "'abc'"), yields [] so downstream code can always
        iterate over the result and take its length.
      - For any non-string input: the input itself, unchanged.
    """
    if not isinstance(author_ids, str):
        return author_ids
    try:
        parsed = ast.literal_eval(author_ids)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Exactly the errors ast.literal_eval documents for malformed input;
        # anything else is a real bug and should surface.
        return []
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)
    return []

# Normalise "author_ids" in both paper tables so that every cell holds a real list.
for df in (authors_papers_df, coauthors_papers_df):
    author_ids_column = df["author_ids"]
    # Only object-dtype columns can contain stringified lists; leave others untouched.
    if author_ids_column.dtype != object:
        continue
    df["author_ids"] = author_ids_column.apply(parse_author_ids)

In [73]:
# Stack both paper tables and keep the first row seen for each work ID.
combined_papers_df = pd.concat(
    [authors_papers_df, coauthors_papers_df], ignore_index=True
).drop_duplicates(subset=["id"])
# Single-author removal is currently disabled (kept for reference):
#combined_papers_df = combined_papers_df[combined_papers_df["author_ids"].apply(lambda x: len(x) > 1)]
combined_papers_df.to_csv("papers_combined.csv", index=False)
print("Combined papers dataset shape:", combined_papers_df.shape)

Combined papers dataset shape: (43341, 4)


### (C) Combine abstracts: simply concatenate and drop duplicates.

In [74]:
# Stack both abstract tables; show the size before de-duplication.
combined_abstracts_df = pd.concat(
    [authors_abstracts_df, coauthors_abstracts_df], ignore_index=True
)
print(combined_abstracts_df.shape)
# Keep the first row seen for each work ID, then persist and report the final size.
combined_abstracts_df = combined_abstracts_df.drop_duplicates(subset=["id"])
combined_abstracts_df.to_csv("abstracts_combined.csv", index=False)
print("Combined abstracts dataset shape:", combined_abstracts_df.shape)

(114234, 3)
Combined abstracts dataset shape: (43341, 3)
