In [49]:
import pandas as pd
import requests
from time import sleep
from json import JSONDecodeError
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

In [50]:
# Load the author table; the rows of this frame are what later cells batch up
# and send to the API.
csv_path = 'C:/Users/marcu/Documents/Computational social science/Week 2/Authors.csv'
authors = pd.read_csv(filepath_or_buffer=csv_path)

authors

Unnamed: 0,id,display_name,works_api_url,works_count,country_code,h_index
0,https://openalex.org/A5026829784,Sam Corbett‐Davies,https://api.openalex.org/works?filter=author.i...,34,IL,14
1,https://openalex.org/A5066089123,Byungkyu Lee,https://api.openalex.org/works?filter=author.i...,186,US,13
2,https://openalex.org/A5100435139,Jingwen Zhang,https://api.openalex.org/works?filter=author.i...,399,CN,47
3,https://openalex.org/A5002073039,Sou Hyun Jang,https://api.openalex.org/works?filter=author.i...,69,KR,11
4,https://openalex.org/A5091610280,Carl Colglazier,https://api.openalex.org/works?filter=author.i...,4,US,1
...,...,...,...,...,...,...
1145,https://openalex.org/A5043405308,Yen-Huei Chen,https://api.openalex.org/works?filter=author.i...,29,TW,13
1146,https://openalex.org/A5021090586,Michael Lees,https://api.openalex.org/works?filter=author.i...,226,NL,28
1147,https://openalex.org/A5102918288,Jiayu Zheng,https://api.openalex.org/works?filter=author.i...,35,CN,13
1148,https://openalex.org/A5065295188,Yang Tian,https://api.openalex.org/works?filter=author.i...,450,CN,63


In [51]:
# Split authors into batches (each batch roughly 25 authors).
# The row *positions* are split rather than the DataFrame itself:
# np.array_split on a DataFrame goes through the deprecated DataFrame.swapaxes
# (which emits a FutureWarning). Slicing with .iloc yields the same partition
# as real DataFrames without touching that code path.
# max(1, ...) keeps the call valid when there are fewer than 25 authors
# (array_split rejects 0 sections).
n_batches = max(1, len(authors) // 25)
author_batches = [
    authors.iloc[row_positions]
    for row_positions in np.array_split(np.arange(len(authors)), n_batches)
]
author_batches[0]

  return bound(*args, **kwds)


Unnamed: 0,id,display_name,works_api_url,works_count,country_code,h_index
0,https://openalex.org/A5026829784,Sam Corbett‐Davies,https://api.openalex.org/works?filter=author.i...,34,IL,14
1,https://openalex.org/A5066089123,Byungkyu Lee,https://api.openalex.org/works?filter=author.i...,186,US,13
2,https://openalex.org/A5100435139,Jingwen Zhang,https://api.openalex.org/works?filter=author.i...,399,CN,47
3,https://openalex.org/A5002073039,Sou Hyun Jang,https://api.openalex.org/works?filter=author.i...,69,KR,11
4,https://openalex.org/A5091610280,Carl Colglazier,https://api.openalex.org/works?filter=author.i...,4,US,1
5,https://openalex.org/A5071422618,Markus Strohmaier,https://api.openalex.org/works?filter=author.i...,348,DE,38
6,https://openalex.org/A5044191812,Vinícius Andrade Brei,https://api.openalex.org/works?filter=author.i...,63,BR,14
7,https://openalex.org/A5019023655,Alyssa Smith,https://api.openalex.org/works?filter=author.i...,4,MX,2
8,https://openalex.org/A5037451955,Alexander Furnas,https://api.openalex.org/works?filter=author.i...,51,US,6
9,https://openalex.org/A5010577211,Dehao Zhang,https://api.openalex.org/works?filter=author.i...,2,CN,1


In [52]:
# Column layouts of the two tables built from the API responses
paper_col = 'id publication_year cited_by_count author_ids'.split()
abstract_col = 'id title abstract_inverted_index'.split()

# Works endpoint queried by every request
source = 'https://api.openalex.org/works'

# Two groups of concept URLs, keyed by a human-readable label; each group is
# OR-joined into its own `concepts.id` filter when the request URL is built.
concepts_1 = dict([
    ('Computer science', 'https://openalex.org/C41008148'),
    ('Physics', 'https://openalex.org/C121332964'),
    ('Mathematics', 'https://openalex.org/C33923547'),
])
concepts_2 = dict([
    ('Psychology', 'https://openalex.org/C15744967'),
    ('Sociology', 'https://openalex.org/C144024400'),
    ('Economics', 'https://openalex.org/C162324750'),
    ('Political science', 'https://openalex.org/C17744445'),
])

In [53]:
def process_batch(author_batch, batch_index, max_tries=10, timeout=30):
    """
    Collect papers and abstracts for one batch of authors.

    Requests successive pages from the works endpoint (`source`), filtered to
    the batch's author ids, the citation/author-count limits and the two
    concept groups (`concepts_1`, `concepts_2`). Paging stops at the first
    page that comes back empty.

    Parameters
    ----------
    author_batch : table with an 'id' column (indexed as author_batch['id'])
        Authors whose ids are OR-joined into the `author.id` filter.
    batch_index : any
        Label used only in the progress / warning messages.
    max_tries : int, default 10
        Attempts per page before giving up on the batch.
    timeout : float, default 30
        Seconds passed to the HTTP request as its timeout, so a stalled
        connection cannot block the worker forever.

    Returns
    -------
    (local_papers_data, local_abstracts_data) : tuple of two lists
        Rows of [id, publication_year, cited_by_count, author_ids] and
        [id, title, abstract_inverted_index]; author_ids is a ';'-joined
        string.

    Raises
    ------
    KeyError
        If a returned paper has no 'id' key.
    """
    # The filter is identical for every page and every retry: build it once.
    author_ids_str = '|'.join(author_batch['id'].astype(str))
    url = (
        f"{source}?filter=author.id:{author_ids_str},"
        f"cited_by_count:>5,authors_count:<10,"
        f"concepts.id:{'|'.join(concepts_1.values())},"
        f"concepts.id:{'|'.join(concepts_2.values())}"
    )

    local_papers_data = []
    local_abstracts_data = []
    page = 1

    # One session per call (i.e. per worker thread); the context manager
    # closes it when the batch is done.
    with requests.Session() as local_session:
        while True:
            results = None
            last_error = None
            # Retry loop for the current page
            for attempt in range(max_tries):
                try:
                    response = local_session.get(
                        url,
                        params={'per_page': 200, 'page': page},
                        timeout=timeout,
                    )
                    response.raise_for_status()
                    results = response.json().get('results', [])
                    break  # exit retry loop on success
                except (JSONDecodeError, requests.exceptions.RequestException) as e:
                    last_error = e
                    # Exponential back-off (0.1s, 0.2s, ... capped at 5s) so
                    # that transient failures get time to clear.
                    sleep(min(0.1 * 2 ** attempt, 5))

            # results is still None only when every attempt failed. Report it
            # instead of treating it like a normal end of paging, so that a
            # truncated batch is visible in the output.
            if results is None:
                print(
                    f"Batch {batch_index}: page {page} failed after {max_tries} "
                    f"attempts ({last_error!r}); results for this batch are incomplete"
                )
                break

            # An empty page marks the end of the result set
            if not results:
                break

            # Process each paper in the current page
            for paper in results:
                paper_id = paper['id']
                publication_year = paper.get('publication_year', None)
                cited_by_count = paper.get('cited_by_count', 0)
                # Extract author IDs from the authorships list; entries with
                # no author object or no id are skipped rather than breaking
                # the join.
                author_ids = ";".join(
                    author_id
                    for author_id in (
                        (authorship.get('author') or {}).get('id')
                        for authorship in (paper.get('authorships') or [])
                    )
                    if author_id
                )
                local_papers_data.append([paper_id, publication_year, cited_by_count, author_ids])

                title = paper.get('title', '')
                abstract_index = paper.get('abstract_inverted_index', None)
                local_abstracts_data.append([paper_id, title, abstract_index])

            page += 1

    print(f"Completed batch {batch_index}")
    return local_papers_data, local_abstracts_data

In [54]:
# Multi-threaded execution using ThreadPoolExecutor.
# Each batch's result is stored under its batch index and merged afterwards in
# index order: merging in completion order would make the row order of the
# combined lists depend on thread timing and differ from run to run.
all_papers_data = []
all_abstracts_data = []
batch_results = {}

# You can adjust max_workers based on your system and API limits
with ThreadPoolExecutor(max_workers=10) as executor:
    # Submit each batch to be processed in a separate thread
    future_to_batch = {
        executor.submit(process_batch, batch, idx): idx
        for idx, batch in enumerate(author_batches)
    }
    # Collect each result as soon as its future completes
    for future in as_completed(future_to_batch):
        batch_index = future_to_batch[future]
        try:
            batch_results[batch_index] = future.result()
        except Exception as exc:
            # A failed batch is reported and left out of the merged data
            print(f"Batch {batch_index} generated an exception: {exc}")

# Deterministic merge: batch 0 first, then 1, ...
for batch_index in sorted(batch_results):
    papers_data, abstracts_data = batch_results[batch_index]
    all_papers_data.extend(papers_data)
    all_abstracts_data.extend(abstracts_data)

Completed batch 4
Completed batch 1
Completed batch 9
Completed batch 3
Completed batch 7
Completed batch 6
Completed batch 0
Completed batch 8
Completed batch 2
Completed batch 5
Completed batch 11
Completed batch 15
Completed batch 10
Completed batch 16
Completed batch 14
Completed batch 19
Completed batch 12
Completed batch 17
Completed batch 22
Completed batch 13
Completed batch 18
Completed batch 21
Completed batch 23
Completed batch 26
Completed batch 20
Completed batch 24
Completed batch 25
Completed batch 28
Completed batch 33
Completed batch 30
Completed batch 29
Completed batch 27
Completed batch 34
Completed batch 32
Completed batch 36
Completed batch 35
Completed batch 40
Completed batch 37
Completed batch 42
Completed batch 39
Completed batch 31
Completed batch 38
Completed batch 45
Completed batch 44
Completed batch 43
Completed batch 41


In [55]:
# Turn the accumulated row lists into tables, labelling the columns with the
# layouts defined earlier.
abstracts = pd.DataFrame(data=all_abstracts_data, columns=abstract_col)
papers = pd.DataFrame(data=all_papers_data, columns=paper_col)

In [56]:
# Row counts of the two tables (before de-duplication); they should match,
# since one row is appended to each per paper.
n_paper_rows, n_abstract_rows = papers.shape[0], abstracts.shape[0]
print(n_paper_rows, n_abstract_rows)

16497 16497


In [60]:
# A work is returned once per batch that contains one of its authors; keep the
# first row for each work id and renumber the index from 0.
papers = papers.drop_duplicates(subset='id', keep='first', ignore_index=True)
papers

Unnamed: 0,id,publication_year,cited_by_count,author_ids
0,https://openalex.org/W3103362336,2009,7042,https://openalex.org/A5014647140;https://opena...
1,https://openalex.org/W2047940964,2004,6955,https://openalex.org/A5014647140;https://opena...
2,https://openalex.org/W2157082398,2008,2112,https://openalex.org/A5014647140;https://opena...
3,https://openalex.org/W2061099285,2010,927,https://openalex.org/A5069948947;https://opena...
4,https://openalex.org/W2095072199,2005,795,https://openalex.org/A5014647140
...,...,...,...,...
14602,https://openalex.org/W593155841,1988,7,https://openalex.org/A5002897462;https://opena...
14603,https://openalex.org/W4239569444,2002,6,https://openalex.org/A5084116084;https://opena...
14604,https://openalex.org/W151312201,1992,7,https://openalex.org/A5007176508
14605,https://openalex.org/W2058529988,1993,7,https://openalex.org/A5007176508


In [61]:
# Same de-duplication for the abstracts table: first row per work id, index
# renumbered from 0.
abstracts = abstracts.drop_duplicates(subset='id', keep='first', ignore_index=True)
abstracts

Unnamed: 0,id,title,abstract_inverted_index
0,https://openalex.org/W3103362336,Power-Law Distributions in Empirical Data,"{'Power-law': [0], 'distributions': [1], 'occu..."
1,https://openalex.org/W2047940964,Finding community structure in very large netw...,"{'The': [0, 147], 'discovery': [1], 'and': [2,..."
2,https://openalex.org/W2157082398,Hierarchical structure and the prediction of m...,
3,https://openalex.org/W2061099285,Performance of modularity maximization in prac...,"{'Although': [0], 'widely': [1], 'used': [2], ..."
4,https://openalex.org/W2095072199,Finding local community structure in networks,"{'Although': [0], 'the': [1, 18, 26, 46, 57, 7..."
...,...,...,...
14602,https://openalex.org/W593155841,AN INVESTIGATION INTO THE USE AND USEFULNESS O...,"{'Computer': [0], 'security': [1, 28, 78, 114,..."
14603,https://openalex.org/W4239569444,Transfer of Information Technology to the Arab...,"{'The': [0, 62, 131, 162], 'complex': [1, 154]..."
14604,https://openalex.org/W151312201,MODAL DESCRIPTIONS FOR RECOGNITION AND TRACKING,
14605,https://openalex.org/W2058529988,Surface Interpolation Networks,"{'Orthogonal': [0], 'wavelets': [1], 'can': [2..."


In [64]:
# Flatten the ';'-joined author_ids column into one list holding an entry per
# (paper, author) pair. Splitting row by row and dropping empty pieces avoids
# the phantom '' entry that join-then-split produces for an empty table or a
# blank author_ids cell, which would inflate both counts below.
author_ids = [
    author_id
    for joined_ids in papers['author_ids']
    for author_id in joined_ids.split(';')
    if author_id
]

len(author_ids)

54837

In [65]:
# Number of distinct author ids across all collected papers
len(set(author_ids))

21875