In [None]:
import requests
import pandas as pd
import tqdm
import unicodedata
import ast

def normalize_name(name):
    """
    Return a canonical form of ``name`` suitable for comparisons.

    The string is decomposed with NFD, combining marks (the accents split off
    by the decomposition) are dropped, every remaining character that is
    neither alphanumeric nor whitespace is removed, and the result is
    lower-cased.
    """

    kept_chars = []
    for char in unicodedata.normalize('NFD', name):
        if unicodedata.category(char) == 'Mn':
            continue  # combining mark, e.g. the accent of a decomposed "é"
        if char.isalnum() or char.isspace():
            kept_chars.append(char)

    return ''.join(kept_chars).lower()


def get_info_from_names(df_names):
    """
    Retrieves author information from names (through an OpenAlex search).

    For every entry of ``df_names["name"]`` the ``/authors`` endpoint is
    searched, and the first hit whose normalized display name equals the
    normalized query name is kept. A name is skipped (with a printed message
    on errors) when no hit matches, when the response cannot be used, or when
    the matching hit lacks one of the extracted fields.

    Parameters
    ----------
    df_names : pandas.DataFrame
        Frame with a "name" column holding the names to look up.

    Returns
    -------
    pandas.DataFrame
        One row per matched name with the columns id, display_name,
        works_api_url, h_index, works_count and country_code.
    """

    BASE_URL = 'https://api.openalex.org'
    RESOURCE = '/authors'

    # Iterate over the values rather than looking rows up by label: with a
    # non-default index (e.g. a filtered frame) `df_names["name"][i]` raises
    # KeyError, which would be caught below and silently skip the name.
    query_names = df_names["name"].tolist()

    data = []
    for i, query_name in enumerate(tqdm.tqdm(query_names)):
        try:
            response = requests.get(BASE_URL + RESOURCE, params={"search": query_name})
            all_results = response.json()["results"]

            normalized_df_name = normalize_name(query_name)

            # First search hit whose normalized display name matches exactly
            results = next(
                (hit for hit in all_results
                 if normalize_name(hit["display_name"]) == normalized_df_name),
                None,
            )
            if results is None:
                continue

            data.append([
                results["id"],
                results["display_name"],
                results["works_api_url"],
                results["summary_stats"]["h_index"],
                results["works_count"],
                # TypeError here (institution is None) is caught below
                results["last_known_institution"]["country_code"],
            ])

        except (IndexError, KeyError, ValueError, TypeError) as e:
            print(f"Skipping data point at iteration {i} due to error: {e}")
            continue  # Skip the rest of the loop and proceed to the next iteration
        except requests.exceptions.RequestException as e:
            print(f"Request error at iteration {i}: {e}")
            continue

    new_df = pd.DataFrame(data, columns=["id","display_name","works_api_url","h_index","works_count", "country_code"])
    return new_df

def get_info_from_author_ids(author_ids):
    """
    Gets info table from author ids.

    The ids are queried from the OpenAlex ``/authors`` endpoint in batches.
    Only the last '/'-separated segment of each id is sent, so both full id
    URLs and short ids are accepted. Authors lacking any of the extracted
    fields are skipped.

    Parameters
    ----------
    author_ids : iterable of str
        OpenAlex author ids.

    Returns
    -------
    pandas.DataFrame
        Columns: id, display_name, works_api_url, h_index, works_count,
        country_code.
    """

    URL = 'https://api.openalex.org/authors'
    BATCH_SIZE = 25

    short_ids = [author_id.split('/')[-1] for author_id in author_ids]
    id_batches = [short_ids[i:i+BATCH_SIZE] for i in range(0, len(short_ids), BATCH_SIZE)]

    co_author_info = []

    # Iterating the list directly lets tqdm show the total number of batches
    for batch_number, ids_batch in enumerate(tqdm.tqdm(id_batches)):

        # search for result; ask for a page large enough to hold the whole batch
        params = {'filter': 'ids.openalex:'+'|'.join(ids_batch), 'per-page': BATCH_SIZE}
        result_batch = requests.get(URL, params=params).json()

        # A payload without 'results' (e.g. an API error message) must not
        # abort the whole run; report it and move on to the next batch.
        if 'results' not in result_batch:
            print(f"Skipping id batch {batch_number}: response has no 'results'")
            continue

        # Go through results (authors)
        for result in result_batch['results']:

            # if person doesnt have all info, skip person
            try:
                # extract desired information
                person_info = [
                    result['id'],
                    result['display_name'],
                    result['works_api_url'],
                    result['summary_stats']['h_index'],
                    result['works_count'],
                    result['last_known_institution']['country_code'],
                ]
            except (KeyError, TypeError):
                # KeyError: field missing; TypeError: nested object is None
                continue

            co_author_info.append(person_info)

    co_author_info_df = pd.DataFrame(co_author_info, columns=['id', 'display_name', 'works_api_url', 'h_index', 'works_count', 'country_code'])

    return co_author_info_df

def get_concept_ids(concept_requirements):
    """
    Look up the OpenAlex concept id for every name in ``concept_requirements``.

    Each name is searched on the ``/concepts`` endpoint with the filter
    ``level:0`` and the first hit is used. The returned ids are the short
    form, i.e. the last '/'-separated segment of the id in the response.
    """

    endpoint = 'https://api.openalex.org' + '/concepts'

    full_ids = []
    for concept_name in concept_requirements:
        query = {'search': concept_name, 'filter': 'level:0'}
        payload = requests.get(endpoint, params=query).json()
        first_hit = payload['results'][0]
        full_ids.append(first_hit['id'])

    # Keep only the trailing id segment
    return [full_id.split("/")[-1] for full_id in full_ids]


def get_articles_from_authors(names, concept_ids_requirements_1, concept_ids_requirements_2, subset=False):
    """
    Extracts articles from authors in the names table.

    Authors are first restricted to those with 5-5000 works. Their works are
    then requested from the OpenAlex ``/works`` endpoint in batches of 25
    authors, with the filters cited_by_count > 10, authors_count < 10, and one
    ``concepts.id`` filter per concept-id list.

    Parameters
    ----------
    names : pandas.DataFrame
        Author table with at least the columns "id" and "works_count".
    concept_ids_requirements_1, concept_ids_requirements_2 : list of str
        Concept ids used for the two ``concepts.id`` filters.
    subset : bool, default False
        If True only the first author batch is processed (short test run).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        table1 with columns id, publication_year, cited_by_count, authors
        (list of author ids) and table2 with columns id, title,
        abstract_inverted_index.
    """

    BASE_URL = 'https://api.openalex.org'
    RESOURCE = '/works'
    BATCH_SIZE = 25

    # Filter out authors not having 5-5000 works
    names = names[(names['works_count']>=5) & (names['works_count']<=5000)]

    table1 = []
    table2 = []

    # Search for articles in batches of 25 authors
    author_ids = list(names['id'])
    name_batches = [author_ids[i:i+BATCH_SIZE] for i in range(0, len(author_ids), BATCH_SIZE)]

    # short version for testing
    if subset:
        name_batches = name_batches[:1]

    # These filters do not depend on the batch or the page, so build them once
    fixed_filters = ['cited_by_count:>10',
                     'authors_count:<10']
    concept_filters = ['concepts.id:'+'|'.join(concept_ids_requirements_1),
                       'concepts.id:'+'|'.join(concept_ids_requirements_2)]

    for num_name_batch, name_batch in enumerate(tqdm.tqdm(name_batches)):

        filters = fixed_filters + ['authorships.author.id:'+'|'.join(name_batch)] + concept_filters
        filter_string = ','.join(filters)

        # Scroll through the results
        cursor = '*'
        while cursor is not None:
            parameters = {'per-page': 200,
                          'filter': filter_string,
                          'cursor': cursor
                          }
            result = requests.get(BASE_URL + RESOURCE, params=parameters).json()
            articles = result['results']

            # Go through all articles and extract information.
            # This happens before the end-of-results check so that a page that
            # carries results together with a null next_cursor is not lost.
            for n_article, article in enumerate(articles):
                try:
                    tab1 = [article['id'], article['publication_year'], article['cited_by_count'], [author['author']['id'] for author in article['authorships']]]
                    tab2 = [article['id'], article['title'], article['abstract_inverted_index']]
                except (KeyError, TypeError):
                    # KeyError: field missing; TypeError: nested object is None
                    print("skipped name batch:", num_name_batch, "article:", n_article)
                    continue

                table1.append(tab1)
                table2.append(tab2)

            # An empty page or a missing next cursor marks the end of the results
            cursor = result['meta']['next_cursor'] # next page for next search
            if len(articles) == 0:
                break

    table1 = pd.DataFrame(table1, columns=['id', 'publication_year', 'cited_by_count', 'authors'])
    table2 = pd.DataFrame(table2, columns=['id', 'title', 'abstract_inverted_index'])

    return table1, table2


Turning names-list into the author dataset and saving as CSV

In [None]:
# Load the names-list from Part 1 as a pandas dataframe
names = pd.read_csv('data/names.csv')

# Look up the info for every name and keep a single row per author id
authors = get_info_from_names(names).drop_duplicates(subset='id')

# Write the author table to disk
authors.to_csv("data/authors.csv", index=False)

Expand the author dataset with co-authors and get the final author dataset

In [None]:
# load authors and papers
authors = pd.read_csv('data/authors.csv')
papers = pd.read_csv('data/papers.csv')

# Find the co-author IDs.
# NOTE: `x in some_series` tests the Series *index*, not its values, so the
# known ids are put in a set; otherwise no original author is ever excluded.
author_ids = authors["id"]
known_author_ids = set(author_ids)

# The "authors" column holds stringified lists; parse them and flatten to unique ids
paper_author_ids = papers["authors"].apply(ast.literal_eval).explode().dropna().unique()
co_authors_ids = [x for x in paper_author_ids if x not in known_author_ids]

# Get info from author ids
co_author_info = get_info_from_author_ids(co_authors_ids)

# Drop rows in co_author_info if "nan" in country_code
co_author_info = co_author_info.dropna(subset=['country_code'])

# Drop duplicates
co_author_info = co_author_info.drop_duplicates(subset='id')

# Concatenate the original authors and the co-authors dataframes
authors_final = pd.concat([authors, co_author_info]).drop_duplicates(subset=['id'])

# Save authors to csv
co_author_info.to_csv("data/co_authors.csv", index=False)
authors_final.to_csv("data/authors_final.csv", index=False)


Find only the co-authors' works and combine them with the original authors' works

In [None]:
# Concept names of the two requirement groups, and their concept ids
concepts_requirements_1 = ['Sociology', 'Psychology', 'Economics', 'Political Science']
concepts_requirements_2 = ['Mathematics', 'Physics', 'Computer Science']
concept_ids_1, concept_ids_2 = [
    get_concept_ids(group)
    for group in (concepts_requirements_1, concepts_requirements_2)
]

# Fetch the papers and abstracts of the co-authors (full run, not the test subset)
papers_from_co_authors, abstracts_from_co_authors = get_articles_from_authors(
    co_author_info,
    concept_ids_1,
    concept_ids_2,
    subset=False,
)

# Load the original papers/abstracts
papers = pd.read_csv('data/papers.csv')
abstracts = pd.read_csv('data/abstracts.csv')

# Append the co-authored papers/abstracts and keep one row per id
papers_final = pd.concat([papers, papers_from_co_authors]).drop_duplicates(subset=['id'])
abstracts_final = pd.concat([abstracts, abstracts_from_co_authors]).drop_duplicates(subset=['id'])

# Only the papers are written to csv
papers_final.to_csv("data/papers_final.csv", index=False)
# The final abstracts dataset is not saved in our Github, as it is very large.
# abstracts_final.to_csv("data/abstracts_final.csv", index=False)


Remove from each paper's author list any author who is neither an IC2S2 author nor an IC2S2 co-author.

In [None]:
papers_final = pd.read_csv("data/papers_final.csv")
authors_final = pd.read_csv("data/authors_final.csv")

# Drop duplicates from papers
papers_final = papers_final.drop_duplicates(subset=['id'])

papers_final_copy = papers_final.copy()

# Set of known author ids: constant-time membership instead of scanning the
# whole id column once per paper author
known_author_ids = set(authors_final["id"])

# The "authors" column holds stringified lists; parse them back into lists
parsed_author_lists = papers_final_copy["authors"].apply(ast.literal_eval)

# Drop authors that are not in the authors_final dataframe
filtered_author_lists = [
    [author for author in authors_list if author in known_author_ids]
    for authors_list in tqdm.tqdm(parsed_author_lists)
]

# Assign by the frame's own index. Writing with `.at[i, ...]` and a positional
# counter only hits the right rows while the index is exactly 0..n-1, which no
# longer holds once drop_duplicates has removed a row.
papers_final_copy["authors"] = pd.Series(filtered_author_lists, index=papers_final_copy.index, dtype=object)

# Save papers to csv
papers_final_copy.to_csv("data/papers_final.csv", index=False)