# Query and Process First Author Publications

#### Updated: Sep 27, 2022

#  

Query and process subset of first author publications for a country of interest, over the past 5 years. Make functions generalizable for any country. 

In [14]:
import pandas as pd
import numpy as np
from datetime import date
import ast
import requests
import time
import os

#  

#### Functions:

In [15]:
def construct_query_string_cursor(endpoint, filters, cursor='*'):
    
    filter_param = f'filter={",".join(filters)}'
    filtered_works_url = f'https://api.openalex.org/{endpoint}?mailto=naidoo@ualberta.ca&{filter_param}&per-page=200&cursor={cursor}'
    print(filtered_works_url)
    
    return filtered_works_url

In [16]:
def isolate_country_first_auth_pubs(country_code, df_resp):

    country_first_indicies = []
    for i in range(0, len(df_resp)):
        authorships_dict = df_resp['authorships'][i]
        for j in range(0, len(authorships_dict)):
            if authorships_dict[j]['author_position'] == 'first': 
                first_auth_insts = authorships_dict[j]['institutions']
                if len(first_auth_insts) > 0:
                    for k in range(0, len(first_auth_insts)): 
                        if len(first_auth_insts[k]) > 0:
                            if first_auth_insts[k]['country_code'] == country_code:
                                country_first_indicies.append(i)
                        
    df_country_first = df_resp.iloc[country_first_indicies]
    
    return df_country_first

#  

#### Select the bilateral relationship, and timeframe of interest:

In [17]:
country_code = 'GB'
country_code_collab = 'ES'
years_back = 5

In [18]:
dataDir = '/Users/rnaidoo/Documents/Canada-Secure/GAC/2022_MDRID/Projects_data/OpenAlex/works_' + country_code + '_first_auth/'
dataDir_col = dataDir + country_code_collab + '_col/'
os.makedirs(dataDir_col, exist_ok=True)

In [19]:
current_year = date.today().year
year_i = current_year - years_back
year_f = current_year + 1

In [20]:
year_i

2017

In [21]:
year_f

2023

#  

#### Query yearly batches, for the last 5 years.

In [None]:
for year in range(year_i, year_f): #range(2017, 2023)

    endpoint = 'works'
    filters = [('institutions.country_code:' + country_code), 'publication_year:' + str(year)] #'publication_year:>2017'
    save_filename = 'works_' + country_code + '_' + str(year) + '.csv'
    batch_size = 50

    start = time.time()

    #Start cursor pagination
    filtered_works_url = construct_query_string_cursor(endpoint=endpoint, filters=filters)
    response = requests.get(filtered_works_url)
    response_count = response.json()['meta']['count']
    print('Total number of results: ' + str(response_count))
    pages = round(np.floor(response_count/200))
    remainder = np.remainder(response_count, 200)
    print('Processing ' + str(pages) + ' additional pages...')
    df_response = pd.DataFrame(response.json()['results'])
    df_country = isolate_country_first_auth_pubs(country_code=country_code, df_resp=df_response)
    df_country.to_csv(dataDir + save_filename, index=False)
    next_cursor = response.json()['meta']['next_cursor']

    #Process batches of pages
    batches = round(np.floor(pages/batch_size))
    batch_remainder = np.remainder(pages, batch_size)

    #Loop through pages in batch
    for i in range(0, batches):
        df_batch = pd.DataFrame()
        for j in range(0, batch_size):
            filtered_works_url = construct_query_string_cursor(endpoint=endpoint, filters=filters, cursor=next_cursor)
            response = requests.get(filtered_works_url)
            df_response = pd.DataFrame(response.json()['results'])
            df_resp_proc = isolate_country_first_auth_pubs(country_code=country_code, df_resp=df_response)
            df_batch = pd.concat([df_batch, df_resp_proc])
            next_cursor = response.json()['meta']['next_cursor']
        df_country_load = pd.read_csv(dataDir + save_filename)
        df_country = pd.concat([df_country_load, df_batch])
        df_country.to_csv(dataDir + save_filename, index=False)

    #Loop through pages in final batch
    df_batch = pd.DataFrame()
    for i in range(0, batch_remainder):
        filtered_works_url = construct_query_string_cursor(endpoint=endpoint, filters=filters, cursor=next_cursor)
        response = requests.get(filtered_works_url)
        df_response = pd.DataFrame(response.json()['results'])
        df_resp_proc = isolate_country_first_auth_pubs(country_code=country_code, df_resp=df_response)
        df_batch = pd.concat([df_batch, df_resp_proc])
        next_cursor = response.json()['meta']['next_cursor']
    df_country_load = pd.read_csv(dataDir + save_filename)
    df_country = pd.concat([df_country_load, df_batch])
    df_country.to_csv(dataDir + save_filename, index=False)

    end = time.time()   
    t = end - start
    print('Runtime: ' + str(round(t/60, 1)) + ' min')

#  

#### Identify those publications with collaborators from the other country of interest:

In [8]:
for file in os.listdir(dataDir):
    if 'works_' + country_code + '_' in file:
        print('Processing... ' + file)
        df1 = pd.read_csv(dataDir + file)
        df1['first_author'] = pd.Series()
        df1['first_auth_orcid'] = pd.Series()
        df1['first_auth_inst'] = pd.Series()
        df1['first_auth_country'] = pd.Series()
        df1['collaborators_of_interest'] = pd.Series()
        
        for i in range(0, len(df1)):
            auth_list_ = df1.iloc[i]['authorships']
            auth_list = ast.literal_eval(auth_list_)

            #Get first author details
            if auth_list[0]['author_position'] == 'first':
                if len(auth_list[0]['institutions'][0]) > 0:
                    if auth_list[0]['institutions'][0]['country_code'] == country_code:
                        if len(auth_list[0]['author']) > 0:
                            first_auth = auth_list[0]['author']['display_name']
                            first_auth_orcid = auth_list[0]['author']['orcid']
                            first_auth_inst = auth_list[0]['institutions'][0]['display_name']
                            first_auth_country = auth_list[0]['institutions'][0]['country_code']

            #Get collaborator authors' details
            collab_list = {}
            df_collabs = pd.DataFrame()
            for auth in auth_list:
                if len(auth) > 0:
                    for inst in auth['institutions']:
                        if len(inst) > 0:
                            if inst['country_code'] == country_code_collab:
                                if len(auth['author']) > 0:
                                    col_auth = auth['author']['display_name']
                                    col_auth_orcid = auth['author']['orcid']
                                    col_auth_inst = inst['display_name']
                                    col_auth_country = inst['country_code']
                                    col_auth_dict = {
                                        'col_auth_orcid': col_auth_orcid,
                                        'col_auth_inst': col_auth_inst,
                                        'col_auth_country': col_auth_country
                                    }
                                    collab_list[col_auth] = col_auth_dict

            #Add columns
            df1.iloc[i,26] = first_auth
            df1.iloc[i,27] = first_auth_orcid
            df1.iloc[i,28] = first_auth_inst
            df1.iloc[i,29] = first_auth_country
            df1.iloc[i,30] = str(collab_list)
        
        df1.to_csv(dataDir_col + file[:-4] + '_' + country_code_collab + '_col_proc.csv', index=False)

Processing... works_GB_2017.csv


  df1['first_author'] = pd.Series()
  df1['first_auth_orcid'] = pd.Series()
  df1['first_auth_inst'] = pd.Series()
  df1['first_auth_country'] = pd.Series()
  df1['collaborators_of_interest'] = pd.Series()


Processing... works_GB_2019.csv


  df1['first_author'] = pd.Series()
  df1['first_auth_orcid'] = pd.Series()
  df1['first_auth_inst'] = pd.Series()
  df1['first_auth_country'] = pd.Series()
  df1['collaborators_of_interest'] = pd.Series()


Processing... works_GB_2018.csv


  df1['first_author'] = pd.Series()
  df1['first_auth_orcid'] = pd.Series()
  df1['first_auth_inst'] = pd.Series()
  df1['first_auth_country'] = pd.Series()
  df1['collaborators_of_interest'] = pd.Series()


Processing... works_GB_2020.csv


  df1['first_author'] = pd.Series()
  df1['first_auth_orcid'] = pd.Series()
  df1['first_auth_inst'] = pd.Series()
  df1['first_auth_country'] = pd.Series()
  df1['collaborators_of_interest'] = pd.Series()


Processing... works_GB_2021.csv


  df1['first_author'] = pd.Series()
  df1['first_auth_orcid'] = pd.Series()
  df1['first_auth_inst'] = pd.Series()
  df1['first_auth_country'] = pd.Series()
  df1['collaborators_of_interest'] = pd.Series()


Processing... works_GB_2022.csv


  df1['first_author'] = pd.Series()
  df1['first_auth_orcid'] = pd.Series()
  df1['first_auth_inst'] = pd.Series()
  df1['first_auth_country'] = pd.Series()
  df1['collaborators_of_interest'] = pd.Series()


Combine yearly batches, remove duplicate publications, split by year again:

In [10]:
df_total = pd.DataFrame()
for file in os.listdir(dataDir_col):
    if 'works_' + country_code in file:
        if '_proc' in file:
            df_load = pd.read_csv(dataDir_col + file)
            print('Opening ' + file + '...rows: ' + str(len(df_load)))
            df_total = pd.concat([df_total, df_load])
            
df_total = df_total.sort_values('publication_date')
df1 = df_total.drop_duplicates(subset=['id'])
df1.to_csv(dataDir_col + 'works_' + country_code + '_first_auth_' + country_code_collab + '_col_since2017.csv', index=False)

df1 = pd.read_csv(dataDir_col + 'works_' + country_code + '_first_auth_' + country_code_collab + '_col_since2017.csv')
pub_years = df1['publication_year'].unique()
for year in pub_years:
    df1_y = df1.loc[df1['publication_year'] == year]
    df1_y.to_csv(dataDir_col + 'works_' + country_code + '_first_auth_' + str(year) + '_' + country_code_collab + '_col_proc2.csv', index=False)

Opening works_GB_2018_ES_col_proc.csv...rows: 153311
Opening works_GB_2017_ES_col_proc.csv...rows: 156015
Opening works_GB_2022_ES_col_proc.csv...rows: 112760
Opening works_GB_2019_ES_col_proc.csv...rows: 161160
Opening works_GB_2021_ES_col_proc.csv...rows: 187462
Opening works_GB_2020_ES_col_proc.csv...rows: 172384


#  

#### Process data to surface concepts related to each publication, by year:

In [None]:
for year in range(year_i, year_f): #range(2017, 2023)
    
    df_pubs = pd.read_csv(dataDir_col + 'works_' + country_code + '_first_auth_' + str(year) + '_' + country_code_collab + '_col_proc2.csv')
    print(len(df_pubs))
    
    batch_size = 10000

    start = time.time()
    ind = 0
    for i in range(0, len(df_pubs)):
        if i == 0:
            start_batch = time.time()
            df_concepts_batch = pd.DataFrame()

        pub_id = df_pubs.iloc[i]['id']
        #Get concepts
        concepts_ = df_pubs.iloc[i]['concepts']
        concepts = ast.literal_eval(concepts_)
        for j in range(0, len(concepts)):
            ind+=1
            row = pd.DataFrame(data={
                'pub_id': pub_id,
                'concept': concepts[j]['display_name'],
                'concept_id': concepts[j]['id'],
                'concept_level': concepts[j]['level']
            }, index=[ind])
            df_concepts_batch = pd.concat([df_concepts_batch, row]) 

        if (np.remainder(i, batch_size) == 0) & (i>0):
            if i == batch_size:
                df_concepts_batch.to_csv(dataDir + 'concepts_by_pub_' + country_code + '_first_auth_' + str(year) + '.csv', index=False)
            else:
                df_concepts = pd.read_csv(dataDir + 'concepts_by_pub_' + country_code + '_first_auth_' + str(year) + '.csv')
                df_concepts = pd.concat([df_concepts, df_concepts_batch])
                df_concepts.to_csv(dataDir + 'concepts_by_pub_' + country_code + '_first_auth_' + str(year) + '.csv', index=False)
                print(str(len(df_concepts)) + ' rows saved.')
            end_batch = time.time()
            t_batch = end_batch - start_batch
            rate_batch = batch_size/t_batch
            t_remaining = round(((len(df_pubs) - i)/rate_batch)/60,0)
            print('Processing row ' + str(i) + '...at rate ' + str(round(rate_batch,0)) + ' rows/s...' + str(t_remaining) + ' min estimated remaining.')
            start_batch = time.time()
            df_concepts_batch = pd.DataFrame()

    end = time.time()   
    t = end - start
    print('Runtime: ' + str(round(t/60, 1)) + ' min')
    
    df_concepts = pd.read_csv(dataDir + 'concepts_by_pub_' + country_code + '_first_auth_' + str(year) + '.csv')
    df_concepts['concept'] = df_concepts['concept'].str.lower()
    df_concepts.to_csv(dataDir + 'concepts_by_pub_' + country_code + '_first_auth_' + str(year) + '.csv', index=False)

Combine yearly batches:

In [None]:
df_total = pd.DataFrame()
for file in os.listdir(dataDir):
    if 'concepts_by_pub_' + country_code + '_first_auth_' in file:
        #print(file)
        df_load = pd.read_csv(dataDir + file)
        print('Opening ' + file + '...rows: ' + str(len(df_load)))
        df_total = pd.concat([df_total, df_load])

df_total = df_total.sort_values('concept_level').reset_index(drop=True)
df_total.to_csv(dataDir + 'concepts_by_pub_' + country_code + '_first_auth_since2017.csv', index=False)