# Regional Profile II

#### Updated: Sep 27, 2022

#  

Generalize regional profile functions:
- to default to country-level when region not specified
- to show region-of-interest first author - CA collab, and CA first author - region of interest collab

And, combine search results by concept and title search. 

In [1]:
import pandas as pd
import numpy as np
import ast
import os

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
dataDir = '/Users/rnaidoo/Documents/Canada-Secure/GAC/2022_MDRID/Projects_data/OpenAlex/'

#  

#### Functions:

Define region:

In [4]:
def search_institutions(df_pubs, inst_search_terms=[]):
    
    if len(inst_search_terms) > 0 : 
        inst_search_results = []
        for search_term in inst_search_terms:
            inst_search_results = inst_search_results + list(df_pubs.loc[df_pubs['first_auth_inst'].str.contains(search_term)]['first_auth_inst'].unique())
        df_insts = pd.DataFrame({'inst_search': inst_search_results})
        df_pubs2 = df_pubs.merge(df_insts, how='inner', left_on='first_auth_inst', right_on='inst_search')
        
    else:
        inst_search_results = list(df_pubs['first_auth_inst'].unique())
        df_insts = pd.DataFrame({'inst_search': inst_search_results})
        df_pubs2 = df_pubs
        
    return [df_insts, df_pubs2]

Summarize region:

In [5]:
def top_institutions(df_pubs, reg_name, top=100, search_query=''):
    
    df_pubs['cited_by_count'] = pd.to_numeric(df_pubs['cited_by_count'])
    df_pubs['publications'] = 1
    
    df_top_inst = df_pubs[['cited_by_count', 'publications', 'first_auth_inst']].groupby(['first_auth_inst']).sum()
    df_top_inst = df_top_inst.sort_values('cited_by_count', ascending=False)
    df_top_inst = df_top_inst.replace(np.nan, '')
    df_top_inst = df_top_inst.rename(columns={
        'cited_by_count': 'by citations',
        'publications': 'by publications'
    })
    if len(df_top_inst) < top:
        top = len(df_top_inst)
    table_title = 'Top-' + str(top) + ' research institutions in ' + reg_name
    if len(search_query) > 0:
        table_title = table_title + " for '" + search_query + "'"
    df_top_inst.index.names = [table_title]
    
    return df_top_inst.head(top)

In [6]:
def top_researchers(df_pubs, reg_name, top=100, search_query=''):
    
    df_pubs['cited_by_count'] = pd.to_numeric(df_pubs['cited_by_count'])
    df_pubs['publications'] = 1
    
    df_top_ac = df_pubs[['cited_by_count', 'publications', 'first_author']].groupby(['first_author']).sum()
    df_top_ac = df_top_ac.sort_values('cited_by_count', ascending=False)
    df_top_ac2 = df_top_ac.merge(df_pubs[['first_author', 'first_auth_inst', 'first_auth_orcid']], how='left', on='first_author')
    df_top_ac2 = df_top_ac2.replace(np.nan, '')
    df_top_ac2 = df_top_ac2.rename(columns={
        'cited_by_count': 'by citations',
        'publications': 'by publications',
        'first_auth_inst': 'institution',
        'first_auth_orcid': 'orcid'
    })
    df_top_ac2 = df_top_ac2.drop_duplicates(subset=['orcid'])
    df_top_ac2 = df_top_ac2.drop_duplicates(subset=['first_author'])
    df_top_ac2 = df_top_ac2.set_index('first_author')
    if len(df_top_ac2) < top:
        top = len(df_top_ac2)
    table_title = 'Top-' + str(top) + ' academics in ' + reg_name
    if len(search_query) > 0:
        table_title = table_title + " for '" + search_query + "'"
    df_top_ac2.index.names = [table_title]
    
    return df_top_ac2.head(top)

Explore collaborations:

In [7]:
def proc_collab(df_in):
    
    for i in range(0, len(df_in)):
        collab_dict_ = df_in['collaborators_of_interest'].iloc[i]
        if collab_dict_ != '{}':
            collab_dict = ast.literal_eval(collab_dict_)
            if len(collab_dict) > 0:
                collab_str = ''
                for auth in collab_dict:
                    if collab_str != '':
                        collab_str += ', '
                    collab_str += auth + ' (' + collab_dict[auth]['col_auth_inst']
                    if collab_dict[auth]['col_auth_orcid'] == None:
                        collab_str += ')'
                    else: 
                        collab_str += ', ' + collab_dict[auth]['col_auth_orcid'] + ')'
                df_in['collaborators_of_interest'].iloc[i] = collab_str
    
    return df_in

In [8]:
def proc_collab_inv(df_in, reg_insts):
    for i in range(0, len(df_in)):
        collab_dict_ = df_in['collaborators_of_interest'].iloc[i]
        if collab_dict_ != '{}':
            collab_dict = ast.literal_eval(collab_dict_)
            if len(collab_dict) > 0:
                collab_str = ''
                for auth in collab_dict:
                    for reg_inst in reg_insts:
                        if collab_dict[auth]['col_auth_inst'] == reg_inst:
                            if collab_str != '':
                                collab_str += ', '
                            collab_str += auth + ' (' + collab_dict[auth]['col_auth_inst']
                            if collab_dict[auth]['col_auth_orcid'] == None:
                                collab_str += ')'
                            else: 
                                collab_str += ', ' + collab_dict[auth]['col_auth_orcid'] + ')'
                if collab_str == '':
                    collab_str = '{}'
                df_in['collaborators_of_interest'].iloc[i] = collab_str
    
    return df_in

In [9]:
def write_pubs_by_top_collab(df_result, reg_name, country_code_collab, dataDir_save, inv=False):
    
    df_result = df_result.replace(np.nan, '')
    df_result['collaborators_of_interest'] = df_result['collaborators_of_interest'].replace('{}', 'N')
    df_result = df_result.rename(columns={
        'first_auth_inst': 'Institution',
        'title': 'Publication Title',
        'publication_date': 'Publication Date',
        'cited_by_count': 'Citation Count',
        'first_auth_orcid': 'Academic Profile (ORCID)',
        'id': 'Publication Profile'
    })
    if inv:
        sheet_name_ = 'top-pubs with ' + reg_name + ' collab'
        writer = pd.ExcelWriter(dataDir_save + country_code_collab + '_works_' + reg_name + '_col.xlsx')
        df_result = df_result.rename(columns={
            'first_author': country_code_collab + ' Researcher',
            'collaborators_of_interest': reg_name + ' Collaborators'
        })
    else:        
        sheet_name_ = 'top-pubs with ' + country_code_collab + ' collab'
        writer = pd.ExcelWriter(dataDir_save + reg_name + '_works_' + country_code_collab + '_col.xlsx')
        df_result = df_result.rename(columns={
            'first_author': reg_name + ' Researcher',
            'collaborators_of_interest': country_code_collab + ' Collaborators'
        })    
    df_result.to_excel(writer, sheet_name=sheet_name_, index=False)
    
    # Auto-adjust columns' width
    for column in df_result:
        if column == 'Publication Title':
            column_width = 100
        elif column == country_code_collab + ' Collaborators':
            column_width = 75
        elif column == reg_name + ' Collaborators':
            column_width = 75
        else:
            column_width = max(df_result[column].astype(str).map(len).max(), len(column))
        col_idx = df_result.columns.get_loc(column)
        writer.sheets[sheet_name_].set_column(col_idx, col_idx, column_width)

    writer.save()

In [10]:
def top_collab_summary_table(df_col, df_pubs, top=100, inv=False, search_query=''):
    
    df_col['publications'] = 1
    df_top_ac = df_col[['cited_by_count', 'publications', 'first_author']].groupby(['first_author']).sum()
    df_top_ac = df_top_ac.sort_values('cited_by_count', ascending=False)
    df_top_ac2 = df_top_ac.merge(df_pubs[['first_author', 'first_auth_inst', 'first_auth_orcid']], how='left', on='first_author')
    df_top_ac2 = df_top_ac2.replace(np.nan, '')
    df_top_ac2 = df_top_ac2.rename(columns={
        'cited_by_count': 'by citations',
        'publications': 'by publications',
        'first_auth_inst': 'institution',
        'first_auth_orcid': 'orcid'
    })
    df_top_ac2 = df_top_ac2.drop_duplicates(subset=['orcid'])
    df_top_ac2 = df_top_ac2.drop_duplicates(subset=['first_author'])
    df_top_ac2 = df_top_ac2.set_index('first_author')
    if len(df_top_ac2) < top:
        top = len(df_top_ac2)
    if inv:
        table_title = 'Top-' + str(top) + ' academics in ' + country_code_collab + ' that collaborated with ' + reg_name + ' researchers'
        if len(search_query) > 0:
            table_title = table_title + ", on \'" + search_query + "'"
        df_top_ac2.index.names = [table_title]        
    else:
        table_title = 'Top-' + str(top) + ' academics in ' + reg_name + ' that collaborated with ' + country_code_collab + ' researchers'
        if len(search_query) > 0:
            table_title = table_title + ", on \'" + search_query + "'"
        df_top_ac2.index.names = [table_title]
            
    return df_top_ac2.head(top)

In [11]:
def top_collab(df_pubs, reg_name, country_code_collab, dataDir_save, top=100):
    
    df_col = df_pubs.loc[df_pubs['collaborators_of_interest'] != '{}']
    
    df_result = df_col[['first_author', 'first_auth_inst', 'title', 'publication_date', 'cited_by_count', 'collaborators_of_interest', 'first_auth_orcid', 'id']]
    df_result = df_result.sort_values(['cited_by_count', 'publication_date'], ascending=[False, False]).reset_index(drop=True)
    df_result = df_result.drop_duplicates(subset=['id'], keep='first')
    print(str(len(df_result)) + ' unique papers retrieved.')
    
    #Process collaborators of interest
    df_result = proc_collab(df_in=df_result)
    
    #Write results to Excel spreadsheet
    write_pubs_by_top_collab(df_result=df_result, reg_name=reg_name, country_code_collab=country_code_collab, dataDir_save=dataDir_save)
    
    #Produce summary table
    df_top = top_collab_summary_table(df_col=df_col, df_pubs=df_pubs, top=top, inv=False)
    
    return df_top

In [12]:
def top_collab_inv(df_pubs, reg_name, reg_insts, country_code_collab, dataDir_save, top=100): #reg_insts from df_insts
    
    df_col = df_pubs.loc[df_pubs['collaborators_of_interest'] != '{}']
    df_result = df_col[['first_author', 'first_auth_inst', 'title', 'publication_date', 'cited_by_count', 'collaborators_of_interest', 'first_auth_orcid', 'id']]
    df_result = df_result.sort_values(['cited_by_count', 'publication_date'], ascending=[False, False]).reset_index(drop=True)
    df_col = df_col.drop_duplicates(subset=['id'], keep='first')
    
    #Find the papers where researchers in the region of interest are collaborators
    reg_col_indicies = []
    for i in range(0, len(df_pubs)):
        collab_dict_ = df_pubs['collaborators_of_interest'].iloc[i]
        collab_dict = ast.literal_eval(collab_dict_)
        if len(collab_dict) > 0:
            for auth in collab_dict:
                for reg_inst in reg_insts:
                    if collab_dict[auth]['col_auth_inst'] == reg_inst:
                        reg_col_indicies.append(i) 
    df_col = df_pubs.iloc[reg_col_indicies]
    print(str(len(df_result)) + ' unique papers retrieved.')
    
    #Process collaborators of interest
    df_result = proc_collab_inv(df_in=df_result, reg_insts=reg_insts)
    
    #Write results to Excel spreadsheet
    write_pubs_by_top_collab(df_result=df_result, reg_name=reg_name, country_code_collab=country_code_collab, dataDir_save=dataDir_save, inv=True)    
    
    #Produce summary table
    df_top = top_collab_summary_table(df_col=df_col, df_pubs=df_pubs, top=top, inv=True)
    
    return df_top

Surface top publications by keywords:

In [13]:
def search_concepts(search_query, df_concepts):
    
    search_terms = search_query.split(' AND ')
    for i in range(0, len(search_terms)):
        search_terms[i] = search_terms[i].lower()
    print(search_terms)
    for i in range(0, len(search_terms)):
        if i == 0:
            df_search = df_concepts.loc[df_concepts['concept'].str.contains(search_terms[i])]
        else:
            df_search_ = df_concepts.loc[df_concepts['concept'].str.contains(search_terms[i])]
            df_search_ = df_search_.rename(columns={
                'concept': 'concept'+str(i+1),
                'concept_id': 'concept_id'+str(i+1),
                'concept_level': 'concept_level'+str(i+1)
            })
            df_search = df_search.merge(df_search_, how='inner', on='pub_id')
    #print(str(len(df_search)) + ' papers related to search...')
    
    return df_search

In [14]:
def search_title(search_query, df_pubs, case_sens=False):
    
    df_pubs = df_pubs.fillna('')
    search_terms = search_query.split(' AND ')
    if not case_sens:
        for i in range(0, len(search_terms)):
            search_terms[i] = search_terms[i].lower()
        df_pubs['title'] = df_pubs['title'].str.lower()
    print(search_terms)
    for i in range(0, len(search_terms)):
        if i == 0:
            df_search = df_pubs.loc[df_pubs['title'].str.contains(search_terms[i])]
        else:
            df_search = df_search.loc[df_search['title'].str.contains(search_terms[i])]
    #print(str(len(df_search)) + ' papers related to search...')
    
    return df_search

In [15]:
def search(search_query, df_concepts, df_pubs, keep_case=False, inv=False, reg_insts=[]):
    
    df_search_conc_ = search_concepts(search_query=search_query, df_concepts=df_concepts)
    df_search_conc = retrieve_papers_byConcept(search_query=search_query, df_search=df_search_conc_, df_pubs=df_pubs, inv=inv, reg_insts=reg_insts)
    
    df_search_titl_ = search_title(search_query=search_query, df_pubs=df_pubs, case_sens=False)
    df_search_titl = retrieve_papers_byTitle(search_query=search_query, df_search=df_search_titl_, inv=inv, reg_insts=reg_insts)
    
    df_search = pd.concat([df_search_conc, df_search_titl])
    df_search = df_search.drop_duplicates(subset=['id'], keep='first')
    print(str(len(df_search)) + ' papers retrieved...')
    df_search = df_search.sort_values(['cited_by_count', 'publication_date'], ascending=[False, False]).reset_index(drop=True)
    
    return df_search

In [16]:
def retrieve_papers_byConcept(search_query, df_search, df_pubs, inv=False, reg_insts=[]):
    
    df_merge = df_pubs.merge(df_search, left_on='id', right_on='pub_id')
    #print(str(len(df_merge)) + ' papers retrieved...')
    df_result = df_merge[['first_author', 'first_auth_inst', 'concept', 'concept_level', 'title', 'publication_date', 'cited_by_count', 'collaborators_of_interest', 'first_auth_orcid', 'id']]
    df_result = df_result.sort_values(['cited_by_count', 'publication_date', 'concept_level'], ascending=[False, False, False]).reset_index(drop=True)
    df_result = df_result.drop_duplicates(subset=['id'], keep='first')
    #print(str(len(df_result)) + ' unique papers retrieved.')
    df_result = df_result.drop(columns=['concept_level'])
    
    #Process collaborators of interest
    if inv:
        df_result = proc_collab_inv(df_in=df_result, reg_insts=reg_insts)
    else:
        df_result = proc_collab(df_in=df_result)
      
    return df_result        

In [17]:
def retrieve_papers_byTitle(search_query, df_search, inv=False, reg_insts=[]):
    
    #print(str(len(df_search)) + ' papers retrieved...')
    df_result = df_search[['first_author', 'first_auth_inst', 'title', 'publication_date', 'cited_by_count', 'collaborators_of_interest', 'first_auth_orcid', 'id']]
    df_result = df_result.sort_values(['cited_by_count', 'publication_date'], ascending=[False, False]).reset_index(drop=True)
    df_result = df_result.drop_duplicates(subset=['id'], keep='first')
    #print(str(len(df_result)) + ' unique papers retrieved.')
    
    #Process collaborators of interest
    if inv:
        df_result = proc_collab_inv(df_in=df_result, reg_insts=reg_insts)
    else:
        df_result = proc_collab(df_in=df_result)
                
    return df_result        

In [18]:
def write_search_result(search_query, df_result, reg_name, country_code_collab, dataDir_save, inv=False):
    
    sheet_name_ = search_query[0:30]
    df_result = df_result.replace(np.nan, '')
    df_result['collaborators_of_interest'] = df_result['collaborators_of_interest'].replace('{}', 'N')
    df_result = df_result.rename(columns={
        'first_auth_inst': 'Institution',
        'title': 'Publication Title',
        'publication_date': 'Publication Date',
        'cited_by_count': 'Citation Count',
        'first_auth_orcid': 'Academic Profile (ORCID)',
        'id': 'Publication Profile'
    })
    os.makedirs(dataDir_save + 'works_by_search/', exist_ok=True)
    if inv:
        writer = pd.ExcelWriter(dataDir_save + 'works_by_search/' + country_code_collab + '-' + reg_name + '_by_' + search_query + '.xlsx')
        df_result = df_result.rename(columns={
            'first_author': country_code_collab + ' Researcher',
            'collaborators_of_interest': reg_name + ' Collaborators'
        })
    else:
        writer = pd.ExcelWriter(dataDir_save + 'works_by_search/' + reg_name + '-' + country_code_collab + '_by_' + search_query + '.xlsx')
        df_result = df_result.rename(columns={
            'first_author': reg_name + ' Researcher',
            'collaborators_of_interest': country_code_collab + ' Collaborators'
        }) 
    df_result.to_excel(writer, sheet_name=sheet_name_, index=False)
    
    # Auto-adjust columns' width
    for column in df_result:
        if column == 'Publication Title':
            column_width = 100
        elif column == country_code_collab + ' Collaborators':
            column_width = 75
        elif column == reg_name + ' Collaborators':
            column_width = 75
        else:
            column_width = max(df_result[column].astype(str).map(len).max(), len(column))
        col_idx = df_result.columns.get_loc(column)
        writer.sheets[sheet_name_].set_column(col_idx, col_idx, column_width)
        
    writer.save()

#  

#### Load requisite data:

In [19]:
country_code = 'ES'
country_code_collab = 'CA'

In [20]:
df_pubs_A_first_auth = pd.read_csv(dataDir + 'works_' + country_code + '_first_auth/' + country_code_collab + '_col/' + 'works_' + country_code + '_first_auth_' + country_code_collab + '_col_since2017.csv')
print(len(df_pubs_A_first_auth))
df_pubs_A_first_auth.head()

458602


Unnamed: 0,id,doi,title,display_name,publication_year,publication_date,ids,host_venue,type,open_access,...,abstract_inverted_index,cited_by_api_url,counts_by_year,updated_date,created_date,first_author,first_auth_orcid,first_auth_inst,first_auth_country,collaborators_of_interest
0,https://openalex.org/W2963641747,,"Joan Marcet e Lucía Medina (eds.), La política...","Joan Marcet e Lucía Medina (eds.), La política...",2017,2017-01-01,{'openalex': 'https://openalex.org/W2963641747...,"{'id': None, 'issn_l': None, 'issn': None, 'di...",,"{'is_oa': False, 'oa_status': None, 'oa_url': ...",...,,https://api.openalex.org/works?filter=cites:W2...,[],2022-08-08T10:54:07.500056,2019-07-30,Steven Forti,https://orcid.org/0000-0002-7027-0220,Instituto de Historia,ES,{}
1,https://openalex.org/W2472444605,https://doi.org/10.1061/(asce)ei.1943-5541.000...,Developing Topographic Surveying Software to T...,Developing Topographic Surveying Software to T...,2017,2017-01-01,{'openalex': 'https://openalex.org/W2472444605...,"{'id': 'https://openalex.org/V170370859', 'iss...",journal-article,"{'is_oa': False, 'oa_status': 'closed', 'oa_ur...",...,"{'AbstractIn': [0], 'this': [1], 'study,': [2]...",https://api.openalex.org/works?filter=cites:W2...,"[{'year': 2022, 'cited_by_count': 1}, {'year':...",2022-08-09T14:16:07.295905,2016-07-22,Miguel Castro-García,https://orcid.org/0000-0003-4157-6164,University of Castilla-La Mancha,ES,{}
2,https://openalex.org/W2478043544,https://doi.org/10.1016/j.rpsm.2016.04.002,Inducción de hipocapnia e hiperoxia con maniob...,Inducción de hipocapnia e hiperoxia con maniob...,2017,2017-01-01,{'openalex': 'https://openalex.org/W2478043544...,"{'id': 'https://openalex.org/V2898614270', 'is...",journal-article,"{'is_oa': False, 'oa_status': 'closed', 'oa_ur...",...,"{'Resumen': [0], 'Introduccion': [3], 'La': [5...",https://api.openalex.org/works?filter=cites:W2...,"[{'year': 2021, 'cited_by_count': 1}, {'year':...",2022-08-22T06:24:56.670484,2016-08-23,Aida de Arriba-Arnau,https://orcid.org/0000-0002-7877-7341,University of Barcelona,ES,{}
3,https://openalex.org/W2482508491,https://doi.org/10.1016/j.spinee.2016.08.007,Preoperative and postoperative sagittal plane ...,Preoperative and postoperative sagittal plane ...,2017,2017-01-01,{'openalex': 'https://openalex.org/W2482508491...,"{'id': 'https://openalex.org/V112180307', 'iss...",journal-article,"{'is_oa': False, 'oa_status': 'closed', 'oa_ur...",...,"{'Abstract': [0], 'Background': [3], 'Context'...",https://api.openalex.org/works?filter=cites:W2...,"[{'year': 2021, 'cited_by_count': 2}, {'year':...",2022-06-24,2016-08-23,Felisa Sánchez-Mariscal,,Hospital Universitario de Getafe,ES,{}
4,https://openalex.org/W2484989076,https://doi.org/10.1007/978-3-319-09096-2_9,Looking Into the Profile of Music Audiences,Looking Into the Profile of Music Audiences,2017,2017-01-01,{'openalex': 'https://openalex.org/W2484989076...,"{'id': 'https://openalex.org/V3121261024', 'is...",book-chapter,"{'is_oa': False, 'oa_status': 'closed', 'oa_ur...",...,"{'The': [0], 'main': [1, 98], 'aims': [2], 'of...",https://api.openalex.org/works?filter=cites:W2...,"[{'year': 2022, 'cited_by_count': 1}, {'year':...",2022-08-08T13:09:36.373379,2016-08-23,Víctor Fernández-Blanco,https://orcid.org/0000-0003-2096-9460,University of Oviedo,ES,{}


In [21]:
df_pubs_B_first_auth = pd.read_csv(dataDir + 'works_' + country_code_collab + '_first_auth/' + country_code + '_col/' + 'works_' + country_code_collab + '_first_auth_' + country_code + '_col_since2017.csv')
print(len(df_pubs_B_first_auth))
df_pubs_B_first_auth.head()

506581


Unnamed: 0,id,doi,title,display_name,publication_year,publication_date,ids,host_venue,type,open_access,...,abstract_inverted_index,cited_by_api_url,counts_by_year,updated_date,created_date,first_author,first_auth_orcid,first_auth_inst,first_auth_country,collaborators_of_interest
0,https://openalex.org/W2582434976,https://doi.org/10.1007/978-1-4842-2598-1_12,Using Resource Quotas,Using Resource Quotas,2017,2017-01-01,{'openalex': 'https://openalex.org/W2582434976...,"{'id': None, 'issn_l': None, 'issn': None, 'di...",book-chapter,"{'is_oa': False, 'oa_status': 'closed', 'oa_ur...",...,"{'In': [0], 'Chapter': [1], '10': [2], 'we': [...",https://api.openalex.org/works?filter=cites:W2...,[],2022-06-30,2017-02-03,Deepak Vohra,,Peace Arch Hospital,CA,{}
1,https://openalex.org/W2598730797,https://doi.org/10.1109/iccnc.2017.7876167,An improved SDN-based fabric for flexible data...,An improved SDN-based fabric for flexible data...,2017,2017-01-01,{'openalex': 'https://openalex.org/W2598730797...,"{'id': None, 'issn_l': None, 'issn': None, 'di...",proceedings-article,"{'is_oa': False, 'oa_status': 'closed', 'oa_ur...",...,"{'Data': [0], 'centers': [1], 'play': [2], 'an...",https://api.openalex.org/works?filter=cites:W2...,"[{'year': 2020, 'cited_by_count': 1}]",2022-08-22T07:21:54.950801,2017-04-07,Wei Hou,,University of Ottawa,CA,{}
2,https://openalex.org/W2523535834,https://doi.org/10.1007/978-3-319-42304-3_1,Power-Shaping Configurable Microprocessors for...,Power-Shaping Configurable Microprocessors for...,2017,2017-01-01,{'openalex': 'https://openalex.org/W2523535834...,"{'id': None, 'issn_l': None, 'issn': None, 'di...",book-chapter,"{'is_oa': False, 'oa_status': 'closed', 'oa_ur...",...,"{'The': [0], '“Internet': [1], 'of': [2, 8, 52...",https://api.openalex.org/works?filter=cites:W2...,[],2022-08-18T08:44:40.666311,2016-09-30,Fabio Campi,,Simon Fraser University,CA,{}
3,https://openalex.org/W2940082676,,Les contours de l'organisation du travail soum...,Les contours de l'organisation du travail soum...,2017,2017-01-01,{'openalex': 'https://openalex.org/W2940082676...,"{'id': None, 'issn_l': None, 'issn': None, 'di...",,"{'is_oa': False, 'oa_status': None, 'oa_url': ...",...,,https://api.openalex.org/works?filter=cites:W2...,[],2022-08-07T09:01:08.519143,2019-04-25,Marc Nihoul,,Université Laval Faculty of Law,CA,{}
4,https://openalex.org/W2732588845,https://doi.org/10.1161/strokeaha.117.017622,In-Patient Code Stroke,In-Patient Code Stroke,2017,2017-01-01,{'openalex': 'https://openalex.org/W2732588845...,"{'id': 'https://openalex.org/V62532593', 'issn...",journal-article,"{'is_oa': True, 'oa_status': 'bronze', 'oa_url...",...,"{'Stroke': [0], 'is': [1], 'a': [2], 'relative...",https://api.openalex.org/works?filter=cites:W2...,"[{'year': 2022, 'cited_by_count': 5}, {'year':...",2022-08-30T10:39:20.384673,2017-07-14,Charles D. Kassardjian,https://orcid.org/0000-0001-7117-8787,University of Toronto,CA,{}


In [22]:
df_conc_A_first_auth = pd.read_csv(dataDir + 'works_' + country_code + '_first_auth/' + 'concepts_by_pub_' + country_code + '_first_auth_since2017.csv')
df_conc_A_first_auth

Unnamed: 0,pub_id,concept,concept_id,concept_level
0,https://openalex.org/W2963641747,political science,https://openalex.org/C17744445,0
1,https://openalex.org/W2784174820,political science,https://openalex.org/C17744445,0
2,https://openalex.org/W3042865676,sociology,https://openalex.org/C144024400,0
3,https://openalex.org/W2783557752,medicine,https://openalex.org/C71924100,0
4,https://openalex.org/W3042865676,psychology,https://openalex.org/C15744967,0
...,...,...,...,...
3282464,https://openalex.org/W3048970145,metapneumovirus,https://openalex.org/C2911218186,5
3282465,https://openalex.org/W3112118800,cell-free fetal dna,https://openalex.org/C152110520,5
3282466,https://openalex.org/W3083927243,transimpedance amplifier,https://openalex.org/C92631468,5
3282467,https://openalex.org/W2990774577,autotransformer,https://openalex.org/C22958824,5


In [23]:
df_conc_B_first_auth = pd.read_csv(dataDir + 'works_' + country_code_collab + '_first_auth/' + 'concepts_by_pub_' + country_code_collab + '_first_auth_since2017.csv')
df_conc_B_first_auth

Unnamed: 0,pub_id,concept,concept_id,concept_level
0,https://openalex.org/W3039684518,sociology,https://openalex.org/C144024400,0
1,https://openalex.org/W2807818210,medicine,https://openalex.org/C71924100,0
2,https://openalex.org/W2794919000,medicine,https://openalex.org/C71924100,0
3,https://openalex.org/W2807748811,medicine,https://openalex.org/C71924100,0
4,https://openalex.org/W3132079119,business,https://openalex.org/C144133560,0
...,...,...,...,...
3899125,https://openalex.org/W2914470466,audit evidence,https://openalex.org/C156223087,5
3899126,https://openalex.org/W3083331412,severe acute respiratory syndrome coronavirus ...,https://openalex.org/C3007834351,5
3899127,https://openalex.org/W4238312086,rna polymerase ii,https://openalex.org/C64350747,5
3899128,https://openalex.org/W2943641736,ice tongue,https://openalex.org/C24474704,5


#  

#### Specify a region of interest (if necessary):

In [24]:
reg_name = 'IE University'

if reg_name != '':
    dataDir_reg = dataDir+'works_' + country_code + '_first_auth/' + country_code_collab + '_col/Regional/' + reg_name + '/'
    os.makedirs(dataDir_reg, exist_ok=True)
else:
    dataDir_reg = dataDir+'works_' + country_code + '_first_auth/' + country_code_collab + '_col/'
    reg_name = country_code

Search for institutions:

In [25]:
inst_search_terms = [    
    'IE'
    #Madrid
    
    #Barcelona
    #'Barc',
    #'Cata'
    
    #Scotland
    #'Scot',
    #'St Andrews',
    #'Edin',
    #'Glasg',
    #'Aberdeen',
    #'Strathclyde',
    #'Dundee'
]

df_insts, df_pubs_reg = search_institutions(df_pubs=df_pubs_A_first_auth, inst_search_terms=inst_search_terms)
reg_insts = df_insts['inst_search'].unique()
reg_insts

array(['IE University'], dtype=object)

In [26]:
len(reg_insts)

1

#### What are the top institutions in the region?

In [27]:
top_institutions(df_pubs=df_pubs_reg, reg_name=reg_name, top=10)

Unnamed: 0_level_0,by citations,by publications
Top-10 research institutions in ES,Unnamed: 1_level_1,Unnamed: 2_level_1
University of Barcelona,101716,14966
University of Granada,78646,13679
Autonomous University of Barcelona,77377,13940
Complutense University of Madrid,75243,16730
University of the Basque Country,71251,10999
University of Valencia,66985,13342
University of Seville,60228,12576
Autonomous University of Madrid,57436,10015
Universitat Politècnica de València,48741,9323
Universitat Politècnica de Catalunya,45785,7894


#### Who are the top researchers in the region, overall?

In [28]:
top_researchers(df_pubs=df_pubs_reg, reg_name=reg_name, top=10)

Unnamed: 0_level_0,by citations,by publications,institution,orcid
Top-10 academics in ES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alfonso J. Cruz-Jentoft,4045,13,Hospital Universitario Ramón y Cajal,https://orcid.org/0000-0001-7628-4861
Luis Paz-Ares,3503,50,Centro Nacional de Investigaciones Cardiovascu...,https://orcid.org/0000-0002-1947-3364
Lichen Liu,3277,19,Instituto de Tecnología Química,https://orcid.org/0000-0001-5067-0481
Daniele V.F. Tauriello,2709,5,Institute for Research in Biomedicine,https://orcid.org/0000-0003-1522-3496
Jordi Bruix,2456,12,Hospital Clínic de Barcelona,https://orcid.org/0000-0002-9826-0753
Julio Rozas,2149,1,University of Barcelona,https://orcid.org/0000-0002-6839-9148
Heng Zhang,1974,18,CIC energigune,https://orcid.org/0000-0002-8811-6336
Jeffrey V. Lazarus,1942,45,Barcelona Institute for Global Health,https://orcid.org/0000-0001-9618-2299
Eduard Batlle,1914,2,Institució Catalana de Recerca i Estudis Avançats,https://orcid.org/0000-0003-2422-0326
Joan B. Soriano,1766,35,Autonomous University of Madrid,https://orcid.org/0000-0001-9740-2994


#### Who are the top collaborators between the region of interest and the other country of interest, overall?

In [29]:
top_collab(df_pubs=df_pubs_reg, reg_name=reg_name, country_code_collab=country_code_collab, dataDir_save=dataDir_reg, top=10)

3740 unique papers retrieved.


Unnamed: 0_level_0,by citations,by publications,institution,orcid
Top-10 academics in ES that collaborated with CA researchers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Luis Paz-Ares,2197,8,Centro Nacional de Investigaciones Cardiovascu...,https://orcid.org/0000-0002-1947-3364
Xavier Montalban,928,2,Vall d'Hebron Hospital Universitari,https://orcid.org/0000-0002-0098-9918
Unai Pascual,764,2,University of the Basque Country,https://orcid.org/0000-0002-5696-236X
Antonio González-Martín,674,3,Clinica Universidad de Navarra,https://orcid.org/0000-0001-8376-9576
Aurelio Tobias,431,1,Institute of Environmental Assessment and Wate...,https://orcid.org/0000-0001-6428-6755
Joan B. Soriano,378,2,Autonomous University of Madrid,https://orcid.org/0000-0001-9740-2994
Elena Sánchez-López,344,1,University of Barcelona,https://orcid.org/0000-0001-8546-766X
Miguel Martin,323,1,Hospital General Universitario Gregorio Marañón,https://orcid.org/0000-0001-6156-0739
Carol Moreno,310,2,Autonomous University of Barcelona,https://orcid.org/0000-0003-3275-0271
Julián Panés,262,4,Centro de Investigación Biomédica en Red de En...,https://orcid.org/0000-0002-4971-6902


In [30]:
top_collab_inv(df_pubs=df_pubs_B_first_auth, reg_name=reg_name, reg_insts=reg_insts, country_code_collab=country_code_collab, dataDir_save=dataDir_reg, top=10)

3504 unique papers retrieved.


Unnamed: 0_level_0,by citations,by publications,institution,orcid
Top-10 academics in CA that collaborated with ES researchers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gregory B. Gloor,1990,2,Western University,https://orcid.org/0000-0001-5803-3380
Yash Patel,1280,46,Holland Bloorview Kids Rehabilitation Hospital,
Josep Rodés-Cabau,1238,19,Université Laval,https://orcid.org/0000-0001-8237-7095
Deborah J. Cook,1186,2,McMaster University,https://orcid.org/0000-0002-4087-543X
Jacob Biamonte,1169,1,University of Waterloo,https://orcid.org/0000-0002-0590-3327
Sina M. Adl,1106,2,University of Saskatchewan,https://orcid.org/0000-0001-6324-6065
Jean-Claude Tardif,1083,2,Montreal Heart Institute,https://orcid.org/0000-0002-8200-8983
Deepali Kumar,999,10,University Health Network,https://orcid.org/0000-0003-1961-0477
Denice S. Feig,943,5,University of Toronto,https://orcid.org/0000-0001-8561-7584
Jan Brozek,753,3,McMaster University,https://orcid.org/0000-0002-3122-0773


#  

#### Search for the top researchers and collaborations, for a particular subject by keywords:

In [32]:
search_query = 'marketing'

Region of interest from country A first author, country B collaborator:

In [33]:
df_search = search(search_query=search_query, df_concepts=df_conc_A_first_auth, df_pubs=df_pubs_reg)

['marketing']
['marketing']
50 papers retrieved...


In [34]:
df_search.head()

Unnamed: 0,first_author,first_auth_inst,concept,title,publication_date,cited_by_count,collaborators_of_interest,first_auth_orcid,id
0,Mikko Ketokivi,IE University,marketing,Why locate manufacturing in a high-cost countr...,2017-03-01,66,{},https://orcid.org/0000-0003-4510-4949,https://openalex.org/W2580589104
1,Elena Revilla,IE University,marketing,Bulding ambidexterity through creativity mecha...,2018-11-01,28,{},https://orcid.org/0000-0002-9901-8707,https://openalex.org/W2805812250
2,Stefanie Beninger,IE University,marketing,Collective market shaping by competitors and i...,2021-01-01,16,"June N.P. Francis (Simon Fraser University, ht...",https://orcid.org/0000-0002-6956-7625,https://openalex.org/W3087471450
3,Rosario Silva,IE University,marketing,Corporate brand and hotel performance: A resou...,2017-10-01,13,{},,https://openalex.org/W2620514704
4,Serghei Musaji,IE University,marketing,How Long Does It Take to Get to the Learning C...,2020-02-13,11,{},,https://openalex.org/W2913295851


In [35]:
write_search_result(search_query=search_query, df_result=df_search, reg_name=reg_name, country_code_collab=country_code_collab, dataDir_save=dataDir_reg, inv=False)

In [36]:
top_institutions(df_pubs=df_search, reg_name=reg_name, top=10, search_query=search_query)

Unnamed: 0_level_0,by citations,by publications
Top-10 research institutions in ES for 'quantum comp',Unnamed: 1_level_1,Unnamed: 2_level_1
Institute of Photonic Sciences,533,15
University of the Basque Country,376,24
Institute for High Energy Physics,350,1
University of Valencia,316,7
Universidade de Vigo,277,4
Barcelona Supercomputing Center,219,7
Donostia International Physics Center,167,3
Autonomous University of Madrid,143,8
University of A Coruña,115,7
Technical University of Madrid,75,19


In [37]:
top_researchers(df_pubs=df_search, reg_name=reg_name, top=10, search_query=search_query)

Unnamed: 0_level_0,by citations,by publications,institution,orcid
Top-10 academics in ES for 'quantum comp',Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
P. Forn-Díaz,378,2,Institute for High Energy Physics,https://orcid.org/0000-0003-4365-5157
Alejandro Gaita-Ariño,279,1,University of Valencia,https://orcid.org/0000-0002-1600-8627
Miguel Herrero-Collantes,271,1,Universidade de Vigo,
Antonio Acín,245,1,Institute of Photonic Sciences,https://orcid.org/0000-0002-1355-3435
Roman Orus,167,3,Donostia International Physics Center,https://orcid.org/0000-0002-4496-8115
Lucas Lamata,158,6,University of the Basque Country,https://orcid.org/0000-0002-9504-8685
Tiago M. Fernández-Caramés,110,2,University of A Coruña,https://orcid.org/0000-0003-2179-5917
Elsa Prada,109,1,Autonomous University of Madrid,https://orcid.org/0000-0001-7522-4795
Adrián Pérez-Salinas,98,2,Barcelona Supercomputing Center,https://orcid.org/0000-0001-5430-6468
Nicolas Maring,80,1,Institute of Photonic Sciences,https://orcid.org/0000-0001-7915-9986


In [38]:
df_col = df_search.loc[df_search['collaborators_of_interest'] != '{}']
top_collab_summary_table(df_col=df_col, df_pubs=df_search, search_query=search_query, top=10, inv=False)

Unnamed: 0_level_0,by citations,by publications,institution,orcid
"Top-5 academics in ES that collaborated with CA researchers, on 'quantum comp'",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ding Liu,49,1,Institute of Photonic Sciences,https://orcid.org/0000-0001-7688-8200
P. Forn-Díaz,28,1,Institute for High Energy Physics,https://orcid.org/0000-0003-4365-5157
Juan Carlos Garcia-Escartin,5,1,University of Valladolid,https://orcid.org/0000-0002-9813-5004
M. Mochol-Grzelak,4,1,Institute of Photonic Sciences,
Lian-Ao Wu,1,1,University of the Basque Country,https://orcid.org/0000-0003-4896-6958


Country B first author, region of interest from Country A collaborator:

In [39]:
df_search = search(search_query=search_query, df_concepts=df_conc_B_first_auth, df_pubs=df_pubs_B_first_auth, inv=True, reg_insts=reg_insts)

['quantum comp']
['quantum comp']
305 papers retrieved...


In [40]:
write_search_result(search_query=search_query, df_result=df_search, reg_name=reg_name, country_code_collab=country_code_collab, dataDir_save=dataDir_reg, inv=True)

In [41]:
df_col = df_search.loc[df_search['collaborators_of_interest'] != '{}']
top_collab_summary_table(df_col=df_col, df_pubs=df_search, search_query=search_query, top=10, inv=True)

Unnamed: 0_level_0,by citations,by publications,institution,orcid
"Top-7 academics in CA that collaborated with ES researchers, on 'quantum comp'",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jacob Biamonte,1169,1,University of Waterloo,https://orcid.org/0000-0002-0590-3327
Scott Ryall,28,1,Hospital for Sick Children,https://orcid.org/0000-0002-5327-3892
Samuel Boutin,25,1,Université de Sherbrooke,https://orcid.org/0000-0003-3819-6830
Danny Paulson,19,1,University of Waterloo,https://orcid.org/0000-0003-2522-4046
Angus Lowe,10,1,University of Waterloo,
Javad Pourahmadazar,4,1,Institut National de la Recherche Scientifique,https://orcid.org/0000-0002-2121-6965
Zi-Wen Liu,2,1,Perimeter Institute,https://orcid.org/0000-0002-3402-9763
