In [1]:
import pandas as pd
import pyterrier as pt
import os
import requests
import json
import re
import csv
import numpy as np

In [2]:
if not pt.started():
  pt.init()

PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
dataset = pt.datasets.get_dataset('irds:cord19/trec-covid')
pt_index_path = './indices/cord19'

if not os.path.exists(pt_index_path + "/data.properties"):
  indexer = pt.index.IterDictIndexer(pt_index_path, blocks = True)
  index_ref = indexer.index(dataset.get_corpus_iter(),
                            fields = ['title', 'doi', 'abstract'],
                            meta = ('docno',))
else:
    index_ref = pt.IndexRef.of(pt_index_path + "/data.properties")

index = pt.IndexFactory.of(index_ref)

In [4]:
metadata = pd.read_csv('~/.ir_datasets/cord19/2020-07-16/metadata.csv')
splitted_authors = pd.read_csv('./split_authors.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
len(metadata)

192509

In [6]:
metadata.head(5)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636.0,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967.0,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972.0,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871.0,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,PMC,Gene expression in epithelial cells in respons...,10.1186/rr61,PMC59580,11686888.0,no-cc,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,,,,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,


Dropping empty autors

In [7]:
metadata = metadata[metadata['authors'].notna()]
metadata = metadata.reset_index()
metadata = metadata.drop(columns = 'index')

In [8]:
len(metadata)

186032

## Splitting the authors into single columns

A function to split the author dict up into single columns. To make the retrieval process possible, commas, double spaces and other characters had to be removed without using a complex regex solution (in case a person hat a accent in their name for example.

In [9]:
def split_authors(n):
    # get first n articles
    author_uid_df = metadata[['authors', 'cord_uid']][:n]
    author_uid_df['authors'] = author_uid_df['authors'].apply(pd.Series)
    # stringsplit authors
    splitted_authors_df = author_uid_df['authors'].str.split(';', expand=True)
    #splitted_authors_df = splitted_authors_df.replace(',','', regex=True)
    # concatenate the splitted authors to our dataframe
    author_uid_df = pd.concat([author_uid_df, splitted_authors_df], axis=1)
    # drop the authors (column with multiple authors)
    author_uid_df.drop(columns='authors')
    # replace the empty column of multiply authors
    #author_title_df = author_title_df.replace('', np.nan).set_index('title')
    author_uid_df = author_uid_df.drop(columns='authors')
    #author_title_df = author_title_df.reset_index()
    author_uid_df = author_uid_df.replace('', np.nan).set_index('cord_uid').stack().reset_index(name='author').drop('level_1',1)
    author_uid_df['author'] = author_uid_df['author'].str.split(',').str[::-1].str.join(' ')
    author_uid_df['author'] = author_uid_df['author'].str.replace('  ', ' ')
    author_uid_df['author'] = author_uid_df['author'].str.lstrip()
    merged_data = metadata[['cord_uid', 'title', 'license', 'publish_time', 'journal']]
    merged_data = author_uid_df.merge(merged_data, on = 'cord_uid')
    return merged_data

### How to work with the API:

Basic request:
https://api.openalex.org/ + work/authors/venues/insitutions/concecpts + ?filter = + columns.search + Name

In [10]:
tariq = requests.get(
    'https://api.openalex.org/authors?filter=display_name.search:Tariq'
).json()['results'][0] 

tariq                      

{'id': 'https://openalex.org/A2993625842',
 'orcid': 'https://orcid.org/0000-0001-7596-4697',
 'display_name': 'Tariq Ahmad',
 'display_name_alternatives': [],
 'relevance_score': 5402.8687,
 'works_count': 417,
 'cited_by_count': 29549,
 'ids': {'openalex': 'https://openalex.org/A2993625842',
  'orcid': 'https://orcid.org/0000-0001-7596-4697',
  'mag': '2993625842'},
 'last_known_institution': {'id': 'https://openalex.org/I32971472',
  'ror': 'https://ror.org/03v76x132',
  'display_name': 'Yale University',
  'country_code': 'US',
  'type': 'education'},
 'x_concepts': [{'id': 'https://openalex.org/C71924100',
   'wikidata': 'https://www.wikidata.org/wiki/Q11190',
   'display_name': 'Medicine',
   'level': 0,
   'score': 105.5},
  {'id': 'https://openalex.org/C126322002',
   'wikidata': 'https://www.wikidata.org/wiki/Q11180',
   'display_name': 'Internal medicine',
   'level': 1,
   'score': 99.7},
  {'id': 'https://openalex.org/C86803240',
   'wikidata': 'https://www.wikidata.org/wik

## Retrieving author information

A function to retrieve all the authors from the metadata's first n documents. It creates a list to call OpenAlex with.

In [11]:
def getAuthors(n_documents):
    author_list = []
    for i in range(n_documents):
        print(metadata['authors'][i])
        if(metadata['authors'][i] != "nan"):#isinstance(i,float)):
            authors_split = metadata['authors'][i].split(";")
            for j in range(len(authors_split)):
                authors_split[j] = re.sub(r'\W+', ' ', authors_split[j])
                #authors_split[j] = authors_split[j].replace('-',' ')
                authors_split[j] = authors_split[j].lstrip()
                author_list.append(authors_split[j])
        else:
            author_list.append("")
    return author_list

In [12]:
authors = getAuthors(50)

Madani, Tariq A; Al-Ghamdi, Aisha A
Vliet, Albert van der; Eiserich, Jason P; Cross, Carroll E
Crouch, Erika C
Fagan, Karen A; McMurtry, Ivan F; Rodman, David M
Domachowske, Joseph B; Bonville, Cynthia A; Rosenberg, Helene F
Pasternak, Alexander O.; van den Born, Erwin; Spaan, Willy J.M.; Snijder, Eric J.
Alvarez, Gonzalo; Hébert, Paul C; Szick, Sharyn
Ball, Jonathan; Venn, Richard
Slebos, Dirk-Jan; Ryter, Stefan W; Choi, Augustine MK
Tsui, Fu-Chiang; Espino, Jeremy U.; Dato, Virginia M.; Gesteland, Per H.; Hutman, Judith; Wagner, Michael M.
Ivanov, Ivaylo P.; Matsufuji, Senya; Murakami, Yasuko; Gesteland, Raymond F.; Atkins, John F.
Shi, Stephanie T.; Huang, Peiyong; Li, Hsin-Pai; Lai, Michael M.C.
Pridgeon, Julia W.; Geetha, Thangiah; Wooten, Marie W.
Ploubidou, Aspasia; Moreau, Violaine; Ashman, Keith; Reckmann, Inge; González, Cayetano; Way, Michael
Barry, John M
Shieh, Biehuoy; Li, Ching
Verheij, Joanne; Groeneveld, AB Johan; Beishuizen, Albertus; Lingen, Arthur van; Simoons-Smit,

Retrieving the data using OpenAlex. If the author is not found, Empty Name is used as dummy.

In [13]:
len(authors)

230

In [14]:
author_df = pd.DataFrame()

for i in range(len(authors)):
    try:
        print('Retrieved:',authors[i])
        author_information = requests.get(
            'https://api.openalex.org/authors?filter=display_name.search:'+authors[i]
        ).json()['results'][0]
        current_author = pd.DataFrame.from_dict(author_information, orient='index')
        current_author = current_author.transpose()
        author_df = author_df.append(current_author)
    except:
        print("Empty name")
    #author_data = author_data.append(pd.json_normalize(author_information)[['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution.display_name', 'x_concepts']])
    #author_data[i] = author_information['x_concepts'][1]['display_name']
    
    #author_data.append(pd.json_normalize(tariq)[['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution.display_name', 'x_concepts']])


Retrieved: Madani Tariq A
Retrieved: Al Ghamdi Aisha A
Retrieved: Vliet Albert van der
Retrieved: Eiserich Jason P
Retrieved: Cross Carroll E
Retrieved: Crouch Erika C
Retrieved: Fagan Karen A
Retrieved: McMurtry Ivan F
Retrieved: Rodman David M
Retrieved: Domachowske Joseph B
Retrieved: Bonville Cynthia A
Retrieved: Rosenberg Helene F
Retrieved: Pasternak Alexander O 
Retrieved: van den Born Erwin
Retrieved: Spaan Willy J M 
Retrieved: Snijder Eric J 
Retrieved: Alvarez Gonzalo
Retrieved: Hébert Paul C
Retrieved: Szick Sharyn
Retrieved: Ball Jonathan
Retrieved: Venn Richard
Retrieved: Slebos Dirk Jan
Retrieved: Ryter Stefan W
Retrieved: Choi Augustine MK
Retrieved: Tsui Fu Chiang
Retrieved: Espino Jeremy U 
Retrieved: Dato Virginia M 
Retrieved: Gesteland Per H 
Retrieved: Hutman Judith
Retrieved: Wagner Michael M 
Retrieved: Ivanov Ivaylo P 
Retrieved: Matsufuji Senya
Retrieved: Murakami Yasuko
Retrieved: Gesteland Raymond F 
Retrieved: Atkins John F 
Retrieved: Shi Stephanie T 
Retr

In [15]:
author_df

Unnamed: 0,id,orcid,display_name,display_name_alternatives,relevance_score,works_count,cited_by_count,ids,last_known_institution,x_concepts,counts_by_year,works_api_url,updated_date,created_date
0,https://openalex.org/A2000431824,https://orcid.org/0000-0003-2453-0623,Tariq A. Madani,[Tariq A. Madani],1559.0482,88,5096,{'openalex': 'https://openalex.org/A2000431824...,"{'id': 'https://openalex.org/I185163786', 'ror...","[{'id': 'https://openalex.org/C71924100', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-19T07:27:46.277091,2016-06-24
0,https://openalex.org/A2181556405,https://orcid.org/0000-0001-8106-0424,Aisha A. Al-Ghamdi,[],256.78207,23,102,{'openalex': 'https://openalex.org/A2181556405...,"{'id': 'https://openalex.org/I185163786', 'ror...","[{'id': 'https://openalex.org/C71924100', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-11-23T15:53:37.889422,2016-06-24
0,https://openalex.org/A2134341530,https://orcid.org/0000-0003-0923-0016,Albert van der Vliet,[],2384.7302,190,9193,{'openalex': 'https://openalex.org/A2134341530...,"{'id': 'https://openalex.org/I111236770', 'ror...","[{'id': 'https://openalex.org/C86803240', 'wik...","[{'year': 2022, 'works_count': 1, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-14T10:26:18.518355,2016-06-24
0,https://openalex.org/A62037741,,Jason P. Eiserich,[],2202.5547,99,7941,"{'openalex': 'https://openalex.org/A62037741',...","{'id': 'https://openalex.org/I84218800', 'ror'...","[{'id': 'https://openalex.org/C86803240', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-17T11:07:38.562541,2016-06-24
0,https://openalex.org/A2098199652,,Carroll E. Cross,[],2456.3755,300,14160,{'openalex': 'https://openalex.org/A2098199652...,"{'id': 'https://openalex.org/I84218800', 'ror'...","[{'id': 'https://openalex.org/C86803240', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-19T00:18:51.300564,2016-06-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,https://openalex.org/A2059785448,,Cécile M. Bensimon,[],486.1169,36,460,{'openalex': 'https://openalex.org/A2059785448...,"{'id': 'https://openalex.org/I185261750', 'ror...","[{'id': 'https://openalex.org/C17744445', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-06T09:29:28.765293,2016-06-24
0,https://openalex.org/A2345666091,https://orcid.org/0000-0003-4630-0108,Mark Bernstein,[],1778.5278,392,10842,{'openalex': 'https://openalex.org/A2345666091...,"{'id': 'https://openalex.org/I1325899441', 'ro...","[{'id': 'https://openalex.org/C71924100', 'wik...","[{'year': 2022, 'works_count': 4, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-18T14:18:29.873852,2016-06-24
0,https://openalex.org/A143159219,,Laura Hawryluck,[],1502.3676,55,4093,{'openalex': 'https://openalex.org/A143159219'...,"{'id': 'https://openalex.org/I1325899441', 'ro...","[{'id': 'https://openalex.org/C71924100', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-18T23:45:17.621425,2016-06-24
0,https://openalex.org/A2063148044,,Randi Zlotnik Shaul,[],969.5727,53,916,{'openalex': 'https://openalex.org/A2063148044...,"{'id': 'https://openalex.org/I2801317318', 'ro...","[{'id': 'https://openalex.org/C71924100', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-03T11:30:41.928514,2016-06-24


Cleaning up the author data: 

1. Defining a dataframe
2. add the extracted information into our dataframe
3. replace special characters in the display name
4. add the x_concepts (most common research field of the author according to openalex)

In [16]:
author_data = pd.DataFrame(columns= [['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution', 'x_concepts']])
author_data = author_df[['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution', 'x_concepts']]

author_data['research_field'] = ""
author_data = author_data.reset_index()
author_data = author_data.drop(columns = 'index')
#author_data['counter'] = range(len(author_data))
#author_data = author_data.drop(columns='index')

#clean display names again

for i in range(len(author_data)):
    author_data['display_name'][i] = re.sub(r'\W+', ' ', author_data['display_name'][i])
    #authors_split[j] = authors_split[j].replace('-',' ')
    author_data['display_name'][i] = author_data['display_name'][i].lstrip()
author_data.display_name = author_data.display_name.str.replace('[^a-zA-Z]', ' ')

for i in range(len(author_data)):
    try:
        author_data.iloc[i, author_data.columns.get_loc('research_field')] = author_data.iloc[i]['x_concepts'][0]['display_name']
    except:
        author_data.iloc[i, author_data.columns.get_loc('research_field')] = "Unknown"
    try:
        author_data.iloc[i, author_data.columns.get_loc('last_known_institution')] = (list(author_data['last_known_institution'])[i]['display_name'])    
    except:
        author_data.iloc[i, author_data.columns.get_loc('last_known_institution')] = "Unknown"
author_data = author_data.drop(columns = ['x_concepts']) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  author_data['research_field'] = ""
  author_data.display_name = author_data.display_name.str.replace('[^a-zA-Z]', ' ')


In [17]:
author_data

Unnamed: 0,display_name,relevance_score,works_count,cited_by_count,last_known_institution,research_field
0,Tariq A Madani,1559.0482,88,5096,King Abdulaziz University,Medicine
1,Aisha A Al Ghamdi,256.78207,23,102,King Abdulaziz University,Medicine
2,Albert van der Vliet,2384.7302,190,9193,University of Vermont,Biology
3,Jason P Eiserich,2202.5547,99,7941,"University of California, Davis",Biology
4,Carroll E Cross,2456.3755,300,14160,"University of California, Davis",Biology
...,...,...,...,...,...,...
219,C cile M Bensimon,486.1169,36,460,University of Toronto,Political science
220,Mark Bernstein,1778.5278,392,10842,University Health Network,Medicine
221,Laura Hawryluck,1502.3676,55,4093,University Health Network,Medicine
222,Randi Zlotnik Shaul,969.5727,53,916,Hospital for Sick Children,Medicine


## Retrieving Insitution Information

Create the institution list from the author data

In [18]:
institutions_list = author_data['last_known_institution'].unique()

Replace commas and spaces at the start to retrieve the documents.

In [19]:
for i in range(len(institutions_list)):
    institutions_list[i] = institutions_list[i].replace(',','')
    institutions_list[i] = institutions_list[i].lstrip()
    print(institutions_list[i])
    #authors_split[j].replace(',','')
     #       authors_split[j] = authors_split[j].lstrip()
      #      author_list.append(authors_split[j])

King Abdulaziz University
University of Vermont
University of California Davis
Washington University in St. Louis
University of South Alabama
ProQR Therapeutics
SUNY Upstate Medical University
National Institute of Allergy and Infectious Diseases
University of Amsterdam
MSD
Leiden University Medical Center
Instituto de Desarrollo Tecnológico para la Industria Química
University of Montreal
Ottawa Hospital
University of Nottingham
Unknown
University Medical Center Groningen
Cornell College
Children's Hospital of Philadelphia
University of Pittsburgh
University of Utah
Medical University of Sofia
Jikei University School of Medicine
Howard Hughes Medical Institute
University College Cork
Pfizer
Chang Gung University
Institute of Molecular Biology Academia Sinica
Agricultural Research Service
Auburn University
European Molecular Biology Laboratory
Inserm
University of Queensland
The Francis Crick Institute
Oregon Medical Research Center
University of Colorado Denver
Bloomberg
VU Amsterdam


Create a dataframe and retrieve the institutions using OpenAlex.

In [20]:
institution_df = pd.DataFrame()

for i in range(len(institutions_list)):
    try:
        print('Retrieved:',institutions_list[i])
        institution_information = requests.get(
            'https://api.openalex.org/institutions?filter=display_name.search:'+institutions_list[i]
        ).json()['results'][0]
        current_institution = pd.DataFrame.from_dict(institution_information, orient='index')
        current_institution = current_institution.transpose()
        institution_df = institution_df.append(current_institution)
    except:
        print("Empty name")
    #author_data = author_data.append(pd.json_normalize(author_information)[['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution.display_name', 'x_concepts']])
    #author_data[i] = author_information['x_concepts'][1]['display_name']
    
    #author_data.append(pd.json_normalize(tariq)[['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution.display_name', 'x_concepts']])


Retrieved: King Abdulaziz University
Retrieved: University of Vermont
Retrieved: University of California Davis
Retrieved: Washington University in St. Louis
Retrieved: University of South Alabama
Retrieved: ProQR Therapeutics
Retrieved: SUNY Upstate Medical University
Retrieved: National Institute of Allergy and Infectious Diseases
Retrieved: University of Amsterdam
Retrieved: MSD
Retrieved: Leiden University Medical Center
Retrieved: Instituto de Desarrollo Tecnológico para la Industria Química
Retrieved: University of Montreal
Retrieved: Ottawa Hospital
Retrieved: University of Nottingham
Retrieved: Unknown
Empty name
Retrieved: University Medical Center Groningen
Retrieved: Cornell College
Retrieved: Children's Hospital of Philadelphia
Retrieved: University of Pittsburgh
Retrieved: University of Utah
Retrieved: Medical University of Sofia
Retrieved: Jikei University School of Medicine
Retrieved: Howard Hughes Medical Institute
Retrieved: University College Cork
Retrieved: Pfizer
Re

Preprocess the institution data by:

1. renaming
2. find out the main research field
3. replacing special characters

In [21]:
institution_data = institution_df[['display_name', 'relevance_score', 'country_code', 'type', 'cited_by_count', 'x_concepts']]
institution_data = institution_data.rename(columns= {'display_name' : 'institution_name', 'relevance_score': 'institution_relevance_score', 'cited_by_count': 'institution_cited_by_count', 'x_concepts' : 'institution_main_research_field'})

# gain knowledge about the institutions main research field
for i in range(len(institution_data)):
    try:
        institution_data.iloc[i, institution_data.columns.get_loc('institution_main_research_field')] = (list(institution_data['institution_main_research_field'])[i][0]['display_name'])    
    except:
        institution_data.iloc[i, institution_data.columns.get_loc('institution_main_research_field')] = "Unknown"

institution_data = institution_data.reset_index()
institution_data = institution_data.drop(columns = 'index')
        
institution_data.institution_name = institution_data.institution_name.str.replace('[^a-zA-Z]', ' ')
institution_data.institution_name = institution_data.institution_name.str.replace('  ',' ')

  institution_data.institution_name = institution_data.institution_name.str.replace('[^a-zA-Z]', ' ')


In [22]:
institution_data

Unnamed: 0,institution_name,institution_relevance_score,country_code,type,institution_cited_by_count,institution_main_research_field
0,King Abdulaziz University,79227.945,SA,education,1829390,Biology
1,University of Vermont,63603.043,US,education,2793275,Biology
2,University of California Davis,212242.7,US,education,15828270,Biology
3,Washington University in St Louis,338272.47,US,education,25461969,Biology
4,University of South Alabama,42624.39,US,education,759868,Biology
...,...,...,...,...,...,...
136,Sunnybrook Health Science Centre,70085.98,CA,healthcare,1266945,Medicine
137,Mount Sinai Health System,13524.845,US,healthcare,72997,Medicine
138,University of Toronto,190424.47,CA,education,24280468,Biology
139,University Health Network,56734.996,CA,healthcare,2475271,Medicine


## Combining the dataframes:

Combining the data was a time consuming task due to smaller errors and mistakes.
1. taking the authors from the metadata and replace their special characters
2. taking the authors from the retrieved authors to gain more knowledge about them
3. compare lost data due to merges (e.g. missing values somewhere or wrong name alignments)
    - 3.1 since the metadata is taken document wise, high amounts of authors could be lost. Then, more have to be retrived
4. merging the data 
5. comparing lost data due to merges in terms of adding the institution
6. merging the data again now containing information about the Article, Authors and Institutions

In [23]:
#replace metadata authors 
metadata_authors = split_authors(50)#len(metadata))
metadata_authors.author = metadata_authors.author.str.replace('[^a-zA-Z]', ' ')
metadata_authors.author = metadata_authors.author.str.replace('  ',' ')
metadata_authors = metadata_authors.rename(columns={'author': 'display_name'})

  author_uid_df = author_uid_df.replace('', np.nan).set_index('cord_uid').stack().reset_index(name='author').drop('level_1',1)
  metadata_authors.author = metadata_authors.author.str.replace('[^a-zA-Z]', ' ')


In [24]:
# renaming columns and replacing special characters e.g. accent
author_data.display_name = author_data.display_name.str.replace('.', '')
author_data.last_known_institution = author_data.last_known_institution.str.replace('[^a-zA-Z]', ' ')
author_data.last_known_institution = author_data.last_known_institution.str.replace('  ',' ')

  author_data.display_name = author_data.display_name.str.replace('.', '')
  author_data.last_known_institution = author_data.last_known_institution.str.replace('[^a-zA-Z]', ' ')


lost data due to name merges:

In [25]:
metadata_authors

Unnamed: 0,cord_uid,display_name,title,license,publish_time,journal
0,ug7v899j,Tariq A Madani,Clinical features of culture-proven Mycoplasma...,no-cc,2001-07-04,BMC Infect Dis
1,ug7v899j,Aisha A Al Ghamdi,Clinical features of culture-proven Mycoplasma...,no-cc,2001-07-04,BMC Infect Dis
2,02tnwd4m,Albert van der Vliet,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res
3,02tnwd4m,Jason P Eiserich,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res
4,02tnwd4m,Carroll E Cross,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res
...,...,...,...,...,...,...
225,58czem0j,C cile M Bensimon,On pandemics and the duty to care: whose duty?...,cc-by,2006-04-20,BMC Med Ethics
226,58czem0j,Mark Bernstein,On pandemics and the duty to care: whose duty?...,cc-by,2006-04-20,BMC Med Ethics
227,58czem0j,Laura Hawryluck,On pandemics and the duty to care: whose duty?...,cc-by,2006-04-20,BMC Med Ethics
228,58czem0j,Randi Zlotnik Shaul,On pandemics and the duty to care: whose duty?...,cc-by,2006-04-20,BMC Med Ethics


In [26]:
author_data

Unnamed: 0,display_name,relevance_score,works_count,cited_by_count,last_known_institution,research_field
0,Tariq A Madani,1559.0482,88,5096,King Abdulaziz University,Medicine
1,Aisha A Al Ghamdi,256.78207,23,102,King Abdulaziz University,Medicine
2,Albert van der Vliet,2384.7302,190,9193,University of Vermont,Biology
3,Jason P Eiserich,2202.5547,99,7941,University of California Davis,Biology
4,Carroll E Cross,2456.3755,300,14160,University of California Davis,Biology
...,...,...,...,...,...,...
219,C cile M Bensimon,486.1169,36,460,University of Toronto,Political science
220,Mark Bernstein,1778.5278,392,10842,University Health Network,Medicine
221,Laura Hawryluck,1502.3676,55,4093,University Health Network,Medicine
222,Randi Zlotnik Shaul,969.5727,53,916,Hospital for Sick Children,Medicine


Lost data due to authors: eventually high if split_authors has a higher value than getAuthors.

In [27]:
list(set(metadata_authors['display_name']) - set(author_data['display_name']))

['Remy Froissart',
 'Clinton L Torres',
 'Reidar Lie',
 'U Arinir',
 'Ming Xu',
 'Hu Zhu',
 'Rob JM Strack van Schijndel',
 'Valeria D Guimaraes',
 'J Martin Pedersen',
 'James Kaysen',
 'Greg Martin',
 'Augustine MK Choi',
 'Cayetano Gonz lez',
 'Daniela A Freitas',
 'Ann Elise O Jordal',
 'TT Bauer',
 'Ching Li',
 'Maric N Oliveira',
 'Yu Hua Ruan',
 'Tom R Slezak',
 'AB Johan Groeneveld',
 'Yi Xu',
 'G Schultze Werninghaus',
 'John Mokili']

In [28]:
#merging our dataframes

combined_data = metadata_authors.merge(author_data, on = 'display_name')
#combined_data2 = combined_data.merge(institution_data, left_on='last_known_institution', right_on = 'institution_name')

In [29]:
combined_data

Unnamed: 0,cord_uid,display_name,title,license,publish_time,journal,relevance_score,works_count,cited_by_count,last_known_institution,research_field
0,ug7v899j,Tariq A Madani,Clinical features of culture-proven Mycoplasma...,no-cc,2001-07-04,BMC Infect Dis,1559.0482,88,5096,King Abdulaziz University,Medicine
1,ug7v899j,Aisha A Al Ghamdi,Clinical features of culture-proven Mycoplasma...,no-cc,2001-07-04,BMC Infect Dis,256.78207,23,102,King Abdulaziz University,Medicine
2,02tnwd4m,Albert van der Vliet,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res,2384.7302,190,9193,University of Vermont,Biology
3,02tnwd4m,Jason P Eiserich,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res,2202.5547,99,7941,University of California Davis,Biology
4,02tnwd4m,Carroll E Cross,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res,2456.3755,300,14160,University of California Davis,Biology
...,...,...,...,...,...,...,...,...,...,...,...
201,58czem0j,C cile M Bensimon,On pandemics and the duty to care: whose duty?...,cc-by,2006-04-20,BMC Med Ethics,486.1169,36,460,University of Toronto,Political science
202,58czem0j,Mark Bernstein,On pandemics and the duty to care: whose duty?...,cc-by,2006-04-20,BMC Med Ethics,1778.5278,392,10842,University Health Network,Medicine
203,58czem0j,Laura Hawryluck,On pandemics and the duty to care: whose duty?...,cc-by,2006-04-20,BMC Med Ethics,1502.3676,55,4093,University Health Network,Medicine
204,58czem0j,Randi Zlotnik Shaul,On pandemics and the duty to care: whose duty?...,cc-by,2006-04-20,BMC Med Ethics,969.5727,53,916,Hospital for Sick Children,Medicine


lost data due to institution:

In [30]:
list(set(combined_data['last_known_institution']) - set(institution_data['institution_name']))

['La Jolla Institute For Allergy  Immunology',
 'Unknown',
 'Sinai Health System',
 'Ottawa Hospital',
 'Xinjiang Technical Institute of Physics  Chemistry']

In [31]:
#institution_data.institution_name = institution_data.institution_name.str.replace('[^a-zA-Z]', ' ')
#institution_data.institution_name = institution_data.institution_name.str.replace('  ',' ')

In [32]:
combined_data2 = combined_data.merge(institution_data, left_on = 'last_known_institution', right_on = 'institution_name')

In [33]:
combined_data2

Unnamed: 0,cord_uid,display_name,title,license,publish_time,journal,relevance_score,works_count,cited_by_count,last_known_institution,research_field,institution_name,institution_relevance_score,country_code,type,institution_cited_by_count,institution_main_research_field
0,ug7v899j,Tariq A Madani,Clinical features of culture-proven Mycoplasma...,no-cc,2001-07-04,BMC Infect Dis,1559.0482,88,5096,King Abdulaziz University,Medicine,King Abdulaziz University,79227.945,SA,education,1829390,Biology
1,ug7v899j,Aisha A Al Ghamdi,Clinical features of culture-proven Mycoplasma...,no-cc,2001-07-04,BMC Infect Dis,256.78207,23,102,King Abdulaziz University,Medicine,King Abdulaziz University,79227.945,SA,education,1829390,Biology
2,02tnwd4m,Albert van der Vliet,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res,2384.7302,190,9193,University of Vermont,Biology,University of Vermont,63603.043,US,education,2793275,Biology
3,02tnwd4m,Jason P Eiserich,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res,2202.5547,99,7941,University of California Davis,Biology,University of California Davis,212242.7,US,education,15828270,Biology
4,02tnwd4m,Carroll E Cross,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res,2456.3755,300,14160,University of California Davis,Biology,University of California Davis,212242.7,US,education,15828270,Biology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,58czem0j,C cile M Bensimon,On pandemics and the duty to care: whose duty?...,cc-by,2006-04-20,BMC Med Ethics,486.1169,36,460,University of Toronto,Political science,University of Toronto,190424.47,CA,education,24280468,Biology
191,58czem0j,Ross EG Upshur,On pandemics and the duty to care: whose duty?...,cc-by,2006-04-20,BMC Med Ethics,,1,0,University of Toronto,Livelihood,University of Toronto,190424.47,CA,education,24280468,Biology
192,58czem0j,Mark Bernstein,On pandemics and the duty to care: whose duty?...,cc-by,2006-04-20,BMC Med Ethics,1778.5278,392,10842,University Health Network,Medicine,University Health Network,56734.996,CA,healthcare,2475271,Medicine
193,58czem0j,Laura Hawryluck,On pandemics and the duty to care: whose duty?...,cc-by,2006-04-20,BMC Med Ethics,1502.3676,55,4093,University Health Network,Medicine,University Health Network,56734.996,CA,healthcare,2475271,Medicine


## Retrieving Journal information

In [34]:
journal_list = combined_data['journal'].unique()

In [35]:
journal_list

array(['BMC Infect Dis', 'Respir Res', 'The EMBO Journal', 'Crit Care',
       'Journal of the American Medical Informatics Association',
       'EMBO J', 'Biol Proced Online', 'J Transl Med', 'Retrovirology',
       'BMC Public Health', 'BMC Genomics',
       'Aust New Zealand Health Policy', 'Microb Cell Fact',
       'Nucleic Acids Res', 'J Biomed Biotechnol', 'BMC Gastroenterol',
       'PLoS Biol', 'Int J Health Geogr', 'Virol J', 'Immunome Res',
       'BMC Mol Biol', 'EMBO reports', 'PLoS Med', 'BMC Biotechnol',
       'Harm Reduct J', 'Nat Med', 'BMC Med Ethics'], dtype=object)

In [36]:
journal_df = pd.DataFrame()

for i in range(len(journal_list)):
    try:
        print(journal_list[i])
        journal_information = requests.get(
            'https://api.openalex.org/venues?filter=display_name.search:'+journal_list[i]
        ).json()['results'][0]
        current_journal = pd.DataFrame.from_dict(journal_information, orient='index')
        current_journal = current_journal.transpose()
        journal_df = journal_df.append(current_journal)
    except:
        print("Empty name")
    #author_data = author_data.append(pd.json_normalize(author_information)[['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution.display_name', 'x_concepts']])
    #author_data[i] = author_information['x_concepts'][1]['display_name']
    
    #author_data.append(pd.json_normalize(tariq)[['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution.display_name', 'x_concepts']])


BMC Infect Dis
Empty name
Respir Res
Empty name
The EMBO Journal
Crit Care
Journal of the American Medical Informatics Association
EMBO J
Empty name
Biol Proced Online
Empty name
J Transl Med
Empty name
Retrovirology
BMC Public Health
BMC Genomics
Aust New Zealand Health Policy
Empty name
Microb Cell Fact
Empty name
Nucleic Acids Res
Empty name
J Biomed Biotechnol
Empty name
BMC Gastroenterol
Empty name
PLoS Biol
Empty name
Int J Health Geogr
Empty name
Virol J
Immunome Res
Empty name
BMC Mol Biol
Empty name
EMBO reports
PLoS Med
Empty name
BMC Biotechnol
Empty name
Harm Reduct J
Empty name
Nat Med
Empty name
BMC Med Ethics
Empty name


In [37]:
journal_df.head()

Unnamed: 0,id,issn_l,issn,display_name,publisher,relevance_score,works_count,cited_by_count,is_oa,is_in_doaj,...,country_code,societies,alternate_titles,abbreviated_title,type,x_concepts,counts_by_year,works_api_url,updated_date,created_date
0,https://openalex.org/V127916151,0261-4189,"[1460-2075, 0261-4189]",The EMBO Journal,EMBO,87910.56,20945,2655687,False,False,...,DE,[],[European Molecular Biology Organization journ...,,journal,"[{'id': 'https://openalex.org/C86803240', 'wik...","[{'year': 2022, 'works_count': 256, 'cited_by_...",https://api.openalex.org/works?filter=host_ven...,2022-12-19T23:50:04.502320,2016-06-24
0,https://openalex.org/V4306506051,,,Chin Crit Care Med,,3061.381,3090,3098,,,...,,[],[],,journal,"[{'id': 'https://openalex.org/C41008148', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=host_ven...,2022-12-19T19:31:47.583283,2022-10-17
0,https://openalex.org/V129839026,1067-5027,"[1067-5027, 1527-974X]",Journal of the American Medical Informatics As...,Oxford University Press,30058.037,4383,199861,False,False,...,GB,[],[JAMIA],,journal,"[{'id': 'https://openalex.org/C41008148', 'wik...","[{'year': 2022, 'works_count': 247, 'cited_by_...",https://api.openalex.org/works?filter=host_ven...,2022-12-20T06:27:11.041242,2016-06-24
0,https://openalex.org/V54115554,1742-4690,[1742-4690],Retrovirology,Springer Nature,11869.897,4683,59152,True,True,...,GB,"[{'url': 'https://www.asv.org/', 'organization...",[],,journal,"[{'id': 'https://openalex.org/C71924100', 'wik...","[{'year': 2022, 'works_count': 26, 'cited_by_c...",https://api.openalex.org/works?filter=host_ven...,2022-12-19T21:14:44.048077,2016-06-24
0,https://openalex.org/V200437886,1471-2458,[1471-2458],BMC Public Health,Springer Nature,44234.445,20950,516088,True,True,...,GB,[],"[BioMed Central public health, Public health]",,journal,"[{'id': 'https://openalex.org/C71924100', 'wik...","[{'year': 2022, 'works_count': 2237, 'cited_by...",https://api.openalex.org/works?filter=host_ven...,2022-12-19T21:36:19.423618,2016-06-24


In [38]:
journal_data = journal_df[['issn', 'display_name', 'publisher', 'relevance_score', 'cited_by_count', 'cited_by_count','x_concepts']]
journal_data = journal_data.rename(columns = {'display_name':'journal_display_name', 'publisher': 'journal_publisher', 'relevance_score': 'journal_relevance_score', 'cited_by_count' : 'journal_cited_by_count', 'x_concepts' : 'main_research_field'})

In [39]:
journal_data

Unnamed: 0,issn,journal_display_name,journal_publisher,journal_relevance_score,journal_cited_by_count,journal_cited_by_count.1,main_research_field
0,"[1460-2075, 0261-4189]",The EMBO Journal,EMBO,87910.56,2655687,2655687,"[{'id': 'https://openalex.org/C86803240', 'wik..."
0,,Chin Crit Care Med,,3061.381,3098,3098,"[{'id': 'https://openalex.org/C41008148', 'wik..."
0,"[1067-5027, 1527-974X]",Journal of the American Medical Informatics As...,Oxford University Press,30058.037,199861,199861,"[{'id': 'https://openalex.org/C41008148', 'wik..."
0,[1742-4690],Retrovirology,Springer Nature,11869.897,59152,59152,"[{'id': 'https://openalex.org/C71924100', 'wik..."
0,[1471-2458],BMC Public Health,Springer Nature,44234.445,516088,516088,"[{'id': 'https://openalex.org/C71924100', 'wik..."
0,[1471-2164],BMC Genomics,Springer Nature,44381.37,543264,543264,"[{'id': 'https://openalex.org/C86803240', 'wik..."
0,,Int J Virol,,391.1534,319,319,"[{'id': 'https://openalex.org/C71924100', 'wik..."
0,"[1469-221X, 1469-3178]",EMBO Reports,EMBO,33726.918,270058,270058,"[{'id': 'https://openalex.org/C86803240', 'wik..."


In [40]:
for i in range(len(journal_data)):
    try:
        journal_data.iloc[i, journal_data.columns.get_loc('main_research_field')] = (list(journal_data['main_research_field'])[i][0]['display_name'])    
    except:
        journal_data.iloc[i, journal_data.columns.get_loc('main_research_field')] = "Unknown"

Merging the journal data now to our complete dataframe

In [41]:
# renaming columns and replacing special characters e.g. accent
journal_data.journal_display_name = journal_data.journal_display_name.str.replace('.', '')
journal_data.journal_display_name = journal_data.journal_display_name.str.replace('[^a-zA-Z]', ' ')
journal_data.journal_display_name = journal_data.journal_display_name.str.replace('  ',' ')

  journal_data.journal_display_name = journal_data.journal_display_name.str.replace('.', '')
  journal_data.journal_display_name = journal_data.journal_display_name.str.replace('[^a-zA-Z]', ' ')


In [42]:
list(set(combined_data2['journal']) - set(journal_data['journal_display_name']))

['PLoS Med',
 'J Biomed Biotechnol',
 'Virol J',
 'Respir Res',
 'Crit Care',
 'Biol Proced Online',
 'PLoS Biol',
 'Nat Med',
 'Aust New Zealand Health Policy',
 'BMC Biotechnol',
 'BMC Med Ethics',
 'Immunome Res',
 'Nucleic Acids Res',
 'EMBO J',
 'BMC Gastroenterol',
 'J Transl Med',
 'EMBO reports',
 'BMC Infect Dis',
 'BMC Mol Biol',
 'Int J Health Geogr',
 'Microb Cell Fact',
 'Harm Reduct J']

In [43]:
combined_data3 = combined_data2.merge(journal_data, left_on = 'journal', right_on = 'journal_display_name')

### Our completed Dataframe consisting of: 



In [44]:
combined_data3

Unnamed: 0,cord_uid,display_name,title,license,publish_time,journal,relevance_score,works_count,cited_by_count,last_known_institution,...,type,institution_cited_by_count,institution_main_research_field,issn,journal_display_name,journal_publisher,journal_relevance_score,journal_cited_by_count,journal_cited_by_count.1,main_research_field
0,zjufx4fo,Alexander O Pasternak,Sequence requirements for RNA strand transfer ...,green-oa,2001-12-17,The EMBO Journal,754.8022,59,1250,University of Amsterdam,...,education,10296107,Biology,"[1460-2075, 0261-4189]",The EMBO Journal,EMBO,87910.56,2655687,2655687,Biology
1,zjufx4fo,Erwin van den Born,Sequence requirements for RNA strand transfer ...,green-oa,2001-12-17,The EMBO Journal,610.2445,17,525,MSD,...,company,4379798,Biology,"[1460-2075, 0261-4189]",The EMBO Journal,EMBO,87910.56,2655687,2655687,Biology
2,zjufx4fo,Willy J M Spaan,Sequence requirements for RNA strand transfer ...,green-oa,2001-12-17,The EMBO Journal,1875.467,149,6336,Leiden University Medical Center,...,healthcare,5546371,Medicine,"[1460-2075, 0261-4189]",The EMBO Journal,EMBO,87910.56,2655687,2655687,Biology
3,zjufx4fo,Eric J Snijder,Sequence requirements for RNA strand transfer ...,green-oa,2001-12-17,The EMBO Journal,2510.1736,284,13532,Leiden University Medical Center,...,healthcare,5546371,Medicine,"[1460-2075, 0261-4189]",The EMBO Journal,EMBO,87910.56,2655687,2655687,Biology
4,6lvn10f4,Peiyong Huang,Heterogeneous nuclear ribonucleoprotein A1 reg...,green-oa,2000-09-01,The EMBO Journal,263.47144,3,167,Howard Hughes Medical Institute,...,nonprofit,18755284,Biology,"[1460-2075, 0261-4189]",The EMBO Journal,EMBO,87910.56,2655687,2655687,Biology
5,6lvn10f4,Stephanie T Shi,Heterogeneous nuclear ribonucleoprotein A1 reg...,green-oa,2000-09-01,The EMBO Journal,755.40283,20,1937,Pfizer,...,company,3172677,Biology,"[1460-2075, 0261-4189]",The EMBO Journal,EMBO,87910.56,2655687,2655687,Biology
6,6lvn10f4,Hsin Pai Li,Heterogeneous nuclear ribonucleoprotein A1 reg...,green-oa,2000-09-01,The EMBO Journal,795.6522,42,1399,Chang Gung University,...,education,1722830,Medicine,"[1460-2075, 0261-4189]",The EMBO Journal,EMBO,87910.56,2655687,2655687,Biology
7,6lvn10f4,Michael M C Lai,Heterogeneous nuclear ribonucleoprotein A1 reg...,green-oa,2000-09-01,The EMBO Journal,1765.2999,238,11555,Institute of Molecular Biology Academia Sinica,...,facility,191453,Biology,"[1460-2075, 0261-4189]",The EMBO Journal,EMBO,87910.56,2655687,2655687,Biology
8,mcuixluu,Aspasia Ploubidou,Vaccinia virus infection disrupts microtubule ...,green-oa,2000-08-01,The EMBO Journal,713.26697,4,552,European Molecular Biology Laboratory,...,government,1144706,Biology,"[1460-2075, 0261-4189]",The EMBO Journal,EMBO,87910.56,2655687,2655687,Biology
9,mcuixluu,Inge Reckmann,Vaccinia virus infection disrupts microtubule ...,green-oa,2000-08-01,The EMBO Journal,860.9829,12,1117,European Molecular Biology Laboratory,...,government,1144706,Biology,"[1460-2075, 0261-4189]",The EMBO Journal,EMBO,87910.56,2655687,2655687,Biology


In [45]:
combined_data3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37 entries, 0 to 36
Data columns (total 24 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   cord_uid                         37 non-null     object
 1   display_name                     37 non-null     object
 2   title                            37 non-null     object
 3   license                          37 non-null     object
 4   publish_time                     37 non-null     object
 5   journal                          37 non-null     object
 6   relevance_score                  37 non-null     object
 7   works_count                      37 non-null     object
 8   cited_by_count                   37 non-null     object
 9   last_known_institution           37 non-null     object
 10  research_field                   37 non-null     object
 11  institution_name                 37 non-null     object
 12  institution_relevance_score      37 no