In [1]:
import pandas as pd
import pyterrier as pt
import os
import requests
import json
import re
import csv
import numpy as np

In [2]:
if not pt.started():
  pt.init()

PyTerrier 0.9.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
dataset = pt.datasets.get_dataset('irds:cord19/trec-covid')
pt_index_path = './indices/cord19'

if not os.path.exists(pt_index_path + "/data.properties"):
  indexer = pt.index.IterDictIndexer(pt_index_path, blocks = True)
  index_ref = indexer.index(dataset.get_corpus_iter(),
                            fields = ['title', 'doi', 'abstract'],
                            meta = ('docno',))
else:
    index_ref = pt.IndexRef.of(pt_index_path + "/data.properties")

index = pt.IndexFactory.of(index_ref)

In [4]:
metadata = pd.read_csv('~/.ir_datasets/cord19/2020-07-16/metadata.csv')
#splitted_authors = pd.read_csv('./split_authors.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
len(metadata)

192509

In [6]:
metadata.head(5)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636.0,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967.0,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972.0,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871.0,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,PMC,Gene expression in epithelial cells in respons...,10.1186/rr61,PMC59580,11686888.0,no-cc,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,,,,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,


Dropping empty autors

In [7]:
metadata = metadata[metadata['authors'].notna()]
metadata = metadata.reset_index()
metadata = metadata.drop(columns = 'index')

In [8]:
len(metadata)

186032

## Splitting the authors into single columns

A function to split the author dict up into single columns. To make the retrieval process possible, commas, double spaces and other characters had to be removed without using a complex regex solution (in case a person hat a accent in their name for example.

In [9]:
def split_authors(n):
    # get first n articles
    author_uid_df = metadata[['authors', 'cord_uid']][:n]
    author_uid_df['authors'] = author_uid_df['authors'].apply(pd.Series)
    # stringsplit authors
    splitted_authors_df = author_uid_df['authors'].str.split(';', expand=True)
    author_uid_df = pd.concat([author_uid_df, splitted_authors_df], axis=1)
    # drop the authors (column with multiple authors)
    author_uid_df.drop(columns='authors')
    # replace the empty column of multiply authors
    #author_title_df = author_title_df.replace('', np.nan).set_index('title')
    author_uid_df = author_uid_df.drop(columns='authors')
    author_uid_df = author_uid_df.replace('', np.nan).set_index('cord_uid').stack().reset_index(name='author').drop('level_1',1)
    author_uid_df['author'] = author_uid_df['author'].str.split(',').str[::-1].str.join(' ')

    
    return author_uid_df

In [10]:
def split_authors(n):
    # get first n articles
    author_uid_df = metadata[['authors', 'cord_uid']][:n]
    author_uid_df['authors'] = author_uid_df['authors'].apply(pd.Series)
    # stringsplit authors
    splitted_authors_df = author_uid_df['authors'].str.split(';', expand=True)
    #splitted_authors_df = splitted_authors_df.replace(',','', regex=True)
    # concatenate the splitted authors to our dataframe
    author_uid_df = pd.concat([author_uid_df, splitted_authors_df], axis=1)
    # drop the authors (column with multiple authors)
    author_uid_df.drop(columns='authors')
    # replace the empty column of multiply authors
    #author_title_df = author_title_df.replace('', np.nan).set_index('title')
    author_uid_df = author_uid_df.drop(columns='authors')
    #author_title_df = author_title_df.reset_index()
    author_uid_df = author_uid_df.replace('', np.nan).set_index('cord_uid').stack().reset_index(name='author').drop('level_1',1)
    author_uid_df['author'] = author_uid_df['author'].str.split(',').str[::-1].str.join(' ')
    author_uid_df['author'] = author_uid_df['author'].str.replace('  ', ' ')
    author_uid_df['author'] = author_uid_df['author'].str.lstrip()
    merged_data = metadata[['cord_uid', 'title', 'license', 'publish_time', 'journal']]
    merged_data = author_uid_df.merge(merged_data, on = 'cord_uid')
    return merged_data

### How to work with the API:

Basic request:
https://api.openalex.org/ + work/authors/venues/insitutions/concecpts + ?filter = + columns.search + Name

In [11]:
tariq = requests.get(
    'https://api.openalex.org/authors?filter=display_name.search:Tariq'
).json()['results'][0] 

#tariq                      

## Retrieving author information

A function to retrieve all the authors from the metadata's first n documents. It creates a list to call OpenAlex with.

In [12]:
def getAuthors(n_documents):
    author_list = []
    for i in range(n_documents):
        print(metadata['authors'][i])
        if(metadata['authors'][i] != "nan"):#isinstance(i,float)):
            authors_split = metadata['authors'][i].split(";")
            for j in range(len(authors_split)):
                #authors_split[j] = re.sub(r'\W+', ' ', authors_split[j])
                #authors_split[j] = authors_split[j].replace('-',' ')
                authors_split[j] = authors_split[j].replace(',', '')
                authors_split[j] = authors_split[j].lstrip()
                author_list.append(authors_split[j])
        else:
            author_list.append("")
    return author_list

In [13]:
authors = getAuthors(500)

Madani, Tariq A; Al-Ghamdi, Aisha A
Vliet, Albert van der; Eiserich, Jason P; Cross, Carroll E
Crouch, Erika C
Fagan, Karen A; McMurtry, Ivan F; Rodman, David M
Domachowske, Joseph B; Bonville, Cynthia A; Rosenberg, Helene F
Pasternak, Alexander O.; van den Born, Erwin; Spaan, Willy J.M.; Snijder, Eric J.
Alvarez, Gonzalo; Hébert, Paul C; Szick, Sharyn
Ball, Jonathan; Venn, Richard
Slebos, Dirk-Jan; Ryter, Stefan W; Choi, Augustine MK
Tsui, Fu-Chiang; Espino, Jeremy U.; Dato, Virginia M.; Gesteland, Per H.; Hutman, Judith; Wagner, Michael M.
Ivanov, Ivaylo P.; Matsufuji, Senya; Murakami, Yasuko; Gesteland, Raymond F.; Atkins, John F.
Shi, Stephanie T.; Huang, Peiyong; Li, Hsin-Pai; Lai, Michael M.C.
Pridgeon, Julia W.; Geetha, Thangiah; Wooten, Marie W.
Ploubidou, Aspasia; Moreau, Violaine; Ashman, Keith; Reckmann, Inge; González, Cayetano; Way, Michael
Barry, John M
Shieh, Biehuoy; Li, Ching
Verheij, Joanne; Groeneveld, AB Johan; Beishuizen, Albertus; Lingen, Arthur van; Simoons-Smit,

In [14]:
authors

['Madani Tariq A',
 'Al-Ghamdi Aisha A',
 'Vliet Albert van der',
 'Eiserich Jason P',
 'Cross Carroll E',
 'Crouch Erika C',
 'Fagan Karen A',
 'McMurtry Ivan F',
 'Rodman David M',
 'Domachowske Joseph B',
 'Bonville Cynthia A',
 'Rosenberg Helene F',
 'Pasternak Alexander O.',
 'van den Born Erwin',
 'Spaan Willy J.M.',
 'Snijder Eric J.',
 'Alvarez Gonzalo',
 'Hébert Paul C',
 'Szick Sharyn',
 'Ball Jonathan',
 'Venn Richard',
 'Slebos Dirk-Jan',
 'Ryter Stefan W',
 'Choi Augustine MK',
 'Tsui Fu-Chiang',
 'Espino Jeremy U.',
 'Dato Virginia M.',
 'Gesteland Per H.',
 'Hutman Judith',
 'Wagner Michael M.',
 'Ivanov Ivaylo P.',
 'Matsufuji Senya',
 'Murakami Yasuko',
 'Gesteland Raymond F.',
 'Atkins John F.',
 'Shi Stephanie T.',
 'Huang Peiyong',
 'Li Hsin-Pai',
 'Lai Michael M.C.',
 'Pridgeon Julia W.',
 'Geetha Thangiah',
 'Wooten Marie W.',
 'Ploubidou Aspasia',
 'Moreau Violaine',
 'Ashman Keith',
 'Reckmann Inge',
 'González Cayetano',
 'Way Michael',
 'Barry John M',
 'Shieh

Retrieving the data using OpenAlex. If the author is not found, Empty Name is used as dummy.

In [15]:
len(authors)

2915

In [16]:
author_df = pd.DataFrame()

for i in range(len(authors)):
    try:
        author_information = requests.get(
            'https://api.openalex.org/authors?filter=display_name.search:'+authors[i]
        ).json()['results'][0]
        print('Retrieved:',authors[i])
        current_author = pd.DataFrame.from_dict(author_information, orient='index')
        current_author = current_author.transpose()
        author_df = author_df.append(current_author)
        author_df['display_name'] =  author_df['display_name'].str.lstrip()

        
    except:
        print("Empty name")
    #author_data = author_data.append(pd.json_normalize(author_information)[['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution.display_name', 'x_concepts']])
    #author_data[i] = author_information['x_concepts'][1]['display_name']
    
    #author_data.append(pd.json_normalize(tariq)[['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution.display_name', 'x_concepts']])


Retrieved: Madani Tariq A
Retrieved: Al-Ghamdi Aisha A
Retrieved: Vliet Albert van der
Retrieved: Eiserich Jason P
Retrieved: Cross Carroll E
Retrieved: Crouch Erika C
Retrieved: Fagan Karen A
Retrieved: McMurtry Ivan F
Retrieved: Rodman David M
Retrieved: Domachowske Joseph B
Retrieved: Bonville Cynthia A
Retrieved: Rosenberg Helene F
Retrieved: Pasternak Alexander O.
Retrieved: van den Born Erwin
Retrieved: Spaan Willy J.M.
Retrieved: Snijder Eric J.
Retrieved: Alvarez Gonzalo
Retrieved: Hébert Paul C
Retrieved: Szick Sharyn
Retrieved: Ball Jonathan
Retrieved: Venn Richard
Retrieved: Slebos Dirk-Jan
Retrieved: Ryter Stefan W
Retrieved: Choi Augustine MK
Retrieved: Tsui Fu-Chiang
Retrieved: Espino Jeremy U.
Retrieved: Dato Virginia M.
Retrieved: Gesteland Per H.
Retrieved: Hutman Judith
Retrieved: Wagner Michael M.
Retrieved: Ivanov Ivaylo P.
Retrieved: Matsufuji Senya
Retrieved: Murakami Yasuko
Retrieved: Gesteland Raymond F.
Retrieved: Atkins John F.
Retrieved: Shi Stephanie T.
Retr

Retrieved: Benelli Dario
Retrieved: Londei Paola
Retrieved: Flagiello Angela
Retrieved: Monti Maria
Retrieved: Pucci Piero
Retrieved: Rossi Mosè
Retrieved: Moracci Marco
Retrieved: Jeang Kuan-Teh
Retrieved: Yedavalli Venkat
Retrieved: Huang Jian
Retrieved: Gutteridge Alex
Retrieved: Honda Wataru
Retrieved: Kanehisa Minoru
Retrieved: Armstrong Regina C.
Retrieved: Le Tuan Q.
Retrieved: Flint Nicole C.
Retrieved: Vana Adam C.
Retrieved: Zhou Yong-Xing
Retrieved: Malanoski Anthony P.
Retrieved: Lin Baochuan
Retrieved: Wang Zheng
Retrieved: Schnur Joel M.
Retrieved: Stenger David A.
Retrieved: McCrate Nina E.
Retrieved: Varner Mychel E.
Retrieved: Kim Kenneth I.
Retrieved: Nagan Maria C.
Retrieved: Kutyavin Igor V.
Retrieved: Milesi Dave
Retrieved: Belousov Yevgeniy
Retrieved: Podyminogin Mikhail
Retrieved: Vorobiev Alexei
Retrieved: Gorn Vladimir
Retrieved: Lukhtanov Eugeny A.
Retrieved: Vermeulen Nicolaas M. J.
Retrieved: Mahoney Walt
Retrieved: Thompson Alison K
Retrieved: Faith Karen
R

Retrieved: Ng Vivian
Retrieved: Randolph Adrienne
Retrieved: Cook Deborah J
Retrieved: Sandrock Christian
Retrieved: Kelly Terra
Retrieved: Challen Kirsty
Retrieved: Bentley Andrew
Retrieved: Bright John
Retrieved: Walter Darren
Retrieved: Sarikaya Ozlem
Retrieved: Erbaydar Tugrul
Retrieved: Paget John
Retrieved: Marquet Richard
Retrieved: Meijer Adam
Retrieved: van der Velden Koos
Retrieved: Claessens Yann-Erick
Retrieved: Dhainaut Jean-François
Retrieved: Eid Luminita
Retrieved: Bromberg Zohar
Retrieved: EL-Latif Mahmoud Abd
Retrieved: Zeira Evelyn
Retrieved: Oppenheim Ariella
Retrieved: Weiss Yoram G
Retrieved: Yan Yuhe
Retrieved: Jung Yong T
Retrieved: Wu Tiyun
Retrieved: Kozak Christine A
Retrieved: Gendron Karine
Retrieved: Charbonneau Johanie
Retrieved: Dulude Dominic
Retrieved: Heveker Nikolaus
Retrieved: Ferbeyre Gerardo
Retrieved: Brakier-Gingras Léa
Retrieved: Wu Tsung-Shu Joseph
Retrieved: Shih Fuh-Yuan Frank
Retrieved: Yen Muh-Yong
Retrieved: Wu Jiunn-Shyan Julian
Retrieve

Retrieved: Perlman Stanley
Retrieved: Vali Bahareh
Retrieved: Yue Feng Yun
Retrieved: Jones R. Brad
Retrieved: Sheth Prameet M.
Retrieved: Kaul Rupert
Retrieved: Betts Michael R.
Retrieved: Wong David
Retrieved: Kovacs Colin
Retrieved: Loutfy Mona
Retrieved: Common Andrew
Retrieved: Halpenny Roberta
Retrieved: Ostrowski Mario A.
Retrieved: Mayordomo-Colunga Juan
Retrieved: Rey Corsino
Retrieved: González Soledad
Retrieved: Concha Andrés
Retrieved: Qin Jian
Retrieved: Jones Robert C.
Retrieved: Ramakrishnan Ramesh
Retrieved: Li Xingming
Retrieved: Huang Jianshi
Retrieved: Zhang Hui
Retrieved: Bigot Yves
Retrieved: Samain Sylvie
Retrieved: Augé-Gouillou Corinne
Retrieved: Federici Brian A
Retrieved: Croyle Maria A.
Retrieved: Patel Ami
Retrieved: Tran Kaylie N.
Retrieved: Gray Michael
Retrieved: Zhang Yi
Retrieved: Strong James E.
Retrieved: Feldmann Heinz
Retrieved: Kobinger Gary P.
Retrieved: Dare Ryan K.
Retrieved: Chittaganpitch Malinee
Retrieved: Erdman Dean D.
Retrieved: Paranthama

Retrieved: Abdullah Mahdi
Retrieved: Lang Dagmar S
Retrieved: Stellmacher Florian
Retrieved: Vollmer Ekkehard
Retrieved: Srivastava Renu
Retrieved: Liu Jian-Xiang
Retrieved: Howell Stephen H
Retrieved: Kirillova Svetlana
Retrieved: Kumar Suresh
Retrieved: Carugo Oliviero
Retrieved: do Vale Gomes Ana Lisa
Retrieved: Magalhães Cintia
Retrieved: Melo Fernando
Retrieved: Inga Rocio
Retrieved: Gadelha Sandra R.
Retrieved: Tokars Jerome I.
Retrieved: Burkom Howard
Retrieved: Xing Jian
Retrieved: English Roseanne
Retrieved: Bloom Steven
Retrieved: Cox Kenneth
Retrieved: Pavlin Julie A.
Retrieved: Simon-Loriere Etienne
Retrieved: Galetto Roman
Retrieved: Hamoudi Meriem
Retrieved: Archer John
Retrieved: Lefeuvre Pierre
Retrieved: Martin Darren P.
Retrieved: Robertson David L.
Retrieved: Negroni Matteo
Retrieved: Scull Margaret A.
Retrieved: Gillim-Ross Laura
Retrieved: Santos Celia
Retrieved: Roberts Kim L.
Retrieved: Bordonali Elena
Retrieved: Subbarao Kanta
Retrieved: Barclay Wendy S.
Retriev

Retrieved: Rong Rong
Retrieved: Li Bing
Retrieved: Lynch Rebecca M.
Retrieved: Haaland Richard E.
Retrieved: Murphy Megan K.
Retrieved: Mulenga Joseph
Retrieved: Allen Susan A.
Retrieved: Pinter Abraham
Retrieved: Shaw George M.
Retrieved: Hunter Eric
Retrieved: Robinson James E.
Retrieved: Gnanakaran S.
Retrieved: Derdeyn Cynthia A.
Retrieved: Tong Yufeng
Retrieved: Tempel Wolfram
Retrieved: Nedyalkova Lyudmila
Retrieved: MacKenzie Farrell
Retrieved: Park Hee-Won
Retrieved: Dizney Laurie J.
Retrieved: Ruedas Luis A.
Retrieved: Puranaveja Suphasawatt
Retrieved: Poolperm Pariwat
Retrieved: Lertwatcharasarakul Preeda
Retrieved: Kesdaengsakonwut Sawang
Retrieved: Boonsoongnern Alongkot
Retrieved: Urairong Kitcha
Retrieved: Kitikoon Pravina
Retrieved: Choojai Porjit
Retrieved: Kedkovid Roongtham
Retrieved: Teankum Komkrich
Retrieved: Thanawongnuwech Roongroje
Retrieved: Moyer Cheryl A
Retrieved: Yang Huixia
Retrieved: Kwawukume Yao
Retrieved: Gupta Anu
Retrieved: Zhu YuChun
Retrieved: Kora

Retrieved: Weiss Robin A.
Retrieved: Lanzavecchia Antonio
Retrieved: Bekaert Michaël
Retrieved: Firth Andrew E.
Retrieved: Zhang Yan
Retrieved: Gladyshev Vadim N.
Retrieved: Atkins John F.
Retrieved: Baranov Pavel V.
Retrieved: Pellet J.
Retrieved: Tafforeau L.
Empty name
Retrieved: Navratil V.
Retrieved: Meyniel L.
Retrieved: Achaz G.
Retrieved: Guironnet-Paquet A.
Empty name
Retrieved: Caignard G.
Retrieved: Cassonnet P.
Retrieved: Chaboud A.
Retrieved: Chantier T.
Retrieved: Deloire A.
Retrieved: Demeret C.
Retrieved: Le Breton M.
Retrieved: Neveu G.
Retrieved: Jacotot L.
Retrieved: Vaglio P.
Retrieved: Delmotte S.
Retrieved: Gautier C.
Retrieved: Combet C.
Retrieved: Deleage G.
Retrieved: Favre M.
Retrieved: Tangy F.
Retrieved: Jacob Y.
Retrieved: Andre P.
Retrieved: Lotteau V.
Retrieved: Rabourdin-Combe C.
Retrieved: Vidalain P. O.
Retrieved: Rello Jordi
Retrieved: Pop-Vicas Aurora
Retrieved: Wagner Bradley G
Empty name
Retrieved: Blower Sally
Retrieved: Suthar Mehul S.
Retrieved:

Retrieved: Waterman Steve
Retrieved: Uyeki Timothy
Retrieved: Azziz-Baumgartner Eduardo
Retrieved: Lee Nelson
Retrieved: Wong Chun Kwok
Retrieved: Chan Paul K.S.
Retrieved: Lindegardh Niklas
Retrieved: White Nicholas J.
Retrieved: Hayden Frederick G.
Retrieved: Wong Edward H.C.
Retrieved: Wong Ka Shing
Retrieved: Cockram Clive S.
Retrieved: Sung Joseph J.Y.
Retrieved: Hui David S.C.
Retrieved: Lee Charlie Wah Heng
Retrieved: Koh Chee Wee
Retrieved: Chan Yang Sun
Retrieved: Aw Pauline Poh Kim
Retrieved: Loh Kuan Hon
Retrieved: Han Bing Ling
Retrieved: Thien Pei Ling
Retrieved: Nai Geraldine Yi Wen
Retrieved: Hibberd Martin L.
Retrieved: Wong Christopher W.
Retrieved: Sung Wing-Kin
Retrieved: Hsieh Ying-Hen
Retrieved: Wood James G.
Retrieved: Zamani Nasim
Retrieved: MacIntyre C. Raina
Retrieved: Becker Niels G.
Retrieved: Peris Adriano
Retrieved: Cianchi Giovanni
Retrieved: Biondi Simona
Retrieved: Bonizzoli Manuela
Retrieved: Pasquini Andrea
Retrieved: Bonacchi Massimo
Retrieved: Ciapet

Retrieved: Nicoll Angus
Retrieved: Pan Xin
Retrieved: Qi Jian-cheng
Retrieved: Long Ming
Retrieved: Liang Hao
Retrieved: Chen Xiao
Retrieved: Li Han
Retrieved: Li Guang-bo
Retrieved: Zheng Hao
Retrieved: Chandramouli Kondethimmanahalli
Retrieved: Qian Pei-Yuan
Retrieved: Gao Rongbao
Retrieved: Dong Libo
Retrieved: Dong Jie
Retrieved: Wen Leying
Retrieved: Zhang Ye
Retrieved: Yu Hongjie
Retrieved: Feng Zijian
Retrieved: Chen Minmei
Retrieved: Tan Yi
Retrieved: Mo Zhaojun
Retrieved: Liu Haiyan
Retrieved: Fan Yunyan
Retrieved: Li Kunxiong
Retrieved: Li Chris Ka-Fai
Retrieved: Li Dexin
Retrieved: Yang Weizhong
Retrieved: Shu Yuelong
Retrieved: Liao Qiuyan
Retrieved: Cowling Benjamin
Retrieved: Lam Wing Tak
Retrieved: Ng Man Wai
Retrieved: Fielding Richard
Retrieved: Xiao Xiaodong
Retrieved: Zhu Zhongyu
Retrieved: Dankmeyer Jennifer L.
Retrieved: Wormald Michael M.
Retrieved: Fast Randy L.
Retrieved: Worsham Patricia L.
Retrieved: Cote Christopher K.
Retrieved: Amemiya Kei
Retrieved: Dimitr

Retrieved: Siddon Nicole A.
Retrieved: Hyatt Alex D.
Retrieved: Bystrom Jonas
Retrieved: Amin Kawa
Retrieved: Bishop-Bailey David
Retrieved: Hsieh Ying-Hen
Retrieved: Chan Chi-Ho
Retrieved: Miller Scott A.
Retrieved: Hiatt Leslie A.
Retrieved: Keil Robert G.
Retrieved: Wright David W.
Retrieved: Cliffel David E.
Retrieved: Riaz Shah Shahida Amjad
Retrieved: Idrees Muhammad
Retrieved: Hussain Abrar
Retrieved: Garske Tini
Retrieved: Yu Hongjie
Retrieved: Peng Zhibin
Retrieved: Ye Min
Retrieved: Zhou Hang
Retrieved: Cheng Xiaowen
Retrieved: Wu Jiabing
Retrieved: Ferguson Neil
Retrieved: Eggo Rosalind M.
Retrieved: Cauchemez Simon
Retrieved: Ferguson Neil M.
Retrieved: Lamb Daniel
Retrieved: Schüttelkopf Alexander W.
Retrieved: van Aalten Daan M. F.
Retrieved: Brighty David W.
Retrieved: Caprotta Gustavo
Retrieved: Gonzalez Crotti Patricia
Retrieved: Frydman Judith
Retrieved: Gendron Karine
Retrieved: Ferbeyre Gerardo
Retrieved: Heveker Nikolaus
Retrieved: Brakier-Gingras Léa
Retrieved: Po

In [17]:
author_df

Unnamed: 0,id,orcid,display_name,display_name_alternatives,relevance_score,works_count,cited_by_count,ids,last_known_institution,x_concepts,counts_by_year,works_api_url,updated_date,created_date
0,https://openalex.org/A2000431824,https://orcid.org/0000-0003-2453-0623,Tariq A. Madani,[Tariq A. Madani],3130.199,88,5115,{'openalex': 'https://openalex.org/A2000431824...,"{'id': 'https://openalex.org/I185163786', 'ror...","[{'id': 'https://openalex.org/C71924100', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-31T11:18:09.811600,2016-06-24
0,https://openalex.org/A2181556405,https://orcid.org/0000-0001-8106-0424,Aisha A. Al-Ghamdi,[],513.7164,23,102,{'openalex': 'https://openalex.org/A2181556405...,"{'id': 'https://openalex.org/I185163786', 'ror...","[{'id': 'https://openalex.org/C71924100', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-11-23T15:53:37.889422,2016-06-24
0,https://openalex.org/A2134341530,https://orcid.org/0000-0003-0923-0016,Albert van der Vliet,[],4789.768,190,9210,{'openalex': 'https://openalex.org/A2134341530...,"{'id': 'https://openalex.org/I111236770', 'ror...","[{'id': 'https://openalex.org/C86803240', 'wik...","[{'year': 2022, 'works_count': 1, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-25T21:09:00.511261,2016-06-24
0,https://openalex.org/A62037741,,Jason P. Eiserich,[],4468.0376,99,7951,"{'openalex': 'https://openalex.org/A62037741',...","{'id': 'https://openalex.org/I84218800', 'ror'...","[{'id': 'https://openalex.org/C86803240', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-28T23:18:57.646967,2016-06-24
0,https://openalex.org/A2098199652,,Carroll E. Cross,[],4917.113,300,14172,{'openalex': 'https://openalex.org/A2098199652...,"{'id': 'https://openalex.org/I84218800', 'ror'...","[{'id': 'https://openalex.org/C86803240', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-31T01:43:37.938942,2016-06-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,https://openalex.org/A2573029335,https://orcid.org/0000-0002-3649-6680,Yuan Soon Ho,[],2640.5466,157,5230,{'openalex': 'https://openalex.org/A2573029335...,"{'id': 'https://openalex.org/I47519274', 'ror'...","[{'id': 'https://openalex.org/C86803240', 'wik...","[{'year': 2022, 'works_count': 2, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-25T20:38:44.699928,2017-01-26
0,https://openalex.org/A3048868603,https://orcid.org/0000-0002-8859-5453,Lu Zhang,[],2513.0881,597,11048,{'openalex': 'https://openalex.org/A3048868603...,"{'id': 'https://openalex.org/I145897649', 'ror...","[{'id': 'https://openalex.org/C121332964', 'wi...","[{'year': 2022, 'works_count': 14, 'cited_by_c...",https://api.openalex.org/works?filter=author.i...,2022-12-23T03:38:58.241444,2020-08-18
0,https://openalex.org/A3021601967,,Dongyue Li,[],1056.1853,10,809,{'openalex': 'https://openalex.org/A3021601967...,"{'id': 'https://openalex.org/I92403157', 'ror'...","[{'id': 'https://openalex.org/C159985019', 'wi...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-31T05:33:48.199491,2020-05-13
0,https://openalex.org/A2653142792,,Shuqian Luo,[],618.22217,8,205,{'openalex': 'https://openalex.org/A2653142792...,"{'id': 'https://openalex.org/I183519381', 'ror...","[{'id': 'https://openalex.org/C31972630', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=author.i...,2022-12-15T03:37:35.234090,2017-06-30


Cleaning up the author data: 

1. Defining a dataframe
2. add the extracted information into our dataframe
3. replace special characters in the display name
4. add the x_concepts (most common research field of the author according to openalex)

In [18]:
author_data = pd.DataFrame(columns= [['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution', 'x_concepts']])
author_data = author_df[['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution', 'x_concepts']]

author_data['research_field'] = ""
author_data = author_data.reset_index()
author_data = author_data.drop(columns = 'index')
#author_data['counter'] = range(len(author_data))
#author_data = author_data.drop(columns='index')

#clean display names again

for i in range(len(author_data)):
    #author_data['display_name'][i] = re.sub(r'\W+', ' ', author_data['display_name'][i])
    #authors_split[j] = authors_split[j].replace('-',' ')
    author_data['display_name'][i] = author_data['display_name'][i].lstrip()
#author_data.display_name = author_data.display_name.str.replace('[^a-zA-Z]', ' ')

for i in range(len(author_data)):
    try:
        author_data.iloc[i, author_data.columns.get_loc('research_field')] = author_data.iloc[i]['x_concepts'][0]['display_name']
    except:
        author_data.iloc[i, author_data.columns.get_loc('research_field')] = "Unknown"
    try:
        author_data.iloc[i, author_data.columns.get_loc('last_known_institution')] = (list(author_data['last_known_institution'])[i]['display_name'])    
    except:
        author_data.iloc[i, author_data.columns.get_loc('last_known_institution')] = "Unknown"
author_data = author_data.drop(columns = ['x_concepts']) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  author_data['research_field'] = ""


In [19]:
author_data

Unnamed: 0,display_name,relevance_score,works_count,cited_by_count,last_known_institution,research_field
0,Tariq A. Madani,3130.199,88,5115,King Abdulaziz University,Medicine
1,Aisha A. Al-Ghamdi,513.7164,23,102,King Abdulaziz University,Medicine
2,Albert van der Vliet,4789.768,190,9210,University of Vermont,Biology
3,Jason P. Eiserich,4468.0376,99,7951,"University of California, Davis",Biology
4,Carroll E. Cross,4917.113,300,14172,"University of California, Davis",Biology
...,...,...,...,...,...,...
2849,Yuan Soon Ho,2640.5466,157,5230,Taipei Medical University,Biology
2850,Lu Zhang,2513.0881,597,11048,Minzu University of China,Physics
2851,Dongyue Li,1056.1853,10,809,University of Science and Technology Beijing,Composite material
2852,Shuqian Luo,618.22217,8,205,Capital Medical University,Computer vision


## Retrieving Insitution Information

Create the institution list from the author data

In [20]:
institutions_list = author_data['last_known_institution'].unique()

Replace commas and spaces at the start to retrieve the documents.

In [21]:
for i in range(len(institutions_list)):
    institutions_list[i] = institutions_list[i].replace(',','')
    institutions_list[i] = institutions_list[i].lstrip()
    print(institutions_list[i])
    #authors_split[j].replace(',','')
     #       authors_split[j] = authors_split[j].lstrip()
      #      author_list.append(authors_split[j])

King Abdulaziz University
University of Vermont
University of California Davis
Washington University in St. Louis
University of South Alabama
ProQR Therapeutics
SUNY Upstate Medical University
National Institute of Allergy and Infectious Diseases
University of Amsterdam
MSD
Utrecht University
Leiden University Medical Center
Instituto de Desarrollo Tecnológico para la Industria Química
University of Montreal
Ottawa Hospital
University of Nottingham
Unknown
University Medical Center Groningen
Cornell College
Children's Hospital of Philadelphia
University of Pittsburgh
University of Utah
Medical University of Sofia
Jikei University School of Medicine
Howard Hughes Medical Institute
University College Cork
Pfizer
Chang Gung University
Agricultural Research Service
Auburn University
European Molecular Biology Laboratory
Inserm
University of Queensland
Institució Catalana de Recerca i Estudis Avançats
The Francis Crick Institute
Oregon Medical Research Center
University of Colorado Denver
B

Create a dataframe and retrieve the institutions using OpenAlex.

In [22]:
institution_df = pd.DataFrame()

for i in range(len(institutions_list)):
    try:
        print('Retrieved:',institutions_list[i])
        institution_information = requests.get(
            'https://api.openalex.org/institutions?filter=display_name.search:'+institutions_list[i]
        ).json()['results'][0]
        current_institution = pd.DataFrame.from_dict(institution_information, orient='index')
        current_institution = current_institution.transpose()
        institution_df = institution_df.append(current_institution)
    except:
        print("Empty name")
    #author_data = author_data.append(pd.json_normalize(author_information)[['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution.display_name', 'x_concepts']])
    #author_data[i] = author_information['x_concepts'][1]['display_name']
    
    #author_data.append(pd.json_normalize(tariq)[['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution.display_name', 'x_concepts']])


Retrieved: King Abdulaziz University
Retrieved: University of Vermont
Retrieved: University of California Davis
Retrieved: Washington University in St. Louis
Retrieved: University of South Alabama
Retrieved: ProQR Therapeutics
Retrieved: SUNY Upstate Medical University
Retrieved: National Institute of Allergy and Infectious Diseases
Retrieved: University of Amsterdam
Retrieved: MSD
Retrieved: Utrecht University
Retrieved: Leiden University Medical Center
Retrieved: Instituto de Desarrollo Tecnológico para la Industria Química
Retrieved: University of Montreal
Retrieved: Ottawa Hospital
Retrieved: University of Nottingham
Retrieved: Unknown
Empty name
Retrieved: University Medical Center Groningen
Retrieved: Cornell College
Retrieved: Children's Hospital of Philadelphia
Retrieved: University of Pittsburgh
Retrieved: University of Utah
Retrieved: Medical University of Sofia
Retrieved: Jikei University School of Medicine
Retrieved: Howard Hughes Medical Institute
Retrieved: University Col

Retrieved: Uppsala University
Retrieved: Flinders Medical Centre
Retrieved: St. Elizabeths Hospital
Retrieved: Taipei Tzu Chi Hospital
Retrieved: Chiayi Chang Gung Memorial Hospital
Retrieved: National Health Research Institutes
Retrieved: Instituto Nacional de Investigación y Tecnología Agraria y Alimentaria
Retrieved: Technical University of Madrid
Retrieved: Centro de Biología Molecular Severo Ochoa
Retrieved: National Institute for Public Health and the Environment
Retrieved: University of Oxford
Retrieved: Data61
Retrieved: Australian National University
Retrieved: University of Sydney
Retrieved: Scripps Research Institute
Retrieved: Vaccine Research Center
Retrieved: University of Sydney Faculty of Pharmacy
Retrieved: University of Calgary
Retrieved: Alberta Health
Retrieved: Newcastle University
Retrieved: Nottingham University Hospitals NHS Trust
Retrieved: Stanford University
Retrieved: University of Oregon
Retrieved: Duke University
Retrieved: Boston University
Retrieved: Uni

Retrieved: Queen Mary Hospital
Retrieved: University of Cambridge
Retrieved: St. Paul's Co-educational College
Retrieved: Los Alamos National Laboratory
Retrieved: American College of Medical Genetics
Retrieved: The Pirbright Institute
Retrieved: Newbury College
Retrieved: Guangzhou Higher Education Mega Center
Retrieved: Chinese Academy of Agricultural Sciences
Retrieved: Oak Ridge National Laboratory
Retrieved: Beckman Research Institute
Retrieved: Lanzhou Veterinary Research Institute
Retrieved: University of Sussex
Retrieved: University of Stirling
Retrieved: University of Edinburgh
Retrieved: Arizona State University
Retrieved: International Institute of Molecular and Cell Biology
Retrieved: University of Warsaw
Retrieved: University of Exeter
Retrieved: Purdue University System
Retrieved: University of Kent
Retrieved: European Synchrotron Radiation Facility
Retrieved: Telethon Kids Institute
Retrieved: Universitat Politècnica de València
Retrieved: Institute of Molecular and Cell

Retrieved: German Center for Lung Research
Retrieved: University of Duhok
Retrieved: University Medical Center Hamburg-Eppendorf
Retrieved: University of Lübeck
Retrieved: Iowa State University
Retrieved: Siberian Federal University
Retrieved: All India Institute of Medical Sciences
Retrieved: Max F. Perutz Laboratories
Retrieved: Hospital Universitário Regional do Norte do Paraná
Retrieved: University of Brasília
Retrieved: Cayetano Heredia University
Retrieved: Johns Hopkins University Applied Physics Laboratory
Retrieved: The University of Texas Southwestern Medical Center
Retrieved: Cordis
Retrieved: Délégation Paris 5
Retrieved: University of Porto
Retrieved: Wadsworth Center
Retrieved: Trinity College Dublin
Retrieved: Peter Doherty Institute
Retrieved: UNSW Sydney
Retrieved: Ingham Institute
Retrieved: Northwest University
Retrieved: Ciber
Retrieved: Hospital Clínic de Barcelona
Retrieved: Hospital Universitario Araba
Retrieved: University of Barcelona
Retrieved: Institute of De

Retrieved: Anhui Medical University
Retrieved: HKU-Pasteur Research Pole
Retrieved: Brunel University London
Retrieved: Chemisches und Veterinäruntersuchungsamt Karlsruhe
Retrieved: Baden-Wuerttemberg Cooperative State University
Retrieved: State Museum of Natural History Karlsruhe
Retrieved: Zurich University of Applied Sciences
Retrieved: University of Hohenheim
Retrieved: Karlsruhe Institute of Technology
Retrieved: Hospital for Tropical Diseases
Retrieved: Digital Equipment
Retrieved: Laboratoire de Microbiologie et Génétique Moléculaires
Retrieved: National Heart Lung and Blood Institute
Retrieved: University of Arizona
Retrieved: Institute of Integrative Biology of the Cell
Retrieved: Hong Kong Baptist University
Retrieved: University of Worcester
Retrieved: University of Essex
Retrieved: Philadelphia Department of Public Health
Retrieved: Department of Public Health
Retrieved: Xavier University
Retrieved: Tokyo Metropolitan Institute of Medical Science
Retrieved: Biomedical Prim

Retrieved: Samsung C&T Corporation
Empty name
Retrieved: Gachon University
Retrieved: Otto-von-Guericke University Magdeburg
Retrieved: Qatar Airways
Retrieved: Baylor University
Retrieved: Schering-Plough
Retrieved: Indiana University Bloomington
Retrieved: Colgate University
Retrieved: Vanderbilt University Medical Center
Retrieved: University of Ferrara
Retrieved: Tata Institute of Fundamental Research
Retrieved: Southampton General Hospital
Retrieved: Institut Pasteur de Tunis
Retrieved: University of Maiduguri
Retrieved: Sheba Medical Center
Retrieved: Minnesota Department of Health
Retrieved: Sapporo Medical University
Retrieved: University of Miyazaki
Retrieved: United States Public Health Service
Retrieved: NC Department of Health and Human Services
Retrieved: Anhui Provincial Center for Disease Control and Prevention
Retrieved: Huntsman Cancer Institute
Retrieved: Wake Forest University
Retrieved: University of Foggia
Retrieved: Centro di Riferimento Oncologico della Basilicat

Preprocess the institution data by:

1. renaming
2. find out the main research field
3. replacing special characters

In [23]:
institution_data = institution_df[['display_name', 'relevance_score', 'country_code', 'type', 'cited_by_count', 'x_concepts']]
institution_data = institution_data.rename(columns= {'display_name' : 'institution_name', 'relevance_score': 'institution_relevance_score', 'cited_by_count': 'institution_cited_by_count', 'x_concepts' : 'institution_main_research_field'})

# gain knowledge about the institutions main research field
for i in range(len(institution_data)):
    try:
        institution_data.iloc[i, institution_data.columns.get_loc('institution_main_research_field')] = (list(institution_data['institution_main_research_field'])[i][0]['display_name'])    
    except:
        institution_data.iloc[i, institution_data.columns.get_loc('institution_main_research_field')] = "Unknown"

institution_data = institution_data.reset_index()
institution_data = institution_data.drop(columns = 'index')
        
institution_data.institution_name = institution_data.institution_name.str.replace('[^a-zA-Z]', ' ')
institution_data.institution_name = institution_data.institution_name.str.replace('  ',' ')

  institution_data.institution_name = institution_data.institution_name.str.replace('[^a-zA-Z]', ' ')


In [24]:
institution_data

Unnamed: 0,institution_name,institution_relevance_score,country_code,type,institution_cited_by_count,institution_main_research_field
0,King Abdulaziz University,80447.086,SA,education,1842725,Biology
1,University of Vermont,63553.094,US,education,2799984,Biology
2,University of California Davis,212500.31,US,education,15871016,Biology
3,Washington University in St Louis,338772.06,US,education,25527919,Biology
4,University of South Alabama,42759.277,US,education,761598,Biology
...,...,...,...,...,...,...
1174,Nantong University,28318.2,CN,education,457078,Biology
1175,Taipei Medical University,42372.883,TW,education,907545,Medicine
1176,Minzu University of China,22072.748,CN,education,163392,Biology
1177,University of Science and Technology Beijing,71857.04,CN,education,2214308,Physics


## Combining the dataframes:

Combining the data was a time consuming task due to smaller errors and mistakes.
1. taking the authors from the metadata and replace their special characters
2. taking the authors from the retrieved authors to gain more knowledge about them
3. compare lost data due to merges (e.g. missing values somewhere or wrong name alignments)
    - 3.1 since the metadata is taken document wise, high amounts of authors could be lost. Then, more have to be retrived
4. merging the data 
5. comparing lost data due to merges in terms of adding the institution
6. merging the data again now containing information about the Article, Authors and Institutions

In [57]:
#replace metadata authors 
metadata_authors = split_authors(500)#len(metadata))
#metadata_authors.author = metadata_authors.author.str.replace('[^a-zA-Z]', ' ')
metadata_authors.author = metadata_authors.author.str.replace('  ',' ')
metadata_authors['author'] = metadata_authors['author'].str.lstrip()
metadata_authors.author = metadata_authors.author.str.replace('.', '')

metadata_authors = metadata_authors.rename(columns={'author': 'display_name'})

  author_uid_df = author_uid_df.replace('', np.nan).set_index('cord_uid').stack().reset_index(name='author').drop('level_1',1)
  metadata_authors.author = metadata_authors.author.str.replace('.', '')


In [58]:
# renaming columns and replacing special characters e.g. accent
author_data.display_name = author_data.display_name.str.replace('.', '')
#author_data.last_known_institution = author_data.last_known_institution.str.replace('[^a-zA-Z]', ' ')
author_data.last_known_institution = author_data.last_known_institution.str.replace('  ',' ')

  author_data.display_name = author_data.display_name.str.replace('.', '')


lost data due to name merges:

In [59]:
metadata_authors

Unnamed: 0,cord_uid,display_name,title,license,publish_time,journal
0,ug7v899j,Tariq A Madani,Clinical features of culture-proven Mycoplasma...,no-cc,2001-07-04,BMC Infect Dis
1,ug7v899j,Aisha A Al-Ghamdi,Clinical features of culture-proven Mycoplasma...,no-cc,2001-07-04,BMC Infect Dis
2,02tnwd4m,Albert van der Vliet,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res
3,02tnwd4m,Jason P Eiserich,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res
4,02tnwd4m,Carroll E Cross,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res
...,...,...,...,...,...,...
2910,a18rch1f,Yuan-Yuan Ho,Apolipoprotein M Gene (APOM) Polymorphism Modi...,cc-by,2011-02-24,PLoS One
2911,0jyzk5kt,Lu Zhang,Non-Invasive Microstructure and Morphology Inv...,cc-by,2011-02-25,PLoS One
2912,0jyzk5kt,Dongyue Li,Non-Invasive Microstructure and Morphology Inv...,cc-by,2011-02-25,PLoS One
2913,0jyzk5kt,Shuqian Luo,Non-Invasive Microstructure and Morphology Inv...,cc-by,2011-02-25,PLoS One


In [60]:
metadata_authors['display_name']

0             Tariq A Madani
1          Aisha A Al-Ghamdi
2       Albert van der Vliet
3           Jason P Eiserich
4            Carroll E Cross
                ...         
2910            Yuan-Yuan Ho
2911                Lu Zhang
2912              Dongyue Li
2913             Shuqian Luo
2914          Pauline Schaap
Name: display_name, Length: 2915, dtype: object

In [61]:
author_data['display_name']

0             Tariq A Madani
1          Aisha A Al-Ghamdi
2       Albert van der Vliet
3           Jason P Eiserich
4            Carroll E Cross
                ...         
2849            Yuan Soon Ho
2850                Lu Zhang
2851              Dongyue Li
2852             Shuqian Luo
2853          Pauline Schaap
Name: display_name, Length: 2854, dtype: object

Lost data due to authors: eventually high if split_authors has a higher value than getAuthors.

In [64]:
len(list(set(metadata_authors['display_name']) - set(author_data['display_name'])))

473

In [37]:
#merging our dataframes

combined_data = metadata_authors.merge(author_data, on = 'display_name')
#combined_data2 = combined_data.merge(institution_data, left_on='last_known_institution', right_on = 'institution_name')

In [38]:
combined_data

Unnamed: 0,cord_uid,display_name,title,license,publish_time,journal,relevance_score,works_count,cited_by_count,last_known_institution,research_field
0,ug7v899j,Tariq A Madani,Clinical features of culture-proven Mycoplasma...,no-cc,2001-07-04,BMC Infect Dis,3130.199,88,5115,King Abdulaziz University,Medicine
1,ug7v899j,Aisha A Al-Ghamdi,Clinical features of culture-proven Mycoplasma...,no-cc,2001-07-04,BMC Infect Dis,513.7164,23,102,King Abdulaziz University,Medicine
2,02tnwd4m,Albert van der Vliet,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res,4789.768,190,9210,University of Vermont,Biology
3,02tnwd4m,Jason P Eiserich,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res,4468.0376,99,7951,"University of California, Davis",Biology
4,02tnwd4m,Carroll E Cross,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res,4917.113,300,14172,"University of California, Davis",Biology
...,...,...,...,...,...,...,...,...,...,...,...
3240,n38e01bd,Ping Li,Relative Efficacy of AS03-Adjuvanted Pandemic ...,cc-by-nc-nd,2014-08-15,J Infect Dis,4563.895,1244,17789,China Pharmaceutical University,Biology
3241,aaglh001,Ping Li,Prevalence and Incidence of Respiratory Syncyt...,cc-by-nc-nd,2015-06-01,Clin Infect Dis,4563.895,1244,17789,China Pharmaceutical University,Biology
3242,f29y8vh5,Martha I Nelson,Multiyear Persistence of 2 Pandemic A/H1N1 Inf...,no-cc,2014-07-01,J Infect Dis,1892.6047,98,4176,National Institute of Allergy and Infectious D...,Biology
3243,utigp2vi,Wlodzimierz J Krzyzosiak,RAN translation and frameshifting as translati...,cc-by-nc,2014-10-29,Nucleic Acids Res,1754.6482,149,4199,"Institute of Bioorganic Chemistry, Polish Acad...",Biology


lost data due to institution:

In [39]:
list(set(combined_data['last_known_institution']) - set(institution_data['institution_name']))

["King's College London",
 'National Yang-Ming University',
 'La Jolla Institute For Allergy & Immunology',
 'National Research Council',
 'University of California, Irvine',
 'Universität Hamburg',
 'Instituto Superior Técnico',
 'Centre de Recherche Saint-Antoine',
 'University of California, San Diego',
 'University Hospital Würzburg',
 'Institute of Bioorganic Chemistry, Polish Academy of Sciences',
 'National Centre for Immunisation Research & Surveillance',
 'Universidade de São Paulo',
 'Unité',
 "Children's Hospital of Pittsburgh",
 'Instituto de Investigación Sanitaria del Principado de Asturias',
 "Institut National de l'Énergie Solaire",
 'University of Maryland, Baltimore',
 "Brigham and Women's Hospital",
 'Centre Hospitalier Universitaire de Liège',
 'University of Michigan–Ann Arbor',
 'University of Paris',
 'CEA Fontenay-aux-Roses',
 'University of Würzburg',
 "Johns Hopkins Children's Center",
 'Santaris Pharma a/s',
 'Azienda Ospedaliero-Universitaria Careggi',
 'Wit

In [40]:
#institution_data.institution_name = institution_data.institution_name.str.replace('[^a-zA-Z]', ' ')
#institution_data.institution_name = institution_data.institution_name.str.replace('  ',' ')

In [41]:
combined_data2 = combined_data.merge(institution_data, left_on = 'last_known_institution', right_on = 'institution_name')

In [42]:
combined_data2

Unnamed: 0,cord_uid,display_name,title,license,publish_time,journal,relevance_score,works_count,cited_by_count,last_known_institution,research_field,institution_name,institution_relevance_score,country_code,type,institution_cited_by_count,institution_main_research_field
0,ug7v899j,Tariq A Madani,Clinical features of culture-proven Mycoplasma...,no-cc,2001-07-04,BMC Infect Dis,3130.199,88,5115,King Abdulaziz University,Medicine,King Abdulaziz University,80447.086,SA,education,1842725,Biology
1,ug7v899j,Aisha A Al-Ghamdi,Clinical features of culture-proven Mycoplasma...,no-cc,2001-07-04,BMC Infect Dis,513.7164,23,102,King Abdulaziz University,Medicine,King Abdulaziz University,80447.086,SA,education,1842725,Biology
2,02tnwd4m,Albert van der Vliet,Nitric oxide: a pro-inflammatory mediator in l...,no-cc,2000-08-15,Respir Res,4789.768,190,9210,University of Vermont,Biology,University of Vermont,63553.094,US,education,2799984,Biology
3,ip32y0j5,Sean A Diehl,Generation of Human Antigen-Specific Monoclona...,cc-by,2010-10-04,PLoS One,2366.058,73,3446,University of Vermont,Medicine,University of Vermont,63553.094,US,education,2799984,Biology
4,2b73a28n,Karen A Fagan,Role of endothelin-1 in lung disease,no-cc,2001-02-22,Respir Res,2578.1917,89,4295,University of South Alabama,Medicine,University of South Alabama,42759.277,US,education,761598,Biology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2591,ruba65p5,Kwok-Hung Chan,Validation of Self-swab for Virologic Confirma...,bronze-oa,2011-12-23,The Journal of Infectious Diseases,8407.112,442,33499,Queen Mary Hospital,Medicine,Queen Mary Hospital,66684.04,CN,healthcare,1334568,Medicine
2592,zshd7i9i,Nicolò Patroniti,Extracorporeal membrane oxygenation (ECMO) in ...,cc-by,2013-02-13,Crit Care,2061.433,185,5321,University of Genoa,Medicine,University of Genoa,100375.67,IT,education,4467840,Biology
2593,q8gshba8,Myriam Arévalo-Herrera,Plasmodium vivax Antigen Discovery Based on Al...,cc-by,2014-06-24,PLoS One,1462.812,148,3283,Caucaseco Scientific Research Center,Medicine,Caucaseco Scientific Research Center,4331.717,CO,facility,4617,Medicine
2594,n38e01bd,Ping Li,Relative Efficacy of AS03-Adjuvanted Pandemic ...,cc-by-nc-nd,2014-08-15,J Infect Dis,4563.895,1244,17789,China Pharmaceutical University,Biology,China Pharmaceutical University,53384.145,CN,education,1367300,Biology


## Retrieving Journal information

In [43]:
journal_list = combined_data['journal'].unique()

In [44]:
journal_list

array(['BMC Infect Dis', 'Respir Res', 'Virol J', 'The EMBO Journal',
       'PLoS Pathog', 'Crit Care',
       'Journal of the American Medical Informatics Association',
       'BMC Public Health', 'EMBO J', 'Nucleic Acids Res', 'EMBO Rep',
       'Biol Proced Online', 'J Transl Med', 'Retrovirology', 'PLoS One',
       'BMC Genomics', 'Aust New Zealand Health Policy',
       'Microb Cell Fact', 'Evol Bioinform Online', 'J Biomed Biotechnol',
       'BMC Gastroenterol', 'PLoS Biol', 'Int J Health Geogr',
       'Biochemistry', 'Immunome Res', 'PLoS Comput Biol', 'BMC Mol Biol',
       'EMBO reports', 'PLoS Med', 'BMC Biotechnol', 'Harm Reduct J',
       'Nat Med', 'PLoS Negl Trop Dis', 'Clin Infect Dis',
       'BMC Med Ethics', 'Evid Based Complement Alternat Med',
       'World Allergy Organ J', 'BMJ Open', 'Emerg Infect Dis',
       'Ann Clin Microbiol Antimicrob', 'Methods', 'Int J Inflam',
       'Reprod Biol Endocrinol', 'Theor Biol Med Model',
       'Clinical Chemistry', 'BMC 

In [45]:
journal_df = pd.DataFrame()

for i in range(len(journal_list)):
    try:
        print(journal_list[i])
        journal_information = requests.get(
            'https://api.openalex.org/venues?filter=display_name.search:'+journal_list[i]
        ).json()['results'][0]
        current_journal = pd.DataFrame.from_dict(journal_information, orient='index')
        current_journal = current_journal.transpose()
        journal_df = journal_df.append(current_journal)
    except:
        print("Empty name")
    #author_data = author_data.append(pd.json_normalize(author_information)[['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution.display_name', 'x_concepts']])
    #author_data[i] = author_information['x_concepts'][1]['display_name']
    
    #author_data.append(pd.json_normalize(tariq)[['display_name', 'relevance_score', 'works_count', 'cited_by_count', 'last_known_institution.display_name', 'x_concepts']])


BMC Infect Dis
Empty name
Respir Res
Empty name
Virol J
The EMBO Journal
PLoS Pathog
Empty name
Crit Care
Journal of the American Medical Informatics Association
BMC Public Health
EMBO J
Empty name
Nucleic Acids Res
Empty name
EMBO Rep
Empty name
Biol Proced Online
Empty name
J Transl Med
Empty name
Retrovirology
PLoS One
BMC Genomics
Aust New Zealand Health Policy
Empty name
Microb Cell Fact
Empty name
Evol Bioinform Online
Empty name
J Biomed Biotechnol
Empty name
BMC Gastroenterol
Empty name
PLoS Biol
Empty name
Int J Health Geogr
Empty name
Biochemistry
Immunome Res
Empty name
PLoS Comput Biol
Empty name
BMC Mol Biol
Empty name
EMBO reports
PLoS Med
Empty name
BMC Biotechnol
Empty name
Harm Reduct J
Empty name
Nat Med
Empty name
PLoS Negl Trop Dis
Empty name
Clin Infect Dis
BMC Med Ethics
Empty name
Evid Based Complement Alternat Med
Empty name
World Allergy Organ J
Empty name
BMJ Open
Emerg Infect Dis
Empty name
Ann Clin Microbiol Antimicrob
Empty name
Methods
Int J Inflam
Empty n

In [46]:
journal_df.head()

Unnamed: 0,id,issn_l,issn,display_name,publisher,relevance_score,works_count,cited_by_count,is_oa,is_in_doaj,...,country_code,societies,alternate_titles,abbreviated_title,type,x_concepts,counts_by_year,works_api_url,updated_date,created_date
0,https://openalex.org/V4306514259,,,Int J Virol,,393.81268,319,319,,,...,,[],[],,journal,"[{'id': 'https://openalex.org/C71924100', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=host_ven...,2022-12-31T01:55:50.348757,2022-10-17
0,https://openalex.org/V127916151,0261-4189,"[1460-2075, 0261-4189]",The EMBO Journal,Nature Portfolio,85738.17,20945,2655687,False,False,...,DE,[],[European Molecular Biology Organization journ...,,journal,"[{'id': 'https://openalex.org/C86803240', 'wik...","[{'year': 2023, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=host_ven...,2023-01-02T00:57:33.179260,2016-06-24
0,https://openalex.org/V4306506051,,,Chin Crit Care Med,,3055.4382,3090,3098,,,...,,[],[],,journal,"[{'id': 'https://openalex.org/C41008148', 'wik...","[{'year': 2022, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=host_ven...,2022-12-31T00:52:47.324380,2022-10-17
0,https://openalex.org/V129839026,1067-5027,"[1067-5027, 1527-974X]",Journal of the American Medical Informatics As...,Oxford University Press,30164.291,4383,199861,False,False,...,GB,[],[JAMIA],,journal,"[{'id': 'https://openalex.org/C41008148', 'wik...","[{'year': 2023, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=host_ven...,2023-01-02T09:49:55.000914,2016-06-24
0,https://openalex.org/V200437886,1471-2458,[1471-2458],BMC Public Health,BioMed Central,43231.438,20950,516088,True,True,...,GB,[],"[BioMed Central public health, Public health]",,journal,"[{'id': 'https://openalex.org/C71924100', 'wik...","[{'year': 2023, 'works_count': 0, 'cited_by_co...",https://api.openalex.org/works?filter=host_ven...,2023-01-01T20:03:01.710174,2016-06-24


In [47]:
journal_data = journal_df[['issn', 'display_name', 'publisher', 'relevance_score', 'cited_by_count', 'cited_by_count','x_concepts']]
journal_data = journal_data.rename(columns = {'display_name':'journal_display_name', 'publisher': 'journal_publisher', 'relevance_score': 'journal_relevance_score', 'cited_by_count' : 'journal_cited_count', 'x_concepts' : 'main_research_field'})

In [48]:
journal_data

Unnamed: 0,issn,journal_display_name,journal_publisher,journal_relevance_score,journal_cited_count,journal_cited_count.1,main_research_field
0,,Int J Virol,,393.81268,319,319,"[{'id': 'https://openalex.org/C71924100', 'wik..."
0,"[1460-2075, 0261-4189]",The EMBO Journal,Nature Portfolio,85738.17,2655687,2655687,"[{'id': 'https://openalex.org/C86803240', 'wik..."
0,,Chin Crit Care Med,,3055.4382,3098,3098,"[{'id': 'https://openalex.org/C41008148', 'wik..."
0,"[1067-5027, 1527-974X]",Journal of the American Medical Informatics As...,Oxford University Press,30164.291,199861,199861,"[{'id': 'https://openalex.org/C41008148', 'wik..."
0,[1471-2458],BMC Public Health,BioMed Central,43231.438,516088,516088,"[{'id': 'https://openalex.org/C71924100', 'wik..."
0,[1742-4690],Retrovirology,BioMed Central,11502.022,59152,59152,"[{'id': 'https://openalex.org/C71924100', 'wik..."
0,[1932-6203],PLOS ONE,Public Library of Science,191888.27,7313220,7313220,"[{'id': 'https://openalex.org/C86803240', 'wik..."
0,[1471-2164],BMC Genomics,Springer Science+Business Media,42423.34,543264,543264,"[{'id': 'https://openalex.org/C86803240', 'wik..."
0,"[1520-4995, 1943-295X, 0006-2960]",Biochemistry,American Chemical Society,55881.723,3552476,3552476,"[{'id': 'https://openalex.org/C185592680', 'wi..."
0,"[1469-221X, 1469-3178]",EMBO Reports,Nature Portfolio,32648.39,270058,270058,"[{'id': 'https://openalex.org/C86803240', 'wik..."


In [49]:
for i in range(len(journal_data)):
    try:
        journal_data.iloc[i, journal_data.columns.get_loc('main_research_field')] = (list(journal_data['main_research_field'])[i][0]['display_name'])    
    except:
        journal_data.iloc[i, journal_data.columns.get_loc('main_research_field')] = "Unknown"

Merging the journal data now to our complete dataframe

In [50]:
# renaming columns and replacing special characters e.g. accent
journal_data.journal_display_name = journal_data.journal_display_name.str.replace('.', '')
#journal_data.journal_display_name = journal_data.journal_display_name.str.replace('[^a-zA-Z]', ' ')
#journal_data.journal_display_name = journal_data.journal_display_name.str.replace('  ',' ')

  journal_data.journal_display_name = journal_data.journal_display_name.str.replace('.', '')


In [51]:
list(set(combined_data2['journal']) - set(journal_data['journal_display_name']))

['J Biomed Biotechnol',
 'Open Forum Infect Dis',
 'Harm Reduct J',
 'BMC Gastroenterol',
 'Environ Health Prev Med',
 'BMC Med',
 'Influenza Other Respir Viruses',
 'J Transl Med',
 'BMC Mol Biol',
 'Environ Health',
 'PLoS Comput Biol',
 'Nat Commun',
 'PLoS Genet',
 'Cytokine & Growth Factor Reviews',
 'Cell Microbiol',
 'Anal Bioanal Chem',
 'BMC Biotechnol',
 'EMBO Rep',
 'BMC Biol',
 'Crit Care',
 'Int J Environ Res Public Health',
 'Cases J',
 'PLoS Biol',
 'Br J Cancer',
 'Cell Res',
 'Naunyn Schmiedebergs Arch Pharmacol',
 'Diagn Pathol',
 'Epidemiol Perspect Innov',
 'BMC Pregnancy Childbirth',
 'mBio',
 'Isr J Health Policy Res',
 'Hum Resour Health',
 'Theor Biol Med Model',
 'Mediators Inflamm',
 'BMC Med Genomics',
 'Pediatr Res',
 'Ital J Pediatr',
 'J Med Case Reports',
 'Int J Mol Sci',
 'J Cell Biol',
 'BMC Pediatr',
 'Emerg Infect Dis',
 'PLoS Curr',
 'Dev Immunol',
 'Immunome Res',
 'Vet Res',
 'Biol Proced Online',
 'BMC Res Notes',
 'Vet Med Int',
 'Open Virol J',

In [52]:
combined_data3 = combined_data2.merge(journal_data, left_on = 'journal', right_on = 'journal_display_name')
combined_data3 = combined_data3.loc[:,~combined_data3.columns.duplicated()]

### Our completed Dataframe consisting of: 



In [53]:
combined_data3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 225 entries, 0 to 224
Data columns (total 23 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   cord_uid                         225 non-null    object
 1   display_name                     225 non-null    object
 2   title                            225 non-null    object
 3   license                          225 non-null    object
 4   publish_time                     225 non-null    object
 5   journal                          225 non-null    object
 6   relevance_score                  224 non-null    object
 7   works_count                      225 non-null    object
 8   cited_by_count                   225 non-null    object
 9   last_known_institution           225 non-null    object
 10  research_field                   225 non-null    object
 11  institution_name                 225 non-null    object
 12  institution_relevance_score      225

In [54]:
pwd = %pwd

In [56]:
combined_data3.to_csv(pwd+'full_dataframe_500.csv', index=False, header=True)
