In [1]:
import pandas as pd
import re
import requests
import time

In [2]:
# Load biorxiv.csv into the working DataFrame that the following cells clean column by column.
df = pd.read_csv("biorxiv.csv")

In [None]:
# affiliation_nums: a missing value means a single affiliation, so it becomes '1';
# every comma-separated string is then split into a list of affiliation labels.
df['affiliation_nums'] = (
    df['affiliation_nums']
    .fillna('1')
    .apply(lambda nums: str.split(nums, ','))
)

In [None]:
#Affiliation_text: Replace NAs with empty string
#Run clean_affiliation, to reformat affiliations
def clean_affiliation(x):
    """Split a ';'-separated affiliation string into a list of tidied entries.

    For each entry, every newline followed by one or more tabs is removed,
    surrounding whitespace is stripped, any leading commas are dropped, and
    the entry is stripped once more. An empty input yields [''].
    """
    cleaned = []
    for entry in x.split(';'):
        # Remove the newline-plus-tabs layout residue before trimming.
        entry = re.sub('(\\n)(\\t)+', '', entry).strip()
        # Drop the comma(s) left at the start of the entry, then trim again.
        entry = re.sub('^,+', '', entry).strip()
        cleaned.append(entry)
    return cleaned
# Missing affiliation text becomes '' so that every row can be split and tidied.
df['affiliation_text'] = df['affiliation_text'].fillna('').apply(clean_affiliation)

In [None]:
#Cleaning number of downloads
def clean_downloads(x):
    """Turn a comma-joined string of HTML table rows into nested lists of cell text.

    The input is split on ',' into rows. The opening '<tr ...>' tag (odd or
    even) and the closing ' </tr>' are removed from each row, the row is split
    on '</td>', and a leading '<td>' is removed from every piece. Because the
    split is on the closing tag, each row keeps whatever follows its last
    '</td>' as a final piece (usually ''). An empty input yields [['']].
    """
    row_markup = ('<tr class="odd">', '<tr class="even">', ' </tr>')
    table = []
    for row in x.split(','):
        for pattern in row_markup:
            row = re.sub(pattern, '', row)
        table.append([re.sub('^<td>', '', cell) for cell in row.split('</td>')])
    return table
# Missing download tables become '' so that clean_downloads can parse every row.
df['downloads'] = df['downloads'].fillna('').apply(clean_downloads)

In [None]:
# Cleaning date_posted, converting to datetime.
# The date is cut out by fixed offsets: the first 12 characters and the last 3
# characters of each raw value are discarded before parsing.
# NOTE(review): relies on df having the default 0..n-1 index, as the original
# positional pd.Series assignment did.
df['date_posted'] = pd.to_datetime(
    df['date_posted'].map(lambda raw: raw[12:][:-3])
)

In [None]:
def get_pubmed_id(authors):
    """Search PubMed (Entrez ESearch, author field) for a paper's leading authors.

    Parameters
    ----------
    authors : str
        Comma-separated author names. Names are lower-cased and stripped;
        empty names are ignored. At most the first three remaining names are
        used, combined with AND.

    Returns
    -------
    str
        The raw response body returned by ESearch, or the sentinel string
        "No authors" when ``authors`` contains no usable name.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or no response arrives within the timeout.
    """
    import os
    from urllib.parse import quote_plus

    base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    esearch = 'esearch.fcgi?'
    db = 'db=pubmed&'
    field = 'field=author&'
    term_prefix = 'term='
    and_syntax = '+AND+'
    term_suffix = '&'
    # NOTE(review): the literal key below is a credential committed to the
    # notebook. It is kept only as a fallback so existing runs behave the same;
    # set NCBI_API_KEY in the environment and rotate/remove the literal.
    api = 'api_key=' + os.environ.get('NCBI_API_KEY', '56a5fd6220202c3f7c1c924f215c4188b708')

    # Stripping and dropping empty names avoids malformed terms such as
    # 'a+AND+b+AND+' for input with a trailing comma or padded separators.
    names = [name.strip() for name in authors.lower().split(',')]
    names = [name for name in names if name]
    if not names:
        return "No authors"

    # quote_plus turns spaces into '+' (as before) and additionally escapes
    # characters such as '&' or '#' that would otherwise corrupt the query string.
    term = and_syntax.join(quote_plus(name) for name in names[:3])

    url = base + esearch + db + field + term_prefix + term + term_suffix + api

    # A timeout keeps one stalled request from hanging the whole query loop.
    response = requests.get(url, timeout=30)

    return response.text

In [None]:
def parse_pmid(xml):
    """Extract the hit count and PMIDs from a raw get_pubmed_id result.

    Parameters
    ----------
    xml : str
        Either the sentinel "No authors" or a raw ESearch response body.

    Returns
    -------
    list
        * ``[xml]`` when the input is the "No authors" sentinel, when
          'error' appears within its first 9 characters, or when the
          expected <Count>/<IdList> elements cannot be found or parsed.
        * ``[0, ""]`` when the count is zero.
        * ``[count, id_1, ..., id_n, tail]`` otherwise. ``tail`` is whatever
          follows the last '</Id>' with newlines removed (an empty string for
          newline-separated responses). This trailing element is kept on
          purpose: the subsetting cell selects single-hit papers by checking
          that the list has length 3.
    """
    if xml == "No authors":
        return [xml]
    if re.search('error', xml[:9]) is not None:
        return [xml]

    # An unexpected body (e.g. an HTML gateway error) has no <Count>; hand it
    # back like any other error instead of raising and aborting the whole map.
    count_match = re.search('<Count>(.*?)</Count>', xml, re.DOTALL)
    if count_match is None:
        return [xml]
    try:
        count = int(count_match.group(1))
    except ValueError:
        return [xml]

    if count == 0:
        return [count, ""]

    _, id_list_open, after_open = xml.partition('<IdList>')
    if not id_list_open:
        return [xml]
    id_block = after_open.partition('</IdList>')[0]
    ids = [
        chunk.replace('<Id>', '').replace('\n', '')
        for chunk in id_block.split('</Id>')
    ]
    return [count, *ids]

In [None]:
#Obtain authors from the data frame and query Entrez for PMIDs
#(rows without authors are passed as '' so get_pubmed_id can short-circuit them)
author_list = list(df['author_names'].fillna('').values)
pmid_result = []
for author_names in author_list:
    pmid_result.append(get_pubmed_id(author_names))
    # Pause 0.2 s after every request, i.e. at most about five requests per second.
    time.sleep(0.2)

#Optionally write result to CSV
#pd.Series(pmid_result).to_csv('pmids.csv')


In [None]:
#Parse raw PMID returns into count of hits, and PMID numbers. Add as new column to df
parsed_pmid = list(map(parse_pmid, pmid_result))
df['pmid'] = pd.Series(parsed_pmid, index=df.index)

#Subset the data on just results that have one hit for a PMID, meaning a one to one mapping of authors to PMID
#A single hit is parsed as [count, pmid, ''], i.e. a list of length 3.
#Assign values to dataframe called df_subset
subset_vec = df['pmid'].map(len) == 3
#.copy() makes df_subset an independent frame (no SettingWithCopyWarning on the
#assignments below); reset_index renumbers rows 0..n-1 for however many rows
#matched instead of assuming a fixed row count.
df_subset = df.loc[subset_vec].copy().reset_index(drop=True)
#Keep only the PMID string from each [count, pmid, ''] list
df_subset['pmid'] = df_subset['pmid'].map(lambda hit: hit[1])

In [None]:
def get_pmid_links(pmid_list):
    """Fetch, for each PMID, the PubMed IDs linked to it as 'pubmed_pubmed_citedin'.

    All PMIDs are sent in a single Entrez ELink request, each as its own
    ``id=`` parameter, and the response is cut into one section per
    '<LinkSet>'.

    Parameters
    ----------
    pmid_list : sequence
        PMIDs to look up (converted with ``str``). An empty sequence returns
        ``[]`` without issuing a request.

    Returns
    -------
    list of list of str
        One list per '<LinkSet>' found in the response, in response order.
        A link set without a '<LinkSetDb>' section yields an empty list;
        otherwise the list holds the complete ID of every '<Link>' entry.
        NOTE(review): if the service returns an error body, fewer lists than
        input PMIDs come back; callers should check the length.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or no response arrives within the timeout.
    """
    import os

    base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    elink = 'elink.fcgi?'
    db = 'dbfrom=pubmed&linkname=pubmed_pubmed_citedin&'
    id_prefix = 'id='
    id_suffix = '&'
    # NOTE(review): the literal key below is a credential committed to the
    # notebook. It is kept only as a fallback so existing runs behave the same;
    # set NCBI_API_KEY in the environment and rotate/remove the literal.
    api = 'api_key=' + os.environ.get('NCBI_API_KEY', '56a5fd6220202c3f7c1c924f215c4188b708')

    if len(pmid_list) == 0:
        return []

    sub_query = ''.join(id_prefix + str(pmid) + id_suffix for pmid in pmid_list)
    url = base + elink + db + sub_query + api
    # A timeout keeps one stalled request from hanging the batch loop.
    response = requests.get(url, timeout=60)

    result = []
    for link_set in re.split('<LinkSet>', response.text)[1:]:
        if '<LinkSetDb>' not in link_set:
            result.append([])
        else:
            # Capture the whole numeric ID of each <Link>. Taking a fixed 8
            # characters would append a stray '<' to shorter PMIDs and
            # truncate longer ones. The header <IdList> IDs are not preceded
            # by <Link>, so they are not matched.
            result.append(re.findall(r'<Link>\s*<Id>(\d+)</Id>', link_set))
    return result

In [None]:
#From subsetted data, query the Entrez linking database to get number of citations per paper
pmid_list = list(df_subset.pmid.values)
batch_size = 100  #PMIDs sent per get_pmid_links request
link_list = []
#Walk the whole list in batches, whatever its length, instead of assuming a
#fixed number of full batches plus a hard-coded remainder.
for start in range(0, len(pmid_list), batch_size):
    if start > 0:
        time.sleep(1)  #pause between consecutive requests
    link_list.append(get_pmid_links(pmid_list[start:start + batch_size]))

#Flatten the per-batch results into one list with one entry per paper
link_list_tmp = [links for batch in link_list for links in batch]

#Each paper must have exactly one entry; otherwise (e.g. an error response for
#one batch) citations would be attached to the wrong rows.
if len(link_list_tmp) != len(df_subset):
    raise ValueError(
        'Expected %d link sets but received %d; a batch request probably failed'
        % (len(df_subset), len(link_list_tmp))
    )

df_subset['pmid_links'] = pd.Series(link_list_tmp, index=df_subset.index)
df_subset['pmid_links_num'] = df_subset.pmid_links.apply(len)

In [None]:
def get_total_downloads(paper):
    """Sum the integer found at position 1 of every row of a parsed downloads table.

    ``paper`` is a list of rows, each a list of cell strings. When the first
    row is [''] (what clean_downloads produces for an empty string) the
    total is 0.
    """
    if paper[0] == ['']:
        return 0
    return sum(int(row[1]) for row in paper)
# Per-paper download total, computed from the parsed downloads table of each row.
df_subset['total_downloads'] = df_subset['downloads'].apply(get_total_downloads)

In [None]:
# Write the single-hit subset (with the columns added above) to biorxiv-cleaned.csv.
# Default to_csv options are used, so the row index is written as the first column.
df_subset.to_csv('biorxiv-cleaned.csv')

In [3]:
df

Unnamed: 0,affiliation_nums,affiliation_text,author_names,date_posted,doi,downloads,paper_title,subject_area
0,1,"\n\t\t\t\t, University of Southern California\...","James H Joly,Brandon TL Chew,Nicholas Alexande...","\n Posted October 07, 2020.",https://doi.org/10.1101/2020.10.05.327429,"<tr class=""odd""><td>Oct 2020</td><td>500</td><...",The landscape of metabolic pathway dependencie...,"Cancer Biology,\n"
1,"1,3,4,1,2,3,2,3,2,3,2,2,3,2,3,2,3,1,3,*",", Glasgow G61 1QH, United Kingdom,, Glasgow G6...","Mate Naszai,Yachuan Yu,Alvaro R Fernandez,Emma...","\n Posted October 08, 2020.",https://doi.org/10.1101/2020.10.07.329607,"<tr class=""odd""><td>Oct 2020</td><td>343</td><...",RAL GTPases mediate EGFR/MAPK signalling-drive...,"Cancer Biology,\n"
2,1423322,"\n\t\t\t\t, H. Lee Moffitt Cancer Center & Res...","Jeffrey West,Maximilian Strobl,Cole Armagost,R...","\n Posted October 09, 2020.",https://doi.org/10.1101/2020.10.08.331678,"<tr class=""odd""><td>Oct 2020</td><td>280</td><...",Antifragile therapy,"Cancer Biology,\n"
3,"1,10,1,1,1,1,2,1,1,1,3,3,4,4,4,5,1,3,3,3,3,3,1...","\n\t\t\t\t, University of Cambridge;\n\t\t\t,\...","Filipe Correia Martins,Dominique-Laurent Coutu...","\n Posted October 12, 2020.",https://doi.org/10.1101/2020.10.04.325365,"<tr class=""odd""><td>Oct 2020</td><td>768</td><...",Somatic chromosomal number alterations affecti...,"Cancer Biology,\n"
4,123145467198,"\n\t\t\t\t, Helen Diller Comprehensive Cancer ...","Andrew L Wolfe,Qingwen Zhou,Eneda Toska,Jacque...","\n Posted October 13, 2020.",https://doi.org/10.1101/2020.10.13.337998,"<tr class=""odd""><td>Oct 2020</td><td>353</td><...","UDP-glucose pyrophosphorylase 2, a regulator o...","Cancer Biology,\n"
...,...,...,...,...,...,...,...,...
3634,"1,+,#,1,2,3,4,2,5,1,*",", Tucson, Arizona,, Tucson, Arizona,, Tucson, ...","Brenna A. Rheinheimer,Ronald L. Heimark,Adam D...","\n Posted September 30, 2020.",https://doi.org/10.1101/2020.09.29.318873,"<tr class=""odd""><td>Sep 2020</td><td>194</td><...","Cell intrinsic signaling in , mutant pancreati...","Cancer Biology,\n"
3635,"1,2,1,1,1,3,4,1,1,1,1,1,*,5,6,*,1,*",,"Lei Huang,Xiao-Ou Zhang,Odette Verdejo-Torres,...","\n Posted September 30, 2020.",https://doi.org/10.1101/2020.08.12.246660,"<tr class=""odd""><td>Aug 2020</td><td>703</td><...",Protein arginine methyltransferase 5 promotes ...,"Cancer Biology,\n"
3636,"1,5,2,5,2,5,2,1,5,3,1,4,1,5,*",", Houston TX,, Houston TX,, Houston TX,, Houst...","Emilly S Villodre,Yun Gong,Lei Huo,Esther C Yo...","\n Posted September 28, 2020.",https://doi.org/10.1101/2020.09.25.313817,"<tr class=""odd""><td>Sep 2020</td><td>372</td><...",NDRG1 expression is an independent prognostic ...,"Cancer Biology,\n"
3637,"1,#,1,#,2,1,3,3,3,3,3,*,1,*",", Beijing 100037, China,, Chengdu 610041, Chin...","Xuanyu Liu,Meng Yuan,Qinqin Xiang,Wen Chen,Zhu...","\n Posted September 28, 2020.",https://doi.org/10.1101/2020.09.27.315911,"<tr class=""odd""><td>Sep 2020</td><td>248</td><...",Single-cell RNA-seq of the stromal vascular fr...,"Cancer Biology,\n"
