In [21]:
import os
import re
import time

import pandas as pd
import requests

In [273]:
#Load biorxiv.csv into the working DataFrame; the cells below read its
#affiliation_nums, affiliation_text, downloads and author_names columns.
df = pd.read_csv("biorxiv.csv")

In [274]:
#affiliation_nums: a missing value is replaced with '1' (single affiliation),
#then every entry is split on commas into a list of strings.
df.affiliation_nums = df.affiliation_nums.fillna('1').apply(str.split, sep=',')

In [275]:
#Affiliation_text: Replace NAs with empty string
#Run clean_affiliation to split each entry on ';' and tidy up every piece
def clean_affiliation(x):
    """Split an affiliation string on ';' and tidy each piece.

    For every piece: newline-plus-tabs runs are removed, surrounding
    whitespace is stripped, leading commas are dropped, and the piece is
    stripped once more.

    Parameters
    ----------
    x : str
        Raw affiliation text, pieces separated by ';'.

    Returns
    -------
    list of str
        One cleaned string per piece (an empty input yields ``['']``).
    """
    cleaned = []
    for piece in x.split(';'):
        # Drop the "newline followed by tabs" runs before trimming.
        piece = re.sub('(\\n)(\\t)+', '', piece).strip()
        # Leading commas are only visible once the whitespace is gone.
        piece = re.sub('^,+', '', piece).strip()
        cleaned.append(piece)
    return cleaned
#Missing affiliation text becomes '' before every entry is split and tidied.
df.affiliation_text = df.affiliation_text.fillna('').apply(clean_affiliation)

In [276]:
#Cleaning number of downloads
def clean_downloads(x):
    """Turn a comma-separated string of HTML table rows into nested lists.

    The string is split on ','; the ``<tr class="odd">``,
    ``<tr class="even">`` and `` </tr>`` markers are removed from each row;
    the row is split on ``</td>`` and a leading ``<td>`` is dropped from
    every resulting cell.

    Parameters
    ----------
    x : str
        Raw downloads markup.

    Returns
    -------
    list of list of str
        One inner list per row. Because the row is split on ``</td>``, a row
        that ends with ``</td>`` produces a trailing empty string.
    """
    row_markers = ('<tr class="odd">', '<tr class="even">', ' </tr>')
    table = []
    for row in x.split(','):
        for marker in row_markers:
            row = re.sub(marker, '', row)
        table.append([re.sub('^<td>', '', cell) for cell in row.split('</td>')])
    return table
#Missing download markup becomes '' before every entry is parsed into rows.
df.downloads = df.downloads.fillna('').apply(clean_downloads)

In [305]:
def get_pubmed_id(authors, api_key=None, timeout=30):
    """Search PubMed (NCBI ESearch, author field) for the leading authors.

    Parameters
    ----------
    authors : str
        Comma-separated author names. Names are lower-cased and trimmed,
        blank names are ignored, and at most the first three remaining
        names are AND-ed together into the search term.
    api_key : str, optional
        NCBI API key. When omitted, the ``NCBI_API_KEY`` environment
        variable is used; if that is unset as well, the request is sent
        without a key. Keep the key out of the notebook itself.
    timeout : float, optional
        Seconds to wait for the server; ``requests`` raises a timeout
        exception afterwards instead of blocking forever.

    Returns
    -------
    str
        The raw response body, or the sentinel string ``"No authors"``
        when ``authors`` holds no usable name.
    """
    base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    esearch = 'esearch.fcgi'

    names = [name.strip() for name in authors.lower().split(',')]
    names = [name for name in names if name]
    if not names:
        return "No authors"

    # requests URL-encodes the values, so spaces, accents and reserved
    # characters in author names cannot corrupt the query string.
    params = {
        'db': 'pubmed',
        'field': 'author',
        'term': ' AND '.join(names[:3]),
    }

    if api_key is None:
        api_key = os.environ.get('NCBI_API_KEY')
    if api_key:
        params['api_key'] = api_key

    response = requests.get(base + esearch, params=params, timeout=timeout)

    return response.text

In [None]:
def parse_pmid(xml):
    """Extract the hit count and the ids from a raw ESearch response.

    Parameters
    ----------
    xml : str
        Text returned by ``get_pubmed_id``.

    Returns
    -------
    list
        * ``[xml]`` when ``xml`` is the ``"No authors"`` sentinel, when the
          first nine characters contain ``error``, or when the text cannot
          be parsed (no ``<Count>``/``<IdList>`` element, non-numeric count).
        * ``[0, ""]`` when the count is zero.
        * ``[count, id_1, ..., id_n, ""]`` otherwise. The trailing empty
          string comes from splitting on ``</Id>`` and is kept on purpose:
          a response with exactly one id therefore has length 3.
    """
    if xml == "No authors":
        return [xml]
    if re.search('error', xml[:9]) is not None:
        return [xml]

    # Only the first <Count> element is read.
    _, count_open, after_count = xml.partition('<Count>')
    if not count_open:
        # Empty body, HTML error page, ...: hand the raw text back instead
        # of raising and aborting the whole parsing pass.
        return [xml]
    try:
        count = int(after_count.partition('</Count>')[0])
    except ValueError:
        return [xml]

    if count == 0:
        return [count, ""]

    _, list_open, after_list = xml.partition('<IdList>')
    if not list_open:
        return [xml]
    id_block = after_list.partition('</IdList>')[0]
    ids = [
        chunk.replace('<Id>', '').replace('\n', '')
        for chunk in id_block.split('</Id>')
    ]
    return [count, *ids]

In [None]:
#Obtain authors from the data frame and query Entrez for PMIDs
#One HTTP request per row, with a 0.2 s pause between requests
#(presumably to respect the NCBI request-rate limit - confirm against your key's quota)
author_list = list(df.author_names.fillna('').values)
pmid_result = []
for authors in author_list:
    pmid_result.append(get_pubmed_id(authors))
    time.sleep(0.2)

#Optionally write result to CSV
#tmp = pd.Series(pmid_result)
#tmp.to_csv('pmids.csv')

#Parse raw PMID returns into count of hits, and PMID numbers. Add as new column to df
#The Series is built on df.index so rows are matched by position, whatever the index labels are
parsed_pmid = [parse_pmid(raw) for raw in pmid_result]
df['pmid'] = pd.Series(parsed_pmid, index=df.index)

#Subset the data on just results that have one hit for a PMID, meaning a one to one mapping of authors to PMID
#parse_pmid returns [count, id, ''] for exactly one hit, hence the length-3 test
#Assign values to dataframe called df_subset
subset_vec = [len(entry) == 3 for entry in parsed_pmid]
subset_indices = df.index[subset_vec]
#Boolean mask instead of feeding index labels to iloc, which is only correct for a default RangeIndex
df_subset = df.loc[subset_vec]