In [480]:
import pandas as pd
import re
import requests
import time

In [481]:
df = pd.read_csv("biorxiv.csv")

In [482]:
#Affiliatin_nums: Replace NAs with 1, for single affiliation.
#Affiliation_nums: Split into list
df.affiliation_nums = df.affiliation_nums.fillna('1')
df.affiliation_nums = df.affiliation_nums.apply(str.split,sep=',')

In [483]:
#Affiliation_text: Replace NAs with empty string
#Run clean_affiliation, to 
def clean_affiliation(x):
    x = x.split(';')
    for i in range(len(x)):
        x[i] = re.sub('(\\n)(\\t)+','',x[i])
        x[i] = x[i].strip()
        x[i] = re.sub('^,+','',x[i])
        x[i] = x[i].strip()
    return(x)
df.affiliation_text = df.affiliation_text.fillna('')
df.affiliation_text = df.affiliation_text.apply(clean_affiliation)

In [484]:
#Cleaning number of downloads
def clean_downloads(x):
    x = x.split(',')
    x = map(lambda y: re.sub('<tr class="odd">','',y),x)
    x = map(lambda y: re.sub('<tr class="even">','',y),x)
    x = map(lambda y: re.sub(' </tr>','',y),x)
    x = list(map(lambda y: y.split('</td>'),x))
    for i in range(len(x)):
        x[i] = list(map(lambda y: re.sub('^<td>','',y),x[i]))
    return x
df.downloads = df.downloads.fillna('')
df.downloads = df.downloads.apply(clean_downloads)

In [485]:
def get_pubmed_id(authors):
    base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    esearch= 'esearch.fcgi?'
    db= 'db=pubmed&'
    field= 'field=author&'
    term_prefix = 'term='
    and_syntax = '+AND+'
    term_suffix = '&'
    api = 'api_key=56a5fd6220202c3f7c1c924f215c4188b708'
    
    if authors=='':
        return "No authors"
    authors = authors.lower()
    authors = authors.split(',')
    
    if len(authors) > 2:
        term = re.sub(' ','+',authors[0])+and_syntax+re.sub(' ','+',authors[1])+and_syntax+re.sub(' ','+',authors[2])
    elif len(authors) > 1:
        term = re.sub(' ','+',authors[0])+and_syntax+re.sub(' ','+',authors[1])
    else:
        term = re.sub(' ','+',authors[0])
        
    url = base+esearch+db+field+term_prefix+term+term_suffix+api
    
    response=requests.get(url)
    
    return response.text

In [486]:
def parse_pmid(xml):
    if xml=="No authors":
        return [xml]
    test_if_error =  re.search('error',xml[:9])
    if (test_if_error!=None):
        return [xml]
    tmp = re.split('<Count>',xml)
    tmp = re.split('</Count>',tmp[1])
    count = int(tmp[0])
    if count ==0:
        return [count,""]
    else:
        tmp = re.split('<IdList>',xml)
        tmp = re.split('</IdList>',tmp[1])
        tmp = re.split('</Id>',tmp[0])
        tmp = map(lambda x: re.sub('<Id>','',x),tmp)
        tmp = list(map(lambda x: re.sub('\n','',x),tmp))
        return [count,*tmp]

In [None]:
#Obtain authors from the data frame and query Entrez for PMIDs
author_list = list(df.author_names.fillna('').values)
pmid_result = []
for i in range(len(author_list)):
    tmp = get_pubmed_id(author_list[i])
    pmid_result.append(tmp)
    time.sleep(0.2)
    
#Optionally write result to CSV
#tmp = pd.Series(pmid_result)
#tmp.to_csv('pmids.csv')


In [487]:
#Parse raw PMID returns into count of hits, and PMID numbers. Add as new column to df
parsed_pmid = list(map(parse_pmid,pmid_result))
df['pmid']=pd.Series(parsed_pmid)

#Subset the data on just results that have one hit for a PMID, meaning a one to one mapping of authors to PMID
#Assign values to dataframe called df_subset
subset_vec = []
for i in range(len(df.pmid)):
    if len(df.pmid[i])==3:
        subset_vec.append(True)
    else:
        subset_vec.append(False)
subset_indices = df.pmid[subset_vec].index
df_subset = df.iloc[subset_indices]
df_subset.index = range(1432)
df_subset.pmid = df_subset.pmid.apply(lambda x: x[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [488]:
def get_pmid_links(pmid_list):
    
    base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    elink= 'elink.fcgi?'
    db= 'dbfrom=pubmed&linkname=pubmed_pubmed_citedin&'
    id_prefix = 'id='
    id_suffix = '&'
    api = 'api_key=56a5fd6220202c3f7c1c924f215c4188b708'
    
    result = []
    
    sub_query = ''
    for i in range(len(pmid_list)):
        sub_query = sub_query + id_prefix + pmid_list[i] + id_suffix
    url = base+elink+db+sub_query+api
    response=requests.get(url)
    queries = re.split('<LinkSet>',response.text)[1:]
    for i in range(len(queries)):
        if re.search('<LinkSetDb>',queries[i])==None:
            result.append([])
        else:
            links = re.split('<Link>\n\t\t\t\t<Id>',queries[i])[1:]
            links = list(map(lambda x: x[:8],links))
            result.append(links)
    return result

In [493]:
#From subsetted data, query the Entrez linking database to get number of citations per paper
pmid_list = list(df_subset.pmid.values)
link_list = []
for i in range(14):
    j = 0 + (100*i)
    k = 100 + (100*i)
    link_list.append(get_pmid_links(pmid_list[j:k]))
    time.sleep(1)
link_list.append(get_pmid_links(pmid_list[1400:1432]))
link_list_tmp =[]
for i in range(len(link_list)):
    for j in range(len(link_list[i])):
        link_list_tmp.append(link_list[i][j])
df_subset['pmid_links'] = pd.Series(link_list_tmp)
df_subset['pmid_links_num'] = df_subset.pmid_links.apply(len)

In [508]:
df_subset

Unnamed: 0,affiliation_nums,affiliation_text,author_names,date_posted,doi,downloads,paper_title,subject_area,pmid,pmid_links,pmid_links_num
0,"[1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 3, 1, 2, 1, ...","[Yale University, Central South University, Ya...","Meiling Zhang,Zongzhi Liu,Keisuke Aoshima,Yang...","\n Posted October 23, 2020.",https://doi.org/10.1101/2020.09.10.291799,"[[Sep 2020, 218, 29, 45, ], [Oct 2020, 227, 19...",CECR2 Drives Breast Cancer Metastasis by Suppr...,"Cancer Biology,\n",31690671,[],0
1,"[1, 3, 2]","[shanghaiTech University, ShanghaiTech Univers...","Tian Chi,Mingming Jia","\n Posted October 27, 2020.",https://doi.org/10.1101/2020.10.26.356220,"[[Oct 2020, 222, 37, ]]",Sex-specific predictive values of biomarkers f...,"Cancer Biology,\n",32154170,[32449246],1
2,"[1, 1, 2, 3, 4, 2, 1, 5, *]","[Kanpur, 208016, U.P., INDIA,, Detroit, MI 482...","Sakshi Goel,Vipul Bhatia,Shannon Carskadon,Nil...","\n Posted August 29, 2020.",https://doi.org/10.1101/2020.08.28.271916,"[[Aug 2020, 552, 8, 116, ], [Sep 2020, 362, 41...",Transcriptional network involving ERG and AR o...,"Cancer Biology,\n",30587549,"[33043165, 32896760, 32266486, 32126023, 31977...",9
3,"[1, 2, 3, 4, 1, 1, *]","[Tampa, FL, USA,, Oxford, UK,, Birmingham, AL,...","Jacob G. Scott,Anita B. Hjelmeland,Prakash Chi...","\n Posted November 07, 2013.",https://doi.org/10.1101/000141,"[[Nov 2013, 248, 0, 27, ], [Dec 2013, 140, 0, ...",Microenvironmental variables must influence in...,"Cancer Biology,\n",24453958,"[30992437, 29392399, 29351275, 28669884, 28508...",18
4,"[1, 2, 3]","[CUNY, The Graduate Center, New York, 365 Fift...","Loes Olde Loohuis,Andreas Witzel,Bud Mishra","\n Posted January 02, 2014.",https://doi.org/10.1101/001651,"[[Jan 2014, 171, 0, 99, ], [Feb 2014, 37, 0, 2...",Power-law Null Model for Bystander Mutations i...,"Cancer Biology,\n",26357061,[],0
...,...,...,...,...,...,...,...,...,...,...,...
1427,"[1, 2, #, 1, #, 1, 1, 2, 3, 4, 6, 5, 5, 6, 1, *]","[Atlanta, GA USA,, Atlanta, GA, USA,, Atlanta,...","Briana Rackley,Chang-Soo Seong,Evan Kiely,Rebe...","\n Posted September 25, 2020.",https://doi.org/10.1101/2020.09.25.308080,"[[Sep 2020, 437, 19, 58, ], [Oct 2020, 121, 66...",The level of oncogenic Ras controls the malign...,"Cancer Biology,\n",32550498,[],0
1428,"[1, 1, 1, 1, 1, 1, 2, 3, 1, 3, *]","[Richmond, VA, 23298,, Richmond VA 23298,, Ric...","Christian T Fontan,Dipon Das,Molly L Bristol,C...","\n Posted September 26, 2020.",https://doi.org/10.1101/2020.09.25.314484,"[[Sep 2020, 363, 10, 41, ], [Oct 2020, 101, 11...",Human papillomavirus 16 E2 repression of TWIST...,"Cancer Biology,\n",31915229,"[33066318, 32938703, 32899142, 32545729, 32316...",6
1429,"[1, 1, 1, 1, 1, 1, 2, 1, 3, 4, 1, *]","[Columbus OH,, Ann Arbor, MI, 48109,, San Anto...","Safiya Khurshid,Matias Montes,Daniel F. Comisk...","\n Posted October 02, 2020.",https://doi.org/10.1101/2020.10.02.324053,"[[Oct 2020, 361, 18, 61, ]]",Splice-switching of the insulin receptor in rh...,"Cancer Biology,\n",31662450,[],0
1430,"[1, #, *, 15, 1, 2, #, 1, 3, 4, 5, 1, 1, 1, 1,...","[Marseille, France,, Paris, France,, La Tronch...","Sonia Brun,Eric Raymond,Firas Bassissi,Zuzana ...","\n Posted October 01, 2020.",https://doi.org/10.1101/2020.09.30.320010,"[[Oct 2020, 277, 52, 71, ]]","GNS561, a clinical-stage PPT1 inhibitor, has p...","Cancer Biology,\n",30778887,"[32728410, 32708687, 32705178, 32143356]",4
