# HPO ID Mapping

This script takes a list of IDs of a single type (PMID, PMCID, or DOI), together with a specification of which type to expect, and returns a CSV of the corresponding mapped IDs (where available) along with the publication year. Format the input file with one ID per line: PMIDs as bare numerics such as 27899602, PMCIDs such as PMC5210535, and DOIs such as 10.1093/nar/gkw1039.

In [61]:
import requests
import pandas as pd
import json
from pprint import pprint

In [62]:
# Identifier type held in the input file: "doi", "pmid" or "pmcid".
inputType = "doi"
# Load the list of HPO DOIs that still need to be looked up.
queryIds = pd.read_csv(filepath_or_buffer='missing_dois_for_lookup.csv')

In [63]:
# Preview the first five rows to check the layout of the loaded table.
queryIds.head(n=5)

Unnamed: 0,doi
0,10.1001/jama.280.15.1325
1,10.1006/jbin.2002.1035
2,10.1007/978-1-4471-2801-4
3,10.1007/978-3-319-21843-4_3
4,10.1007/978-3-319-23344-4_15


## Method definitions

In [64]:
import xml.etree.ElementTree as ET

# Request URLs that failed; eutils_pmc_search() appends to this list.
problemUrls = []


def printProbUrls():
    """Pretty-print the contents of the module-level ``problemUrls`` list."""
    pprint(problemUrls)

def key_check(key, obj):
    """Return ``obj[key]`` converted to ``str``, or ``None`` when ``key`` is absent.

    A value that is present but ``None`` comes back as the string ``"None"``,
    because every value found is passed through ``str``.
    """
    if key not in obj:
        return None
    return str(obj[key])
    
def eutils_pmc_search(phrase, timeout=30):
    """
    Run an NCBI E-utilities esearch query against the pmc database.

    arg: phrase = search term, sent as the 'term' parameter
    arg: timeout = seconds to wait for the server before giving up
    returns: the 'idlist' from the JSON response (at most 'retmax' = 1000
    entries are requested), or None when the request, the HTTP status or the
    parsing failed. On failure the exception and the request URL are printed
    and the URL is appended to problemUrls (None is appended when the request
    failed before a response, and hence a composed URL, was available).
    """
    print(phrase)
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
    params = {
        'retmode': 'json',
        'db': 'pmc',
        'term': phrase,
        'retmax': 1000
    }
    composedUrl = None

    try:
        # Without a timeout a single stalled connection would block forever.
        results = requests.get(url=base_url, params=params, timeout=timeout)
        composedUrl = results.url
        # Report HTTP errors as such instead of as a KeyError further down.
        results.raise_for_status()
        allresults = results.json()['esearchresult']['idlist']
        pprint(allresults)
        return allresults
    except Exception as e:
        # Best-effort lookup: record the failure and let the caller carry on.
        print(e)
        print(composedUrl)
        problemUrls.append(composedUrl)
        return None

def epmc_id_lookup(pubid, timeout=30):
    """
    Make a request to the Europe PMC search API for publication data based on an identifier.

    arg: pubid = pmid, pmcid or doi
    arg: timeout = seconds to wait for the server before giving up
    returns: the matching result stanza from the JSON response, or None when
    the result list is empty, when a DOI query has no result whose 'doi'
    equals pubid, or when the request, HTTP status or parsing failed (the
    error is printed in that case).
    """
    base_url = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search'
    params = {
        'query': pubid,
        'resulttype': 'core',
        'format': 'json'
    }
    # Defined before the try block so the except handler can always report a
    # URL, even when the request failed before a response existed.
    request_url = base_url

    try:
        # Without a timeout a single stalled connection would block forever.
        results = requests.get(url=base_url, params=params, timeout=timeout)
        request_url = results.url
        results.raise_for_status()
        allresults = results.json()['resultList']['result']

        # they don't have this ID (if this happens it is usually because of a DOI)
        if not allresults:
            return None

        # Queried ID is not a DOI but a PMCID or PMID: searching for those
        # gets a single definitive result, so the stanza we want is at index 0.
        if "10." not in str(pubid):
            return allresults[0]

        # Searching by DOI gets several results, so iterate and find the one
        # that matches the input ID. DOIs are case-insensitive, hence the
        # comparison on lower-cased strings.
        wanted = str(pubid).lower()
        for aresult in allresults:
            if str(aresult.get('doi', '')).lower() == wanted:
                return aresult

        # iterated through all responses and haven't found a match
        print("Exact match not found " + results.url)
        return None

    except Exception as e:
        print("ERROR PARSING: " + request_url + " at ")
        print(e)
        return None

def get_pub_yr_crossref(doi, timeout=30):
    """
    When the DOI has no corresponding pmid or pmcid, make a request to the Crossref API for the publication year.

    arg: doi = the DOI to look up; it is appended to the works endpoint URL
    arg: timeout = seconds to wait for the server before giving up
    returns: the first element of the first "date-parts" entry of
    "published-online", else of "published-print", else of "created",
    whichever is present first in the response message; None when none of
    them is present, when Crossref answers 404, or when the request or the
    parsing failed (a message is printed in the last two cases).
    """
    base_url = 'https://api.crossref.org/works/'
    url = base_url + doi

    try:
        # Without a timeout a single stalled connection would block forever.
        response = requests.get(url, timeout=timeout)

        # The status must be read from the response object itself; after
        # .json() only the parsed body is left, which never carries it.
        if response.status_code == 404:
            print(doi + " not found in crossref")
            return None
        response.raise_for_status()

        message = response.json()['message']

        # Preference order: online publication, print publication, record creation.
        for date_field in ("published-online", "published-print", "created"):
            if date_field in message:
                return message[date_field]["date-parts"][0][0]
        return None

    except Exception as f:
        print("ERROR PARSING: " + url)
        print(f)
        return None

## Main

In [65]:
print("Started ...")
problemUrls = list()
results = list()

# Set to an int to process only the first N rows while testing; None means all rows.
maxRows = None

# Read the IDs from the column named after inputType when the file has one
# (e.g. a "doi" header); otherwise fall back to the first column.
idColumn = inputType if inputType in queryIds.columns else queryIds.columns[0]

for position, pubid in enumerate(queryIds[idColumn]):
    if maxRows is not None and position >= maxRows:
        break
    # progress indicator
    if position % 50 == 0:
        print(position)

    # The queried ID fills its own column; the other columns stay None unless
    # the Europe PMC response provides them.
    row_dict = {
        'doi': pubid if inputType == 'doi' else None,
        'pmid': pubid if inputType == 'pmid' else None,
        'pmcid': pubid if inputType == 'pmcid' else None,
        'pubyear': None
    }
    response = epmc_id_lookup(pubid)

    if response is not None:
        for idType in ('doi', 'pmid', 'pmcid'):
            if idType != inputType:
                row_dict[idType] = key_check(idType, response)
        row_dict['pubyear'] = key_check('pubYear', response)
    elif "10." in str(pubid):
        # Not found in Europe PMC but it is a DOI, so ask Crossref for the year.
        try:
            row_dict['pubyear'] = get_pub_yr_crossref(pubid)
        except Exception as f:
            print("Failed at crossref: ")
            print(f)
    else:
        print("Requested " + str(pubid) + " but this call can only be used with a DOI")

    # append the results, whether from epmc or crossref
    results.append(row_dict)

printProbUrls()


Started ...
0
[]


In [66]:
# Collect the per-ID dictionaries into a table and save it as CSV.
newFrame = pd.DataFrame(results)
newFrame.to_csv(path_or_buf='mapped_ids.csv')