# HPO ID Mapping

In [60]:
import requests
import pandas as pd
import json
from pprint import pprint

In [61]:
# import the list of HPO DOIs
queryIds = pd.read_csv('missing_dois_for_lookup.csv')

In [62]:
queryIds.head()

Unnamed: 0,doi
0,10.1080/15374416.2014.945214
1,10.1016/j.jbi.2015.06.026
2,10.1007/s10803-017-3030-7
3,10.1016/j.psychres.2015.11.012
4,10.4018/978-1-5225-2492-2.ch006


## Method definitions

In [107]:
import xml.etree.ElementTree as ET

problemUrls = list()

def printProbUrls():
    pprint(problemUrls)

# check to see if object exists before returning it
def key_check(key, obj):
    if key in obj.keys():
        return str(obj[key])
    else:
        return None
    
def eutils_pmc_search(phrase):
    print(phrase)
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
    params = {
        'retmode':'json',
        'db':'pmc',
        'term':phrase,
        'retmax':1000
    }
    composedUrl = None

    try:
        results = requests.get(url=base_url, params=params)
        composedUrl = results.url
        resultsjson = results.json()
        allresults = resultsjson['esearchresult']['idlist']
        pprint(allresults)
        return allresults
    except Exception as e:
        print(e)
        print(composedUrl)
        problemUrls.append(composedUrl)
        return None

def epmc_id_lookup(pubid):
    """
    make a request to europmc api for publication data based on identifer
    arg: ext_id = pmid, pmcid, doi
    """
    base_url = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search'
    params = {
        'query':pubid,
        'resulttype':'core',
        'format': 'json'
    }
    
    try:
        results = requests.get(url=base_url, params=params)
#         print (results.url)
        resultsjson = results.json()
        allresults = resultsjson['resultList']['result']
        
        # if they don't have this ID (if this happens it is usually because of a DOI)
        if(len(allresults)==0):
            return None
        
        # else test if this is a DOI
        else:
            if("10." in pubid):
                """
                searching for PMIDs and PMCIDs gets you a single definitive result; for some reason
                ...searching by DOI gets several results, so iterate and find the one that matches the input ID  
                """
                found = False

                for aresult in allresults:
                    if "doi" in aresult.keys():
                        if (aresult['doi']) == pubid:
                            found = True
                            return aresult
                        # else queried ID is not a DOI but a PMCID or PMID,
                        # ergo the stanza we want is just at index 0
                # if we've iterated through all responses and haven't found a match
                if found==False:
                    print("Exact match not found "+results.url)
                #    pprint(allresults)
            else:
                print ("Warning, searched for "+pubid+ ", not a DOI")
                return allresults[0]

    except Exception as e:
        print("ERROR PARSING: "+ url + " at ")
        print(e)
        return None

def get_pub_yr_crossref(doi):
    """
    when the DOI has no corresponding pmid or pmcid, make a request to crossref api for pubyear
    """
    base_url = 'https://api.crossref.org/works/'
    url = base_url+doi

    try:
        results = requests.get(url).json()
#         pprint(results['message']["created"]["date-parts"][0][0])
 
        if (str(results)=="<Response [404]>"):
            print("NO RESULT: "+url)
            return None
        
        pubyr = None
#         if(results == "Resource not found.") return pubyr
    
        if "published-online" in results['message'].keys():
            pubyr = results['message']["published-online"]["date-parts"][0][0]
#             print("online pubyr: "+str(pubyr))
        else: 
            if "published-print" in results['message'].keys():
                pubyr = results['message']["published-print"]["date-parts"][0][0]  
            else: 
                if "created" in results['message'].keys():
                    pubyr = results['message']["created"]["date-parts"][0][0]   
        print(pubyr)
        return pubyr
    
    except Exception as f:
        print("ERROR PARSING: "+ url)
#         pprint(results)
        print(f)
        return None

## Main

In [115]:
print("Started ...")
problemUrls = list()
results = list()
for index, row in queryIds.iterrows():
    row_dict = {
        'pmcid': None,
        'doi': pubid,
        'pmid': None,
        'pubyear': None,
    }
    if index < 5000:
        if index%5==0:
            print(index)
        # to search by other id use row_dict['pmid]
        pubid = row[0]
        response = epmc_id_lookup(pubid)
        
        if response!=None:
            row_dict['doi'] = pubid #todo fix
            row_dict['pmcid'] = key_check('pmcid',response)
            row_dict['pmid'] = key_check('id',response)
            row_dict['pubyear'] = key_check('pubYear',response)
        else:
#           check to see if it is a DOI
            if ("10." in pubid):
                try: 
                    row_dict['pubyear'] = get_pub_yr_crossref(pubid)
                except Exception as f:
                    print("Failed at crossref: ")
                    print(f)
            else:
                print("Requested "+ pubid + " but this call can only be used with a DOI")
#       append the results, whether from epmc or crossref  
#         pprint(row_dict)   
        results.append(row_dict)
    
printProbUrls()


Started ...
0
5
10
15
20
Exact match not found http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=10.1186%2F2041-1480-4-42&resulttype=core&format=json
25
Exact match not found http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=10.1186%2F1752-0509-8-68&resulttype=core&format=json
Exact match not found http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=10.1186%2F2041-1480-5-23&resulttype=core&format=json
30
Exact match not found http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=10.1186%2F2041-1480-5-S1-S3&resulttype=core&format=json
35
40
45
Exact match not found http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=10.1186%2F1755-8794-8-S2-S9&resulttype=core&format=json
50
55
Exact match not found http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=10.1128%2FmBio.01263-15&resulttype=core&format=json
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
ERROR PARSING: https://api.crossref.org/works/10.3233/978-1-61499-753-5-131
Expec

520
525
ERROR PARSING: https://api.crossref.org/works/10.3233/978-1-60750-949-3-1033
Expecting value: line 1 column 1 (char 0)
ERROR PARSING: https://api.crossref.org/works/10.3233/978-1-60750-949-3-449
Expecting value: line 1 column 1 (char 0)
ERROR PARSING: https://api.crossref.org/works/10.3233/978-1-60750-949-3-74
Expecting value: line 1 column 1 (char 0)
530
ERROR PARSING: https://api.crossref.org/works/10.3233/978-1-60750-949-3-763
Expecting value: line 1 column 1 (char 0)
Exact match not found http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=10.1001%2Fjama.271.14.1103&resulttype=core&format=json
ERROR PARSING: https://api.crossref.org/works/10.1002/(SICI)1097-4571(199105)42:4&lt;297::AID-ASI6&gt;3.0.CO;2-M
Expecting value: line 1 column 1 (char 0)
535
ERROR PARSING: https://api.crossref.org/works/10.1002/(SICI)1097-4571(199506)46:5&lt;348::AID-ASI6&gt;3.0.CO;2-1
Expecting value: line 1 column 1 (char 0)
Exact match not found http://www.ebi.ac.uk/europepmc/webservices

In [116]:
newFrame = pd.DataFrame(data=results)
newFrame.to_csv('mapped_ids.csv')

In [None]:
newFrame