# HPO ID Mapping

In [1]:
import requests
import pandas as pd

In [2]:
exPMCID = 'PMC5210586'

In [3]:
def get_pub_data(ext_id):
    """
    make a request to europmc api for publication data based on identifer
    arg: ext_id = pmid, pmcid, doi
    """
    base_url = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search'
    params = {
        'query': ext_id,
        'resulttype': 'core',
        'format': 'json'
    }
    try:
        results = requests.get(url=base_url, params=params).json()
        return results['resultList']['result'][0]
    except Exception as e:
        print(e)
        return None

In [8]:
# import the incomplete hpo mapping data
hpo = pd.read_csv('HPO.csv')

In [10]:
hpo.head()

Unnamed: 0,PMC,DOI,PMID
0,PMC5639780,10.1186/s12859-017-1854-y,
1,PMC4422517,10.1186/s13073-015-0151-5,
2,PMC3965098,10.1093/nar/gkt1026,
3,PMC4117966,10.1186/1471-2105-15-248,
4,PMC4321842,10.1371/journal.pone.0115692,


In [6]:
# pmid and pubYear
results = list()
for index, row in hpo.iterrows():
    # limiting to 10 rows for testing
    if index < 10:
        pub = get_pub_data(row[0])
        results.append([row[0], row[1], pub['pmid'], pub['pubYear']])
newFrame = pd.DataFrame(data=results, columns=['pmid', 'doi', 'pmcid', 'pubyear'])

In [7]:
newFrame

Unnamed: 0,pmid,doi,pmcid,pubyear
0,PMC5639780,10.1186/s12859-017-1854-y,29025394,2017
1,PMC4422517,10.1186/s13073-015-0151-5,25949529,2015
2,PMC3965098,10.1093/nar/gkt1026,24217912,2014
3,PMC4117966,10.1186/1471-2105-15-248,25047600,2014
4,PMC4321842,10.1371/journal.pone.0115692,25664462,2015
5,PMC5210535,10.1093/nar/gkw1039,27899602,2017
6,PMC5635572,10.1186/s12859-017-1858-7,29017443,2017
7,PMC4572507,10.1016/j.ajhg.2015.05.020,26119816,2015
8,PMC4722686,10.12688/f1000research.6670.1,26834980,2015
9,PMC4343077,10.1093/database/bav005,25725061,2015


In [11]:
newFrame.to_csv('mapped_hpo_pub_ids.csv', sep=',')