# HPO ID Mapping

In [1]:
import requests
import pandas as pd
from pprint import pprint

In [2]:
# Load the HPO publication table whose identifier mapping is still incomplete.
hpo = pd.read_csv(filepath_or_buffer='HPO.csv')

In [3]:
# Preview the first ten rows of the loaded table.
hpo.head(n=10)

Unnamed: 0,PMC,DOI,PMID
0,PMC5639780,10.1186/s12859-017-1854-y,
1,PMC4422517,10.1186/s13073-015-0151-5,
2,PMC3965098,10.1093/nar/gkt1026,
3,PMC4117966,10.1186/1471-2105-15-248,
4,PMC4321842,10.1371/journal.pone.0115692,
5,PMC5210535,10.1093/nar/gkw1039,
6,PMC5635572,10.1186/s12859-017-1858-7,
7,PMC4572507,10.1016/j.ajhg.2015.05.020,
8,PMC4722686,10.12688/f1000research.6670.1,
9,PMC4343077,10.1093/database/bav005,


In [4]:
import xml.etree.ElementTree as ET
def get_ids(doi, timeout=30):
    """
    Look up a publication's identifiers with the NCBI PMC ID Converter API.

    args:
        doi: identifier to convert, sent unchanged as the ``ids`` query
            parameter. This notebook passes DOIs; the original note on this
            function lists pmid and pmcid as accepted inputs as well.
        timeout: seconds to wait for the HTTP response before giving up, so
            a stalled connection cannot hang the notebook.

    returns:
        dict with the attributes of the first ``<record>`` element of the
        XML response (in this notebook: 'pmcid', 'pmid', 'doi',
        'requested-id'). The keys are whatever the service sent back, so
        callers must check that the ones they need are present.

    raises:
        requests.HTTPError: the service answered with an error status code.
        xml.etree.ElementTree.ParseError: the response body is not valid XML.
        LookupError: the response contains no ``<record>`` element.
    """
    map_ids = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/'
    mi_params = {'tool': 'my_tool',
                 'ids': doi
                }
    response = requests.get(url=map_ids, params=mi_params, timeout=timeout)
    # Fail on HTTP errors here instead of trying to parse an error page.
    response.raise_for_status()
    root = ET.fromstring(response.text)
    record = root.find('record')
    if record is None:
        # Previously this surfaced as an AttributeError on ``None.attrib``.
        raise LookupError('no <record> returned for id {!r}'.format(doi))
    # Return a plain copy so callers can add keys without touching the tree.
    return dict(record.attrib)
    
def get_date(ext_id, timeout=30):
    """
    Return the publication year of a PubMed record via NCBI ESummary.

    args:
        ext_id: PubMed ID (PMID) of the record, as a str or an int.
        timeout: seconds to wait for the HTTP response before giving up.

    returns:
        str: the part of the record's 'sortpubdate' field before the first
        '/', i.e. the year (for example '2017').

    raises:
        requests.HTTPError: the service answered with an error status code.
        KeyError: the JSON response has no 'result' entry for this id, or
            the entry has no 'sortpubdate' field.
    """
    base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    # JSON object keys are always strings, so normalise the id once and use
    # the same value for the request and for the lookup in the response.
    uid = str(ext_id)
    params = {
        'db': 'pubmed',
        'id': uid,
        'retmode': 'json'
    }
    response = requests.get(url=base_url, params=params, timeout=timeout)
    response.raise_for_status()
    summary = response.json()['result'][uid]
    return summary['sortpubdate'].split('/')[0]

In [13]:
# Only the first MAX_ROWS rows of `hpo` are resolved against the NCBI
# services; raise the limit to map more of the table.
MAX_ROWS = 12

results = []
for _, row in hpo.head(MAX_ROWS).iterrows():
    # Start from the values already in the table. Columns are addressed by
    # label because positional integer access on a string-labelled Series
    # is deprecated in pandas.
    row_dict = {
        'pmcid': row['PMC'],
        'doi': row['DOI'],
        'pmid': row['PMID'],
        'pubyear': None,
        'requested-id': None
    }
    try:
        ids = get_ids(row_dict['doi'])
        ids['pubyear'] = get_date(ids['pmid'])
        # Merge only after both lookups succeeded, so a failed row keeps
        # exactly the values read from the table.
        row_dict.update(ids)
        print('{}  found'.format(row_dict['doi']))
    except Exception as e:
        # Best effort: keep the unmapped row, but report why the lookup
        # failed instead of discarding the error.
        print('{}  not_found ({}: {})'.format(
            row_dict['doi'], type(e).__name__, e))
    # Every processed row is recorded, mapped or not.
    results.append(row_dict)

10.1186/s12859-017-1854-y  found
10.1186/s13073-015-0151-5  found
10.1093/nar/gkt1026  found
10.1186/1471-2105-15-248  found
10.1371/journal.pone.0115692  found
10.1093/nar/gkw1039  found
10.1186/s12859-017-1858-7  found
10.1016/j.ajhg.2015.05.020  found
10.12688/f1000research.6670.1  found
10.1093/database/bav005  found
10.1016/j.ajhg.2008.09.017  found
10.1186/s13073-016-0261-8  found


In [17]:
# Turn the collected per-publication dicts into a table and save it as CSV.
newFrame = pd.DataFrame(results)
newFrame.to_csv(path_or_buf='mapped_ids.csv')

In [18]:
# Display the table of mapped identifiers built from `results`.
newFrame

Unnamed: 0,doi,pmcid,pmid,pubyear,requested-id
0,10.1186/s12859-017-1854-y,PMC5639780,29025394,2017,10.1186/S12859-017-1854-Y
1,10.1186/s13073-015-0151-5,PMC4422517,25949529,2015,10.1186/S13073-015-0151-5
2,10.1093/nar/gkt1026,PMC3965098,24217912,2014,10.1093/NAR/GKT1026
3,10.1186/1471-2105-15-248,PMC4117966,25047600,2014,10.1186/1471-2105-15-248
4,10.1371/journal.pone.0115692,PMC4321842,25664462,2015,10.1371/JOURNAL.PONE.0115692
5,10.1093/nar/gkw1039,PMC5210535,27899602,2017,10.1093/NAR/GKW1039
6,10.1186/s12859-017-1858-7,PMC5635572,29017443,2017,10.1186/S12859-017-1858-7
7,10.1016/j.ajhg.2015.05.020,PMC4572507,26119816,2015,10.1016/J.AJHG.2015.05.020
8,10.12688/f1000research.6670.1,PMC4722686,26834980,2015,10.12688/F1000RESEARCH.6670.1
9,10.1093/database/bav005,PMC4343077,25725061,2015,10.1093/DATABASE/BAV005
