# HPO ID Mapping

In [None]:
import requests
import pandas as pd
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

In [3]:
def get_pub_data(ext_id, timeout=30):
    """
    Request publication data from the Europe PMC REST search API.

    arg: ext_id = identifier to search for (pmid, pmcid, doi)
    arg: timeout = seconds to wait for the server before giving up
    return: the first entry of the response's resultList.result, or None
            when the request fails or the search has no hits (the reason
            is printed so a batch run can continue)
    """
    base_url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search'
    params = {
        'query': ext_id,
        'resulttype': 'core',
        'format': 'json'
    }
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        response = requests.get(url=base_url, params=params, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as e:
        # Network failure, HTTP error status, or a body that is not JSON.
        print(e)
        return None

    try:
        return payload['resultList']['result'][0]
    except (KeyError, IndexError, TypeError):
        # Well-formed response that simply contains no matching record.
        print('no Europe PMC result for {!r}'.format(ext_id))
        return None

In [4]:
# Load the HPO publication mapping from HPO.csv (relative path, so the file is
# expected in the working directory). The mapping is incomplete: the missing
# identifiers are looked up from Europe PMC further down.
hpo = pd.read_csv('HPO.csv')

In [5]:
# preview the first rows of the loaded mapping
hpo.head()

Unnamed: 0,PMC,DOI,PMID
0,PMC5639780,10.1186/s12859-017-1854-y,
1,PMC4422517,10.1186/s13073-015-0151-5,
2,PMC3965098,10.1093/nar/gkt1026,
3,PMC4117966,10.1186/1471-2105-15-248,
4,PMC4321842,10.1371/journal.pone.0115692,


In [51]:
def key_check(key, obj):
    """
    Return the value stored under ``key`` in ``obj``, or None if absent.

    arg: key = key to look up
    arg: obj = mapping to search; may be None (e.g. a failed lookup), in
               which case None is returned instead of raising
    """
    if obj is None:
        return None
    return obj.get(key)

In [45]:
# pmid and pubYear
# Look up the first 100 publications in Europe PMC and collect their PMID and
# publication year next to the identifiers already present in `hpo`.
results = []
for _, row in hpo.head(100).iterrows():
    # .iloc makes the positional access explicit; plain row[0] on a
    # label-indexed Series relies on a deprecated fallback.
    pmcid = row.iloc[0]  # PMC
    doi = row.iloc[1]  # doi

    # fetch pub data from euroPMC; a failed lookup returns None, so fall back
    # to an empty record and keep the row with missing pmid/pubYear.
    pub = get_pub_data(pmcid) or {}
    results.append(
        [
            pmcid,
            doi,
            key_check('pmid', pub),
            key_check('pubYear', pub),
        ]
    )
# create new dataframe with the original identifiers plus the looked-up fields
newFrame = pd.DataFrame(data=results, columns=['pmcid', 'doi', 'pmid', 'pubYear'])

In [52]:
# Save the mapped publications as comma-separated values. index=False is not
# passed, so the DataFrame index is written as an unnamed first column.
newFrame.to_csv('mapped_pubs.csv', sep=',')

In [47]:
# number of mapped publications per publication year
years = newFrame['pubYear'].value_counts()

In [54]:
# example plot of number of pubs per year
# Built with plotly's own graph objects: Series.iplot only exists after
# cufflinks has been imported, which this notebook never does.
import plotly.graph_objs as go

fig = go.Figure(
    data=[go.Bar(x=years.index.tolist(), y=years.tolist())],
    layout=go.Layout(
        title='Mapped Publications',
        xaxis=dict(title='Dates'),
        yaxis=dict(title='Publications'),
    ),
)
iplot(fig)