# HPO PMCID -> ID Mapping

In [52]:
import requests
import pandas as pd
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

In [68]:
def get_pub_data(ext_id):
    """
    make a request to europmc api for publication data based on identifer
    arg: ext_id = pmid, pmcid, doi
    """
    base_url = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search'
    params = {
        'query': ext_id,
        'resulttype': 'core',
        'format': 'json'
    }
    try:
        results = requests.get(url=base_url, params=params).json()
        return results['resultList']['result'][0]
    except Exception as e:
        print(e)
        return None

In [69]:
# import the incomplete hpo mapping data
hpo = pd.read_csv('hpo_pmc.csv')

In [70]:
hpo.head()

Unnamed: 0,PMC5639780
0,PMC4422517
1,PMC3965098
2,PMC4117966
3,PMC4321842
4,PMC5210535


In [71]:
# return: value of key if it exists
def key_check(key, obj):
    if key in obj.keys():
        return obj[key]
    else:
        return None

In [72]:
# pmid and pubYear
results = list()

for index, row in hpo.iterrows():

#     if index < 10:
        # fetch pub data from euroPMC
        pub = get_pub_data(row[0])
        print(row[0])
        results.append(
            [
                row[0], # PMC
                key_check('doi', pub),
                key_check('pmid', pub), 
                key_check('pubYear', pub)
            ]
        )
# create new dataframe with 
newFrame = pd.DataFrame(data=results, columns=['pmcid', 'doi', 'pmid', 'pubYear'])

PMC4422517
PMC3965098
PMC4117966
PMC4321842
PMC5210535
PMC5635572
PMC4572507
PMC4722686
PMC4343077
PMC2668030
PMC4736244
PMC5322674
PMC4933310
PMC4016257
PMC3578923
PMC5015320
PMC4180961
PMC4850887
PMC3799545
PMC5335876
PMC4419882
PMC4892996
PMC4916229
PMC3398700
PMC2756558
PMC4748471
PMC4279366
PMC5445140
PMC5050487
PMC5241210
PMC4827100
PMC4622021
PMC5314102
PMC3579732
PMC4512639
PMC4846568
PMC3572876
PMC5210521
PMC4702921
PMC3448526
PMC4383880
PMC4083413
PMC4944959
PMC3375301
PMC4660117
PMC3245026
PMC4383985
PMC3627299
PMC3531119
PMC3495645
PMC3479160
PMC3240574
PMC4148192
PMC4520011
PMC4477069
PMC3936824
PMC5210586
PMC3338382
PMC3974665
PMC4972611
PMC5570240
PMC5042483
PMC4944143
PMC4730103
PMC5210536
PMC4448677
PMC3814316
PMC4678794
PMC5333271
PMC5011059
PMC4574036
PMC2860848
PMC5473253
PMC4832335
PMC4525249
PMC3361058
PMC5447450
PMC4501634
PMC4210545
PMC4987917
PMC4653395
PMC4213015
PMC4804633
PMC5142268
PMC4287080
PMC3796529
PMC5514523
PMC4414390
PMC4362763
PMC5006367
PMC5002202

In [73]:
newFrame.to_csv('mapped_pubs.csv', sep=',')

In [74]:
years = newFrame['pubYear'].value_counts()

In [75]:
# example plot of number of pubs per year
iplot(years.iplot(
    asFigure=True,
    kind='bar',
    xTitle='Dates',
    yTitle='Publications',
    title='Mapped Publications'
)
     )

AttributeError: 'Series' object has no attribute 'iplot'