# PMID to Year Map -- Europepmc

https://europepmc.org/downloads

In [1]:
import pandas as pd
from tqdm import tqdm
import gzip
import xml.etree.ElementTree as ET

In [2]:
!wget -O ../data/PMCLiteMetadata.tgz http://europepmc.org/ftp/pmclitemetadata/PMCLiteMetadata.tgz

--2018-02-22 16:27:20--  http://europepmc.org/ftp/pmclitemetadata/PMCLiteMetadata.tgz
Resolving europepmc.org (europepmc.org)... 193.62.193.83
Connecting to europepmc.org (europepmc.org)|193.62.193.83|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 604665773 (577M) [application/x-gzip]
Saving to: ‘../data/PMCLiteMetadata.tgz’


2018-02-22 16:31:56 (2.09 MB/s) - ‘../data/PMCLiteMetadata.tgz’ saved [604665773/604665773]



In [3]:
!tar -xvzf ../data/PMCLiteMetadata.tgz -C ../data

out/
out/PMC.10.xml
out/PMC.11.xml
out/PMC.12.xml
out/PMC.13.xml
out/PMC.14.xml
out/PMC.15.xml
out/PMC.16.xml
out/PMC.17.xml
out/PMC.18.xml
out/PMC.19.xml
out/PMC.1.xml
out/PMC.20.xml
out/PMC.21.xml
out/PMC.22.xml
out/PMC.23.xml
out/PMC.24.xml
out/PMC.2.xml
out/PMC.3.xml
out/PMC.4.xml
out/PMC.5.xml
out/PMC.6.xml
out/PMC.7.xml
out/PMC.8.xml
out/PMC.9.xml
out/PMC.0.xml


In [4]:
import xml.etree.ElementTree as ET

class XML2DataFrame:

    def __init__(self, xml_file):
        self.root = ET.parse(xml_file).getroot()

    def parse_root(self, root):
        """Return a list of dictionaries from the text
         and attributes of the children under this XML root."""
        return [self.parse_element(child) for child in root.getchildren()]

    def parse_element(self, element):
        """ Collect {key:attribute} and {tag:text} from thie XML
         element and all its children into a single dictionary of strings."""
        parsed = {c.tag: c.text for c in element.getchildren()}
        return parsed
 
    def process_data(self):
        """ Initiate the root XML, parse it, and return a dataframe"""
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)


In [5]:
frames = []
for i in tqdm(range(24)):
    xml2df = XML2DataFrame('../data/out/PMC.{}.xml'.format(i))
    xml_dataframe = xml2df.process_data()
    frames.append(xml_dataframe)
result = pd.concat(frames)

100%|██████████| 24/24 [06:00<00:00,  9.31s/it]


In [6]:
result.head(2)

Unnamed: 0,AuthorList,DOI,id,issue,journalIssn,journalTitle,journalVolume,pageInfo,pmcid,pmid,pubType,pubYear,source,title
0,\n,10.1111/j.1535-7511.2007.00218.x,18265880,1,1535-7597,Epilepsy currents,8,10-2,PMC2238650,18265880,"""Comment"", ""Journal Article""",2008,MED,Psychiatric comorbidity in children with epile...
1,\n,10.1371/journal.ppat.1000050,18421380,4,1553-7366,PLoS pathogens,4,e1000050,PMC2277462,18421380,"""Journal Article"", ""Research Support, Non-U.S....",2008,MED,Transmission pathways of foot-and-mouth diseas...


In [7]:
print('{:,}'.format(len(result)))

3,952,694


In [8]:
pmid_mapper = result.set_index('pmid')['pubYear'].to_dict()

In [9]:
import pickle
prev_no_map = pickle.load(open('../data/no_map_PMC.pkl', 'rb'))
print('{:,}'.format(len(prev_no_map)))

13,809,515


In [10]:
mapped = set(pmid_mapper.keys())
new_no_map = prev_no_map - mapped
print('{:,}'.format(len(new_no_map)))

13,809,345


In [11]:
pickle.dump(pmid_mapper, open('../data/pmid_to_year_Eur.pkl', 'wb'))
pickle.dump(new_no_map, open('../data/no_map_Eur.pkl', 'wb'))