In [1]:
import pandas as pd
from tqdm import tqdm
import gzip
import xml.etree.ElementTree as ET

Data downloaded from the NCBI PubMed baseline FTP directory (the `pubmed18n*.xml.gz` files read below):  
ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/

In [2]:
def get_child_tag(child, tag):
    """Return the first direct sub-element of ``child`` whose tag is ``tag``.

    Parameters
    ----------
    child : xml.etree.ElementTree.Element or None
        Parent element whose immediate children are searched.
    tag : str
        Tag name to match exactly.

    Returns
    -------
    xml.etree.ElementTree.Element or None
        The first matching child, or ``None`` when ``child`` is ``None`` or
        has no direct child with that tag.
    """
    if child is None:
        return None
    # Iterate the element itself: Element.getchildren() was removed in
    # Python 3.9, while direct iteration yields the same direct children.
    for c in child:
        if c.tag == tag:
            return c
    return None

In [3]:
def get_year_from_article(article):
    """Return the text of ``Journal/JournalIssue/PubDate/Year`` under ``article``.

    Parameters
    ----------
    article : xml.etree.ElementTree.Element or None
        Element expected to contain a ``Journal`` child.

    Returns
    -------
    str or None
        Text of the ``Year`` element, or ``None`` when ``article`` is ``None``
        or any element along the path is missing.
    """
    node = article
    # Walk the path one level at a time and stop as soon as a level is
    # missing, so an incomplete record yields None instead of raising.
    for tag in ('Journal', 'JournalIssue', 'PubDate', 'Year'):
        if node is None:
            return None
        node = get_child_tag(node, tag)

    if node is not None:
        return node.text
    return None

In [4]:
def get_year_from_pubmed(pubmed_data):
    """Return the year of the ``PubStatus="pubmed"`` entry in the history.

    Looks under ``pubmed_data`` for a ``History`` child and, inside it, for
    ``PubMedPubDate`` elements whose ``PubStatus`` attribute is ``'pubmed'``.
    If several match, the last one is used.

    Parameters
    ----------
    pubmed_data : xml.etree.ElementTree.Element
        Element expected to contain a ``History`` child.

    Returns
    -------
    str or None
        Text of the ``Year`` child of the matching entry, or ``None`` when
        that entry has no ``Year`` child.

    Raises
    ------
    LookupError
        If there is no ``History`` element or no ``PubMedPubDate`` entry with
        ``PubStatus="pubmed"``. Callers use the exception as the signal to
        fall back to another date source.
    """
    history = get_child_tag(pubmed_data, 'History')
    if history is None:
        raise LookupError('no History element in PubmedData')

    year = None
    found = False
    # Iterate the element directly (Element.getchildren() was removed in
    # Python 3.9) and look the attribute up by name rather than by position.
    for child in history:
        if child.tag == 'PubMedPubDate' and child.get('PubStatus') == 'pubmed':
            year = get_child_tag(child, 'Year')
            found = True

    if not found:
        # Raise explicitly instead of relying on an unbound local variable.
        raise LookupError('no PubMedPubDate with PubStatus="pubmed"')

    if year is not None:
        return year.text
    return None

In [5]:
def get_pmid_year(pubmed_article):
    """Extract the PMID and a publication year from one citation element.

    The year is taken from ``get_year_from_pubmed`` when possible; if that
    raises or yields ``None``, ``get_year_from_article`` is used instead.

    Parameters
    ----------
    pubmed_article : xml.etree.ElementTree.Element
        Element expected to contain ``MedlineCitation`` and ``PubmedData``
        children.

    Returns
    -------
    tuple
        ``(pmid, year)`` where each item is the element text (``str``) or
        ``None`` when it could not be found.
    """
    medline_cit = get_child_tag(pubmed_article, 'MedlineCitation')
    pubmed_data = get_child_tag(pubmed_article, 'PubmedData')

    pmid = get_child_tag(medline_cit, 'PMID')

    try:
        year = get_year_from_pubmed(pubmed_data)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a long-running loop.
        year = None

    if year is None:
        # Fall back to the journal issue date, both when the history lookup
        # failed and when it found an entry without a Year.
        article = get_child_tag(medline_cit, 'Article')
        try:
            year = get_year_from_article(article)
        except (AttributeError, TypeError):
            # Missing intermediate elements: treat the year as unknown.
            year = None

    if pmid is not None:
        pmid = pmid.text

    return pmid, year
    

In [6]:
# Number of baseline files to read and the path template for each one.
N_BASELINE_FILES = 928
BASELINE_PATH = '../data/baseline/pubmed18n{:04}.xml.gz'

# Maps PMID text -> year text (or None when no year could be extracted).
id_to_year = {}

for i in tqdm(range(N_BASELINE_FILES), desc='xmlread'):
    # Close each gzip handle as soon as the file has been parsed instead of
    # leaving all of them open until garbage collection.
    with gzip.open(BASELINE_PATH.format(i + 1)) as xml_file:
        tree = ET.parse(xml_file)
    root = tree.getroot()

    # Iterate the root directly: Element.getchildren() was removed in
    # Python 3.9.
    for cit in root:
        pmid, year = get_pmid_year(cit)
        id_to_year[pmid] = year

print('{:,}'.format(len(id_to_year)))
# Keep only the entries for which a year was found.
id_to_year_filt = {k: v for k, v in id_to_year.items() if v is not None}
print('{:,}'.format(len(id_to_year_filt)))

xmlread: 100%|██████████| 928/928 [4:52:30<00:00, 20.01s/it]  


27,836,723
27,836,723


In [8]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by a trusted step of this project.
# The context manager closes the file handle once loading is done.
with open('../data/no_map_Eur.pkl', 'rb') as pkl_in:
    prev_no_map = pickle.load(pkl_in)

In [9]:
# Entries of prev_no_map that are not keys of id_to_year.
# set.difference accepts any iterable, so the dict is passed directly rather
# than first copying all of its keys into a temporary set.
still_no_map = set(prev_no_map).difference(id_to_year)
print('{:,}'.format(len(still_no_map)))

14,048


In [10]:
# Write both results with context managers so each file is flushed and
# closed as soon as the dump finishes.
with open('../data/pmid_to_year_NLM.pkl', 'wb') as pkl_out:
    pickle.dump(id_to_year, pkl_out)

with open('../data/no_map_NLM.pkl', 'wb') as pkl_out:
    pickle.dump(still_no_map, pkl_out)