In [1]:
import pickle
import os
from datetime import date
from xml.etree import ElementTree as ET

In [2]:
# Load the pickled collection of article IDs used further down to decide
# which PubMed records are extracted.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('data/out/article_ids.pickle', 'rb') as ids_file:
    article_ids = pickle.load(ids_file)

In [3]:
# Sanity check: how many article IDs were loaded from the pickle.
len(article_ids)

3854270

In [4]:
# Load the pickled list of error records (iterated below as 3-tuples).
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('data/out/errors.pickle', 'rb') as errors_file:
    error_list = pickle.load(errors_file)

In [5]:
# Sanity check: how many error records were loaded from the pickle.
len(error_list)

45233

In [6]:
# Print a "1" for every error record whose first field equals 1.
# Each record is unpacked as a 3-tuple; the last two fields are not used here.
for code, _fname, _mesh in error_list:
    if code != 1:
        continue
    print(1)

From 03

In [7]:
# Names of the PubMed XML elements that were selected as relevant.
# Commented-out names are elements that were considered and deliberately left
# out; where a reason was recorded it is kept next to the entry.
# NOTE(review): this set is not referenced by any code visible in this part of
# the notebook -- it appears to serve as a checklist for the extractors below.
selected_attributes = {
    'Abstract', # Check if they contain plain text and the section headings
                # BACKGROUND, METHODS, RESULTS, and CONCLUSIONS
                # or if they contain <AbstractText Label="HEADING_NAME">...</AbstractText>
    'AbstractText',
#     'AccessionNumber',
#     'AccessionNumberList',
    'Acronym',
    'Affiliation',
    'AffiliationInfo',
    'Agency',
    'Article',
#     'ArticleDate', # Empty
#     'ArticleId',
#     'ArticleIdList',
    'ArticleTitle',
    'Author',
    'AuthorList',
#     'Chemical',
#     'ChemicalList',
#     'CitationSubset',
    'CollectiveName',
#     'CommentsCorrections',
#     'CommentsCorrectionsList',
#     'CopyrightInformation',
    'Country',
#     'DataBank',
#     'DataBankList',
#     'DataBankName',
#     'DateCompleted',
#     'DateCreated',
#     'DateRevised',
    'Day',
    'DescriptorName',
#     'ELocationID', # only PMID
    'ForeName',
#     'GeneSymbol',
#     'GeneSymbolList',
#     'GeneralNote', # noise
    'Grant',
    'GrantID',
    'GrantList',
#     'History',
#     'Hour',
#     'ISOAbbreviation',
#     'ISSN',
#     'ISSNLinking',
    'Identifier',
    'Initials',
#     'Investigator', # Not author
#     'InvestigatorList',
    'Issue',
    'Journal',
    'JournalIssue',
    'Keyword',
    'KeywordList',
    'Language',
    'LastName',
    'MedlineCitation',
#     'MedlineDate',
#     'MedlineJournalInfo', # ?
#     'MedlinePgn',
#     'MedlineTA',
    'MeshHeading',
    'MeshHeadingList',
#     'Minute',
    'Month',
#     'NameOfSubstance',
#     'NlmUniqueID',
#     'Note',
    'NumberOfReferences',
#     'OtherAbstract',
#     'OtherID',
    'PMID',
#     'Pagination',
#     'PersonalNameSubject',
#     'PersonalNameSubjectList',
    'PubDate',
#     'PubMedPubDate',
#     'PublicationStatus',
    'PublicationType',
    'PublicationTypeList',
    'PubmedArticle',
    'PubmedArticleSet',
#     'PubmedData',
    'QualifierName',
#     'RefSource',
#     'RegistryNumber',
#     'Season',
#     'SpaceFlightMission',
    'Suffix',
#     'SupplMeshList',
#     'SupplMeshName',
    'Title',
#     'VernacularTitle',
    'Volume',
    'Year',
}

In [8]:
def get_abstract(a):
    """Return the article's abstract as a single newline-separated string.

    The result is the text directly inside ``<Abstract>`` (if any) followed by
    the text of every non-empty ``<AbstractText>`` child, one per line, with
    surrounding whitespace stripped.

    Parameters
    ----------
    a : xml.etree.ElementTree.Element
        A ``<PubmedArticle>`` element.

    Returns
    -------
    str or None
        The joined abstract text, or ``None`` when the article has no
        ``MedlineCitation/Article/Abstract`` element.
    """
    abstract = a.find('MedlineCitation/Article/Abstract')
    if abstract is None:
        return None
    # ElementTree reports ``text`` as None when the element's first child
    # follows the opening tag directly (no whitespace in between), so it must
    # not be concatenated blindly.
    parts = [abstract.text or '']
    parts.extend(
        section.text
        for section in abstract.findall('AbstractText')
        if section.text
    )
    return '\n'.join(parts).strip()

def get_title(a):
    """Return the text of the article's ``<ArticleTitle>``, or ``None`` if the element is absent."""
    title_node = a.find('MedlineCitation/Article/ArticleTitle')
    return None if title_node is None else title_node.text

def get_authors(a):
    """Collect one dict per ``<Author>`` in the article's ``<AuthorList>``.

    Each dict only contains the keys whose XML element is present for that
    author: ``affiliation``, ``collective_name``, ``first_name``,
    ``identifier``, ``initials``, ``last_name`` and ``suffix``.

    Returns ``None`` when the article has no ``<AuthorList>``; otherwise a
    list (possibly empty) of author dicts in document order.
    """
    author_list = a.find('MedlineCitation/Article/AuthorList')
    if author_list is None:
        return None

    # (output key, path relative to <Author>), in the order keys are inserted.
    field_paths = (
        ('affiliation', 'AffiliationInfo/Affiliation'),
        ('collective_name', 'CollectiveName'),
        ('first_name', 'ForeName'),
        ('identifier', 'Identifier'),
        ('initials', 'Initials'),
        ('last_name', 'LastName'),
        ('suffix', 'Suffix'),
    )

    records = []
    for author in author_list.findall('Author'):
        record = {}
        for key, path in field_paths:
            node = author.find(path)
            if node is not None:
                record[key] = node.text
        records.append(record)
    return records

def get_grants(a):
    """Collect one dict per ``<Grant>`` in the article's ``<GrantList>``.

    Each dict only contains the keys whose XML element is present for that
    grant: ``acronym``, ``agency``, ``country`` and ``grant_id``.

    Parameters
    ----------
    a : xml.etree.ElementTree.Element
        A ``<PubmedArticle>`` element.

    Returns
    -------
    list of dict or None
        ``None`` when the article has no ``<GrantList>``; otherwise a list
        (possibly empty) of grant dicts in document order.
    """
    grants = a.find('MedlineCitation/Article/GrantList')
    if grants is None:
        return None

    # (output key, child tag). ElementTree tag matching is case-sensitive and
    # the grant identifier element is spelled ``GrantID`` (capital "ID").
    field_tags = (
        ('acronym', 'Acronym'),
        ('agency', 'Agency'),
        ('country', 'Country'),
        ('grant_id', 'GrantID'),
    )

    r = []
    for grant in grants.findall('Grant'):
        gr = {}
        for key, tag in field_tags:
            node = grant.find(tag)
            if node is not None:
                gr[key] = node.text
        r.append(gr)
    return r

def get_pub_date(dat):
    """Build a ``datetime.date`` from a ``<PubDate>``-like element.

    Reads the ``Year``, ``Month`` and ``Day`` children of ``dat``. ``Month``
    may be numeric or a month name/abbreviation (only the first three letters
    are looked at, case-insensitively).

    Fallbacks:
    * no ``Year`` child, or a ``Year`` that is not an integer -> ``None``;
    * missing, empty or unrecognised ``Month`` -> January;
    * missing or non-integer ``Day`` -> the 1st;
    * a combination that is not a real calendar date (e.g. 30 Feb) -> the
      error message is printed and ``None`` is returned.

    Parameters
    ----------
    dat : xml.etree.ElementTree.Element
        Element whose children hold the date parts.

    Returns
    -------
    datetime.date or None
    """
    month_numbers = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr': 4,
        'may': 5,
        'jun': 6,
        'jul': 7,
        'aug': 8,
        'sep': 9,
        'oct': 10,
        'nov': 11,
        'dec': 12,
    }

    year = dat.find('Year')
    if year is None:
        return None
    try:
        y = int(year.text)
    except (TypeError, ValueError):
        # Empty or non-numeric year: there is nothing sensible to build.
        return None

    m = 1
    month = dat.find('Month')
    if month is not None and month.text:
        month_text = month.text.strip()
        try:
            m = int(month_text)
        except ValueError:
            # Not a number: try a month name, else fall back to January
            # instead of raising KeyError on values such as "Spring".
            m = month_numbers.get(month_text.lower()[:3], 1)

    d = 1
    day = dat.find('Day')
    if day is not None:
        try:
            d = int(day.text)
        except (TypeError, ValueError):
            d = 1

    try:
        return date(y, m, d)
    except (ValueError, OverflowError) as e:
        # Out-of-range parts; report and signal "no usable date".
        print(e)
        return None

def get_pubmed_date(a):
    """Return the article's ``MedlineCitation/DateCreated`` as a ``datetime.date``.

    Returns ``None`` when the element is absent. The ``Year``, ``Month`` and
    ``Day`` children are all expected to be present and numeric; anything else
    raises.
    """
    created = a.find('MedlineCitation/DateCreated')
    if created is None:
        return None
    # Look up all three parts first, then convert them in year/month/day order.
    parts = [created.find(tag) for tag in ('Year', 'Month', 'Day')]
    y, m, d = (int(part.text) for part in parts)
    return date(y, m, d)

def get_journal_info(a):
    """Extract the publication date and journal metadata of an article.

    Returns a pair ``(pub_date, info)``:

    * ``pub_date`` is whatever ``get_pub_date`` returns for
      ``Journal/JournalIssue/PubDate``, or ``None`` if that element is absent;
    * ``info`` is a dict holding any of ``title``, ``iso_abbr``, ``issn``,
      ``volume`` and ``issue`` whose element is present.

    Both values are ``None`` when the article has no ``<Journal>`` element.
    """
    journal = a.find('MedlineCitation/Article/Journal')
    if journal is None:
        return None, None

    info = {}
    for key, tag in (('title', 'Title'), ('iso_abbr', 'ISOAbbreviation'), ('issn', 'ISSN')):
        node = journal.find(tag)
        if node is not None:
            info[key] = node.text

    pub_date = None
    journal_issue = journal.find('JournalIssue')
    if journal_issue is not None:
        for key, tag in (('volume', 'Volume'), ('issue', 'Issue')):
            node = journal_issue.find(tag)
            if node is not None:
                info[key] = node.text
        pub_date_node = journal_issue.find('PubDate')
        if pub_date_node is not None:
            pub_date = get_pub_date(pub_date_node)

    return pub_date, info

def get_language(a):
    """Return the text of the article's first ``<Language>`` element, or ``None`` if there is none."""
    language_node = a.find('MedlineCitation/Article/Language')
    if language_node is None:
        return None
    return language_node.text

def get_publication_type(a):
    """Return the texts of all ``<PublicationType>`` entries as a list.

    Returns ``None`` when the article has no ``<PublicationTypeList>``.
    """
    type_list = a.find('MedlineCitation/Article/PublicationTypeList')
    if type_list is None:
        return None
    return [entry.text for entry in type_list.findall('PublicationType')]

def get_keywords(a):
    """Return the texts of all ``<Keyword>`` entries of the first ``<KeywordList>``.

    Returns ``None`` when the article has no ``<KeywordList>``.
    """
    keyword_list = a.find('MedlineCitation/KeywordList')
    if keyword_list is None:
        return None
    return [entry.text for entry in keyword_list.findall('Keyword')]

def get_mesh_headings(a):
    """Return the ``<DescriptorName>`` text of every ``<MeshHeading>`` as a list.

    Returns ``None`` when the article has no ``<MeshHeadingList>``.
    """
    heading_list = a.find('MedlineCitation/MeshHeadingList')
    if heading_list is None:
        return None
    descriptors = heading_list.findall('MeshHeading/DescriptorName')
    return [descriptor.text for descriptor in descriptors]

def get_num_references(a):
    """Return ``<NumberOfReferences>`` as an int, or ``None`` if the element is absent."""
    count_node = a.find('MedlineCitation/NumberOfReferences')
    return None if count_node is None else int(count_node.text)

In [None]:
%%time

# Walk every file in data/src/ (in sorted name order), and for each
# <PubmedArticle> whose PMID is in ``article_ids`` collect the extracted
# fields into one dict appended to ``articles``.
articles = []

# ``article_ids`` is sliced elsewhere in this notebook, so it is presumably a
# list; ``in`` on a list of millions of IDs is a linear scan per article.
# Build a set once so each membership test is constant time.
article_id_lookup = set(article_ids)

for filename in sorted(os.listdir('data/src/')):
    print(filename)
    filepath = os.path.join('data/src/', filename)
    root = ET.parse(filepath).getroot()
    for a in root.findall('PubmedArticle'):
        pmid = a.find('MedlineCitation/PMID').text
        if pmid not in article_id_lookup:
            continue
        pub_date, journal_info = get_journal_info(a)
        articles.append({
            'id_': pmid,
            'title': get_title(a),
            'authors': get_authors(a),
            'abstract': get_abstract(a),
            'pubmed_date': get_pubmed_date(a),
            'pub_date': pub_date,
            'language': get_language(a),
            'journal_info': journal_info,
            'pub_type': get_publication_type(a),
            'keywords': get_keywords(a),
            'mesh_headings': get_mesh_headings(a),
            'n_refs': get_num_references(a),
            'grants': get_grants(a),
        })
    # Drop the parsed tree's children before moving on to the next file.
    root.clear()

medline17n0001.xml
medline17n0002.xml
medline17n0003.xml
medline17n0004.xml
medline17n0005.xml
medline17n0006.xml
medline17n0007.xml
medline17n0008.xml
medline17n0009.xml
medline17n0010.xml
medline17n0011.xml
medline17n0012.xml
medline17n0013.xml
medline17n0014.xml
medline17n0015.xml
medline17n0016.xml
medline17n0017.xml
medline17n0018.xml
medline17n0019.xml
medline17n0020.xml
medline17n0021.xml
medline17n0022.xml
medline17n0023.xml
medline17n0024.xml
medline17n0025.xml
medline17n0026.xml
medline17n0027.xml
medline17n0028.xml
medline17n0029.xml
medline17n0030.xml
medline17n0031.xml
medline17n0032.xml
medline17n0033.xml
medline17n0034.xml
medline17n0035.xml
medline17n0036.xml
medline17n0037.xml
medline17n0038.xml
medline17n0039.xml
medline17n0040.xml
medline17n0041.xml
medline17n0042.xml
medline17n0043.xml
medline17n0044.xml
medline17n0045.xml
medline17n0046.xml
medline17n0047.xml
medline17n0048.xml
medline17n0049.xml
medline17n0050.xml
medline17n0051.xml
medline17n0052.xml
medline17n00

In [None]:
# Sanity check: how many articles were extracted by the loop above.
len(articles)

In [None]:
# Peek at the first ten loaded article IDs.
article_ids[:10]

In [None]:
# Persist the extracted article records for later use.
with open('data/out/articles_data.pickle', 'wb') as out_file:
    pickle.dump(articles, out_file)