In [None]:
import glob, os
import xml.etree.ElementTree as ET

In [None]:
def process_data(path=None):
    """
    Parse PubMed XML exports into a dictionary keyed by PMID.

    Every ``*.xml`` file directly inside ``path`` is parsed, and each child
    of the file's root element is treated as one article. An article is kept
    only when it has a PMID, a title, an abstract, at least one chemical and
    at least one MeSH heading; otherwise it is skipped.

    Parameters
    ----------
    path : string, optional
        Path to folder with xml files from pubmed. When omitted (or empty),
        the current working directory is searched. The working directory of
        the process is never changed by this function.

    Returns
    -------
    articles : dict
        Dictionary, in which every element looks like
            PMID: [title, abstract, [dict(chemical_id: chemical_name)], [[mesh_headings]]]
        The chemical dictionary and the flat list of MeSH heading texts are
        each wrapped in an extra one-element list. ``abstract`` is the text
        of the first child of ``<Abstract>`` only. If the same PMID occurs
        more than once, the record processed last wins.

    Raises
    ------
    FileNotFoundError
        If ``path`` is given but does not exist.
    NotADirectoryError
        If ``path`` is given but is not a directory.
    """
    # The trailing "/*" selects the direct children of the named element.
    pmid_path = 'MedlineCitation/PMID'
    title_path = 'MedlineCitation/Article/ArticleTitle'
    abstract_path = 'MedlineCitation/Article/Abstract/*'
    chemicals_path = 'MedlineCitation/ChemicalList/*'
    mesh_headings_path = 'MedlineCitation/MeshHeadingList/*'

    if path:
        # Fail loudly on a bad folder instead of silently returning {}.
        if not os.path.exists(path):
            raise FileNotFoundError("No such directory: %r" % (path,))
        if not os.path.isdir(path):
            raise NotADirectoryError("Not a directory: %r" % (path,))
        # Build the pattern from ``path`` instead of calling os.chdir(), so
        # loading data does not alter the working directory for later code.
        # glob.escape() keeps characters such as "[" in the folder name from
        # being interpreted as wildcards.
        pattern = os.path.join(glob.escape(path), "*.xml")
    else:
        pattern = "*.xml"

    articles = {}
    for file in glob.glob(pattern):
        article_set = ET.parse(file).getroot()
        for article in article_set:
            article_id = article.find(pmid_path)
            title = article.find(title_path)
            abstract = article.find(abstract_path)
            chemicals = article.findall(chemicals_path)
            mesh_headings = article.findall(mesh_headings_path)
            # skip article if there's any feature missing
            if (article_id is None or title is None or abstract is None
                    or not chemicals or not mesh_headings):
                continue
            articles[article_id.text] = [
                title.text,
                abstract.text,
                [convert_chemicals_to_list(chemicals)],
                [convert_mesh_headings_to_list(mesh_headings)],
            ]
    return articles

In [None]:
def convert_chemicals_to_list(elements):
    """
    Map each chemical's identifier text to its name text.

    Despite the function name, the result is a dictionary, not a list.

    Parameters
    ----------
    elements : iterable of xml.etree.ElementTree.Element
        Elements that each have exactly two children. The text of the first
        child becomes the key and the text of the second child the value
        (in the PubMed sample record these are ``<RegistryNumber>`` and
        ``<NameOfSubstance>``).

    Returns
    -------
    results : dict
        Dictionary of first-child text -> second-child text. When several
        elements share the same key, the last one wins.

    Raises
    ------
    ValueError
        If an element does not have exactly two children.
    """
    # NOTE(review): PubMed appears to use the registry number "0" for many
    # substances, which would make such entries overwrite each other here;
    # confirm whether that is acceptable.
    results = {}
    for element in elements:
        # Element.getchildren() was removed in Python 3.9; iterating an
        # element yields its direct children.
        element_id, name = list(element)
        results[element_id.text] = name.text
    return results

def convert_mesh_headings_to_list(elements):
    """
    Flatten the child texts of every given element into one list.

    Parameters
    ----------
    elements : iterable of xml.etree.ElementTree.Element
        Elements whose direct children carry the texts to collect (in the
        PubMed sample record these are ``<MeshHeading>`` elements holding
        ``<DescriptorName>`` and optional ``<QualifierName>`` children).

    Returns
    -------
    results : list
        Texts of all direct children, in document order, with no grouping
        by parent element. A child without text contributes ``None``.
    """
    results = []
    for element in elements:
        # Element.getchildren() was removed in Python 3.9; iterating an
        # element yields its direct children.
        results.extend(child.text for child in element)
    return results

In [None]:
# No path is passed, so the *.xml files in the current working directory are
# parsed; the resulting PMID-keyed dictionary is bound to `data`.
data = process_data()

    Example: one PubmedArticle record, as found in the input XML files

  <PubmedArticle>
    <MedlineCitation Status="MEDLINE" Owner="NLM">
      <PMID Version="1">304747</PMID>
      <DateCreated>
        <Year>1978</Year>
        <Month>04</Month>
        <Day>17</Day>
      </DateCreated>
      <DateCompleted>
        <Year>1978</Year>
        <Month>04</Month>
        <Day>17</Day>
      </DateCompleted>
      <DateRevised>
        <Year>2004</Year>
        <Month>11</Month>
        <Day>17</Day>
      </DateRevised>
      <Article PubModel="Print">
        <Journal>
          <ISSN IssnType="Print">0006-4971</ISSN>
          <JournalIssue CitedMedium="Print">
            <Volume>51</Volume>
            <Issue>3</Issue>
            <PubDate>
              <Year>1978</Year>
              <Month>Mar</Month>
            </PubDate>
          </JournalIssue>
          <Title>Blood</Title>
          <ISOAbbreviation>Blood</ISOAbbreviation>
        </Journal>
        <ArticleTitle>Lymphocyte receptors for concanavalin A in Hodgkin disease.</ArticleTitle>
        <Pagination>
          <MedlinePgn>439-43</MedlinePgn>
        </Pagination>
        <Abstract>
          <AbstractText>The number of lymphocytes with mobile receptors for concanavalin A (Con A) on their surface membrane (forming visible caps after the addition of fluorescein-conjugated Con A) was determined in the peripheral blood of 53 patients with Hodgkin disease. Of 29 individuals studied prior to treatment, the level of capped cells was found to be below the normal range in 9 of 13 in stages I and IIA, 6 of 8 in stage IIIA, and all 8 in stages IIIB and IV. Even among patients in remission 2 yr after successful treatment the level was below the lower normal limit in 9 of 16. The number was also reduced in 7 of 8 individuals with recurrent lymphoma. The level of lymphocytes that cap with Con A may prove to be a more sensitive measure of active Hodgkin disease than the total peripheral lymphocyte count or the level of T cells. This lymphocyte parameter merits further study as a correlate in vitro of cellular immunity.</AbstractText>
        </Abstract>
        <AuthorList CompleteYN="Y">
          <Author ValidYN="Y">
            <LastName>Aisenberg</LastName>
            <ForeName>A C</ForeName>
            <Initials>AC</Initials>
          </Author>
          <Author ValidYN="Y">
            <LastName>Weitzman</LastName>
            <ForeName>S</ForeName>
            <Initials>S</Initials>
          </Author>
          <Author ValidYN="Y">
            <LastName>Wilkes</LastName>
            <ForeName>B</ForeName>
            <Initials>B</Initials>
          </Author>
        </AuthorList>
        <Language>eng</Language>
        <PublicationTypeList>
          <PublicationType UI="D016428">Journal Article</PublicationType>
        </PublicationTypeList>
      </Article>
      <MedlineJournalInfo>
        <Country>United States</Country>
        <MedlineTA>Blood</MedlineTA>
        <NlmUniqueID>7603509</NlmUniqueID>
        <ISSNLinking>0006-4971</ISSNLinking>
      </MedlineJournalInfo>
      <ChemicalList>
        <Chemical>
          <RegistryNumber>11028-71-0</RegistryNumber>
          <NameOfSubstance UI="D003208">Concanavalin A</NameOfSubstance>
        </Chemical>
      </ChemicalList>
      <CitationSubset>AIM</CitationSubset>
      <CitationSubset>IM</CitationSubset>
      <MeshHeadingList>
        <MeshHeading>
          <DescriptorName UI="D001665" MajorTopicYN="N">Binding Sites</DescriptorName>
        </MeshHeading>
        <MeshHeading>
          <DescriptorName UI="D003208" MajorTopicYN="N">Concanavalin A</DescriptorName>
          <QualifierName UI="Q000494" MajorTopicYN="Y">pharmacology</QualifierName>
        </MeshHeading>
        <MeshHeading>
          <DescriptorName UI="D006689" MajorTopicYN="N">Hodgkin Disease</DescriptorName>
          <QualifierName UI="Q000276" MajorTopicYN="Y">immunology</QualifierName>
        </MeshHeading>
        <MeshHeading>
          <DescriptorName UI="D006801" MajorTopicYN="N">Humans</DescriptorName>
        </MeshHeading>
        <MeshHeading>
          <DescriptorName UI="D007958" MajorTopicYN="N">Leukocyte Count</DescriptorName>
        </MeshHeading>
        <MeshHeading>
          <DescriptorName UI="D008214" MajorTopicYN="N">Lymphocytes</DescriptorName>
          <QualifierName UI="Q000276" MajorTopicYN="Y">immunology</QualifierName>
        </MeshHeading>
        <MeshHeading>
          <DescriptorName UI="D013601" MajorTopicYN="N">T-Lymphocytes</DescriptorName>
        </MeshHeading>
      </MeshHeadingList>
    </MedlineCitation>
    <PubmedData>
      <History>
        <PubMedPubDate PubStatus="pubmed">
          <Year>1978</Year>
          <Month>3</Month>
          <Day>1</Day>
        </PubMedPubDate>
        <PubMedPubDate PubStatus="medline">
          <Year>1978</Year>
          <Month>3</Month>
          <Day>1</Day>
          <Hour>0</Hour>
          <Minute>1</Minute>
        </PubMedPubDate>
        <PubMedPubDate PubStatus="entrez">
          <Year>1978</Year>
          <Month>3</Month>
          <Day>1</Day>
          <Hour>0</Hour>
          <Minute>0</Minute>
        </PubMedPubDate>
      </History>
      <PublicationStatus>ppublish</PublicationStatus>
      <ArticleIdList>
        <ArticleId IdType="pubmed">304747</ArticleId>
      </ArticleIdList>
    </PubmedData>
  </PubmedArticle>