In [5]:
import os
from nltk.tree import Tree
from xml.etree import ElementTree as ET

Source of the tag list below: https://www.ncbi.nlm.nih.gov/books/NBK3828/

In [5]:
# Tag names taken from the reference linked above; further down they are
# compared against the names collected from `plot_attributes`.
xml_tags = {
    # 'File Header' was left out on purpose (commented out in the first draft).
    'ArticleSet', 'Article',
    'Journal', 'PublisherName', 'JournalTitle', 'Issn', 'Volume', 'Issue',
    'PubDate', 'Year', 'Month', 'Season', 'Day',
    'Replaces', 'ArticleTitle', 'VernacularTitle',
    'FirstPage', 'LastPage', 'ELocationID', 'Language',
    'AuthorList', 'Author',
    'FirstName', 'MiddleName', 'LastName', 'Suffix',
    'CollectiveName', 'Affiliation', 'Identifier',
    'GroupList', 'Group', 'GroupName', 'IndividualName',
    'PublicationType', 'ArticleIdList', 'ArticleId', 'History',
    'Abstract', 'OtherAbstract', 'CopyrightInformation', 'CoiStatement',
    'ObjectList', 'Object', 'Param',
}

The tag pairs below are taken from 02 (presumably notebook 02 of this series).

In [1]:
# Set of 2-tuples of tag names (presumably (parent tag, child tag)).
# Written as one entry per first element listing all of its second elements,
# then flattened back into the same set of pairs.
plot_attributes = {
    (parent, child)
    for parent, children in {
        'Abstract': ['AbstractText', 'CopyrightInformation'],
        'AccessionNumberList': ['AccessionNumber'],
        'AffiliationInfo': ['Affiliation'],
        'Article': [
            'Abstract', 'ArticleDate', 'ArticleTitle', 'AuthorList',
            'DataBankList', 'ELocationID', 'GrantList', 'Journal',
            'Language', 'Pagination', 'PublicationTypeList', 'VernacularTitle',
        ],
        'ArticleDate': ['Day', 'Month', 'Year'],
        'ArticleIdList': ['ArticleId'],
        'Author': [
            'AffiliationInfo', 'CollectiveName', 'ForeName', 'Identifier',
            'Initials', 'LastName', 'Suffix',
        ],
        'AuthorList': ['Author'],
        'Chemical': ['NameOfSubstance', 'RegistryNumber'],
        'ChemicalList': ['Chemical'],
        'CommentsCorrections': ['Note', 'PMID', 'RefSource'],
        'CommentsCorrectionsList': ['CommentsCorrections'],
        'DataBank': ['AccessionNumberList', 'DataBankName'],
        'DataBankList': ['DataBank'],
        'DateCompleted': ['Day', 'Month', 'Year'],
        'DateCreated': ['Day', 'Month', 'Year'],
        'DateRevised': ['Day', 'Month', 'Year'],
        'GeneSymbolList': ['GeneSymbol'],
        'Grant': ['Acronym', 'Agency', 'Country', 'GrantID'],
        'GrantList': ['Grant'],
        'History': ['PubMedPubDate'],
        'Investigator': [
            'AffiliationInfo', 'ForeName', 'Initials', 'LastName', 'Suffix',
        ],
        'InvestigatorList': ['Investigator'],
        'Journal': ['ISOAbbreviation', 'ISSN', 'JournalIssue', 'Title'],
        'JournalIssue': ['Issue', 'PubDate', 'Volume'],
        'KeywordList': ['Keyword'],
        'MedlineCitation': [
            'Article', 'ChemicalList', 'CitationSubset',
            'CommentsCorrectionsList', 'DateCompleted', 'DateCreated',
            'DateRevised', 'GeneSymbolList', 'GeneralNote',
            'InvestigatorList', 'KeywordList', 'MedlineJournalInfo',
            'MeshHeadingList', 'NumberOfReferences', 'OtherAbstract',
            'OtherID', 'PMID', 'PersonalNameSubjectList',
            'SpaceFlightMission', 'SupplMeshList',
        ],
        'MedlineJournalInfo': [
            'Country', 'ISSNLinking', 'MedlineTA', 'NlmUniqueID',
        ],
        'MeshHeading': ['DescriptorName', 'QualifierName'],
        'MeshHeadingList': ['MeshHeading'],
        'OtherAbstract': ['AbstractText'],
        'Pagination': ['MedlinePgn'],
        'PersonalNameSubject': ['ForeName', 'Initials', 'LastName', 'Suffix'],
        'PersonalNameSubjectList': ['PersonalNameSubject'],
        'PubDate': ['Day', 'MedlineDate', 'Month', 'Season', 'Year'],
        'PubMedPubDate': ['Day', 'Hour', 'Minute', 'Month', 'Year'],
        'PublicationTypeList': ['PublicationType'],
        'PubmedArticle': ['MedlineCitation', 'PubmedData'],
        'PubmedArticleSet': ['PubmedArticle'],
        'PubmedData': ['ArticleIdList', 'History', 'PublicationStatus'],
        'SupplMeshList': ['SupplMeshName'],
    }.items()
    for child in children
}

In [2]:
# Collect every distinct tag name that appears in either position of a pair.
# A comprehension keeps the helper names local, so no stray loop variables
# are left behind in the notebook's global namespace.
attributes = {
    name
    for first, second in plot_attributes  # each entry is unpacked as a 2-tuple
    for name in (first, second)
}

In [6]:
# Do the names collected from plot_attributes match xml_tags exactly?
attributes == xml_tags

False

In [9]:
# Show the collected tag names.
attributes

{'Abstract',
 'AbstractText',
 'AccessionNumber',
 'AccessionNumberList',
 'Acronym',
 'Affiliation',
 'AffiliationInfo',
 'Agency',
 'Article',
 'ArticleDate',
 'ArticleId',
 'ArticleIdList',
 'ArticleTitle',
 'Author',
 'AuthorList',
 'Chemical',
 'ChemicalList',
 'CitationSubset',
 'CollectiveName',
 'CommentsCorrections',
 'CommentsCorrectionsList',
 'CopyrightInformation',
 'Country',
 'DataBank',
 'DataBankList',
 'DataBankName',
 'DateCompleted',
 'DateCreated',
 'DateRevised',
 'Day',
 'DescriptorName',
 'ELocationID',
 'ForeName',
 'GeneSymbol',
 'GeneSymbolList',
 'GeneralNote',
 'Grant',
 'GrantID',
 'GrantList',
 'History',
 'Hour',
 'ISOAbbreviation',
 'ISSN',
 'ISSNLinking',
 'Identifier',
 'Initials',
 'Investigator',
 'InvestigatorList',
 'Issue',
 'Journal',
 'JournalIssue',
 'Keyword',
 'KeywordList',
 'Language',
 'LastName',
 'MedlineCitation',
 'MedlineDate',
 'MedlineJournalInfo',
 'MedlinePgn',
 'MedlineTA',
 'MeshHeading',
 'MeshHeadingList',
 'Minute',
 'Month'

In [None]:
# Hand-picked subset of tag names; the candidates appear to mirror the
# `attributes` listing shown above (same names, same alphabetical order).
# Names left commented out are NOT part of the set. Where a reason for the
# exclusion was recorded, it is kept as a trailing note on that line
# (e.g. "Empty", "only PMID", "noise", "Not author").
selected_attributes = {
    'Abstract', # Check if they contain plain text and the section headings
                # BACKGROUND, METHODS, RESULTS, and CONCLUSIONS
                # or if they contain <AbstractText Label="HEADING_NAME">...</AbstractText>
    'AbstractText',
#     'AccessionNumber',
#     'AccessionNumberList',
    'Acronym',
    'Affiliation',
    'AffiliationInfo',
    'Agency',
    'Article',
#     'ArticleDate', # Empty
#     'ArticleId',
#     'ArticleIdList',
    'ArticleTitle',
    'Author',
    'AuthorList',
#     'Chemical',
#     'ChemicalList',
#     'CitationSubset',
    'CollectiveName',
#     'CommentsCorrections',
#     'CommentsCorrectionsList',
#     'CopyrightInformation',
    'Country',
#     'DataBank',
#     'DataBankList',
#     'DataBankName',
#     'DateCompleted',
#     'DateCreated',
#     'DateRevised',
    'Day',
    'DescriptorName',
#     'ELocationID', # only PMID
    'ForeName',
#     'GeneSymbol',
#     'GeneSymbolList',
#     'GeneralNote', # noise
    'Grant',
    'GrantID',
    'GrantList',
#     'History',
#     'Hour',
#     'ISOAbbreviation',
#     'ISSN',
#     'ISSNLinking',
    'Identifier',
    'Initials',
#     'Investigator', # Not author
#     'InvestigatorList',
    'Issue',
    'Journal',
    'JournalIssue',
    'Keyword',
    'KeywordList',
    'Language',
    'LastName',
    'MedlineCitation',
#     'MedlineDate',
#     'MedlineJournalInfo', # ?
#     'MedlinePgn',
#     'MedlineTA',
    'MeshHeading',
    'MeshHeadingList',
#     'Minute',
    'Month',
#     'NameOfSubstance',
#     'NlmUniqueID',
#     'Note',
    'NumberOfReferences',
#     'OtherAbstract',
#     'OtherID',
    'PMID',
#     'Pagination',
#     'PersonalNameSubject',
#     'PersonalNameSubjectList',
    'PubDate',
#     'PubMedPubDate',
#     'PublicationStatus',
    'PublicationType',
    'PublicationTypeList',
    'PubmedArticle',
    'PubmedArticleSet',
#     'PubmedData',
    'QualifierName',
#     'RefSource',
#     'RegistryNumber',
#     'Season',
#     'SpaceFlightMission',
    'Suffix',
#     'SupplMeshList',
#     'SupplMeshName',
    'Title',
#     'VernacularTitle',
    'Volume',
    'Year',
}