In [1]:
from rdflib import OWL, Graph
from rdflib.namespace import RDFS
from owlready2 import get_ontology, default_world
import xmltodict
import time
from rdflib import URIRef
import random
import pandas as pd
import os, sys

# one liner to import cogtext package from `../python` folder.
#if '../python' not in sys.path: sys.path.append('../python');
from python.cogtext import *




In [2]:
# Path of the ontology file loaded by owlready2 below.
OWL_FILE = 'data/ontologies/efo.owl'
# Base IRI bound to the default (`:`) prefix in the SPARQL query built by `query`.
OWL_PREFIX = 'http://xcit.org/ontologies/2021/executive-functions-ontology#'

# Selects the ontology branch to query and the output sub-folder for the CSV files.
ENTITY_TYPE = 'tests'  # 'tests' or 'constructs'

# Load the ontology into owlready2's default world, then expose that world
# as an rdflib graph so it can be queried with SPARQL.
ONTOLOGY = get_ontology(OWL_FILE).load()
GRAPH = default_world.as_rdflib_graph()

def query(graph, parent_cls='CognitiveProcess'):
    """Collect the PubMed search queries attached to an ontology class hierarchy.

    Selects `parent_cls` itself and all of its transitive subclasses
    (`rdfs:subClassOf*`) that carry both an `rdfs:label` and a `:pubmedQuery`.

    ## Parameters
    graph: graph object exposing a `.query(sparql)` method (e.g., an rdflib graph).
    parent_cls: local name of the root class, with or without a leading `:`.

    ## Returns
    A dict that maps each class label to its PubMed query string. Rows that
    share the same label collapse into a single entry (the last row wins).
    """

    cls_name = parent_cls[1:] if parent_cls.startswith(":") else parent_cls

    # `rdfs:` is declared explicitly so the query does not depend on the
    # namespace bindings that happen to be registered on `graph`.
    sparql_query = f"""
    prefix : <{OWL_PREFIX}>
    prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?label ?pubmed_query
    WHERE {{
    ?cls rdfs:subClassOf* :{cls_name};
          :pubmedQuery ?pubmed_query;
          rdfs:label ?label
    }}
    """

    # each result row is (?label, ?pubmed_query); convert both terms to python values
    return {
        label.toPython(): pubmed_query.toPython()
        for label, pubmed_query in graph.query(sparql_query)
    }


In [3]:
# tests live under `CognitiveTask`; constructs live under `CognitiveProcess`
pubmed_queries = query(
    GRAPH,
    'CognitiveTask' if ENTITY_TYPE == 'tests' else 'CognitiveProcess',
)
print(f'{len(pubmed_queries)} entities found in the EF ontology.')

134 entities found in the EF ontology.


# search and cache hits

Let's search PubMed for each task query (`ef:pubmedQuery`), then clean up the XML results into a CSV with the following columns: abstract, title, and a reference to the metadata (pmid).


In [10]:
# download the hits of every query once; an existing XML file counts as a cache hit
for entity_label, pubmed_query in pubmed_queries.items():
    # drop '/' from the label so it cannot be read as a path separator
    entity_label = entity_label.replace('/', '')
    fname = Path('data/pubmed/.cache') / f'{entity_label}.xml'

    if fname.exists():
        continue

    search_and_store(pubmed_query, fname)


def find_mesh(mesh_list):
    """Extracts MeSH names from XML MedlineCitation.MeshHeadingList.MeshHeading tags.

    ## Parameters
    mesh_list: the parsed `MeshHeading` value; either a list of heading dicts,
        a single heading dict, or anything else (e.g., NaN) when there is none.

    ## Returns
    A list with the `DescriptorName` text of every heading (all headings are
    kept; no major-topic filtering is applied). Empty list if there is none.
    """
    # xmltodict produces a bare dict instead of a one-item list when the
    # parent element has a single child, so normalize that case first.
    if isinstance(mesh_list, dict):
        mesh_list = [mesh_list]

    if not isinstance(mesh_list, list):
        return []

    return [h['DescriptorName']['#text'] for h in mesh_list]

def extract_doi(ids):
    """Helper function to extract DOI from PubMed `PubmedData.ArticleIdList.ArticleId`.

    ## Parameters
    ids: the parsed `ArticleId` value; either a list of id dicts, a single id
        dict, or anything else (e.g., NaN) when the article has no ids.

    ## Returns
    The text of the first id whose `@IdType` is `doi`, or None if there is none.
    """
    # xmltodict produces a bare dict instead of a one-item list when the
    # article has a single ArticleId, so normalize that case first.
    if isinstance(ids, dict):
        ids = [ids]

    if not isinstance(ids, list):
        return None

    for _id in ids:
        # skip entries without attributes or text instead of raising
        if isinstance(_id, dict) and _id.get('@IdType') == 'doi' and '#text' in _id:
            return _id['#text']

    return None


# now cleanup and convert all abstracts into CSV files
def _column(frame, name):
    """Return column `name` of `frame`, or an all-NaN column if it is absent."""
    if name in frame.columns:
        return frame[name]
    return pd.Series(index=frame.index, dtype='object')


for entity_label in sorted(pubmed_queries.keys()):

    entity_fname = entity_label.replace('/', '')

    xml_file = Path('data/pubmed/.cache') / (entity_fname + '.xml')
    csv_file = Path(f'data/pubmed/{ENTITY_TYPE}') / (entity_fname + '.csv')

    # only convert cached search results that have no CSV yet
    if not xml_file.exists() or csv_file.exists():
        continue

    # NOTE(review): the file is read with the platform default encoding, as before;
    # confirm that this matches how `search_and_store` writes the cache.
    with open(xml_file, 'r') as f:
        xml_content = xmltodict.parse(f.read())

    # an empty <PubmedArticleSet/> is parsed as None, and a set may hold no PubmedArticle
    article_set = xml_content.get('PubmedArticleSet')
    articles = article_set.get('PubmedArticle') if isinstance(article_set, dict) else None
    if not articles:
        print(f'[XML2CSV] skipping "{entity_label}": no PubmedArticle records found.')
        continue

    print(f'[XML2CSV] converting "{entity_label}" dataset...')

    # nested tags are flattened into dot-separated column names
    df = pd.json_normalize(articles)

    # pmid, doi, title, and abstract
    # (optional columns may be absent in small corpora, hence `_column`)
    df['pmid'] = _column(df, 'MedlineCitation.PMID.#text')
    df['doi'] = _column(df, 'PubmedData.ArticleIdList.ArticleId').apply(extract_doi)
    df['title'] = _column(df, 'MedlineCitation.Article.ArticleTitle')
    if 'MedlineCitation.Article.Abstract.AbstractText' in df.columns:
        df['abstract'] = df['MedlineCitation.Article.Abstract.AbstractText'].apply(cleanup_abstract)
    else:
        df['abstract'] = _column(df, 'MedlineCitation.Article.Abstract.AbstractText')

    # publication year
    df['year'] = _column(df, 'MedlineCitation.Article.Journal.JournalIssue.PubDate.Year')
    df['journal_title'] = _column(df, 'MedlineCitation.Article.Journal.Title')
    df['journal_iso_abbreviation'] = _column(df, 'MedlineCitation.Article.Journal.ISOAbbreviation')

    # MeSH topics are currently not exported (some datasets do not contain
    # MeshHeading, e.g., Spin The Pots); see `find_mesh` for the extraction helper.

    # fall back to the first four characters of MedlineDate when Year is missing.
    # Assign the result back: `df[col].fillna(..., inplace=True)` works on a
    # temporary object and does not update `df` under pandas Copy-on-Write.
    if 'MedlineCitation.Article.Journal.JournalIssue.PubDate.MedlineDate' in df.columns:
        medline_year = df['MedlineCitation.Article.Journal.JournalIssue.PubDate.MedlineDate'].apply(
            lambda x: x[0:4] if isinstance(x, str) and len(x) >= 4 else x)
        df['year'] = df['year'].fillna(medline_year)

    # fill missing abstracts with #text value
    if 'MedlineCitation.Article.Abstract.AbstractText.#text' in df.columns:
        df['abstract'] = df['abstract'].fillna(df['MedlineCitation.Article.Abstract.AbstractText.#text'])

    if 'MedlineCitation.Article.ArticleTitle.#text' in df.columns:
        df['title'] = df['title'].fillna(df['MedlineCitation.Article.ArticleTitle.#text'])

    # workaround to discard unusual terminators in the text
    df['abstract'] = df['abstract'].apply(lambda x: x.replace('\u2029', ' ') if isinstance(x, str) else x)
    df['title'] = df['title'].apply(lambda x: x.replace('\u2029', ' ') if isinstance(x, str) else x)

    # make sure the output folder exists before writing
    csv_file.parent.mkdir(parents=True, exist_ok=True)
    df[['pmid', 'doi', 'year', 'journal_title', 'journal_iso_abbreviation', 'title', 'abstract']].to_csv(csv_file, index=False)

print('Done!')

[PubMed] query: ("Animal Shifting"[TIAB])


KeyboardInterrupt: 

## Data cleansing

Here, we preprocess the PubMed corpora and keep only the relevant metadata. Outputs of this pipeline are stored in the `data/pubmed` folder as csv files; one csv per corpus.

The following columns will be stored in the csv files:

- pmid: unique PubMed identifier of the article.
- doi: unique DOI identifier.
- year: publication year in YYYY format.
- title: escaped title in string format.
- journal_title: Journal title.
- journal_iso_abbreviation: Journal ISO abbreviation.
- abstract: escaped and cleaned-up abstract in string format.
- mesh: A list of Medical Subject Headings which contains the field of research and other topics. We only keep major topics.