In [2]:
from rdflib import OWL, Graph
from rdflib.namespace import RDFS
from owlready2 import get_ontology, default_world
import xmltodict
import time
from rdflib import URIRef
import random
import pandas as pd
import os, sys

# Make the project-local cogtext package (in the `../python` folder) importable, then import it.
# The import lives outside the `if` on purpose: written as a one-liner
# (`if ...: sys.path.append(...); from cogtext import *`) it belongs to the `if` body
# and is skipped whenever '../python' is already on sys.path.
if '../python' not in sys.path:
    sys.path.append('../python')

from cogtext import *




In [3]:
# Path of the ontology (OWL) file, relative to this notebook.
owl_file = "../data/ontologies/efo.owl"
# IRI that `query()` binds to the default `:` prefix of its SPARQL query.
owl_prefix = "http://www.semanticweb.org/morteza/ontologies/2019/11/executive-functions-ontology#"

# Load the ontology with owlready2, then get owlready2's default world as an rdflib
# graph; `GRAPH` is the object whose `.query()` method is used to run SPARQL.
ONTOLOGY = get_ontology(owl_file).load()
GRAPH = default_world.as_rdflib_graph()

def query(graph, parent_cls='Task', prefix=None):
    """Function to query the PubMed search queries of tasks, constructs, regions, etc.

    Selects `parent_cls` and all of its (transitive) subclasses that have both
    an `rdfs:label` and a `:pubmedQuery` value.

    ## Parameters
    graph:
        object with a `query(sparql_string)` method that yields result rows,
        e.g., the rdflib graph stored in `GRAPH`.
    parent_cls:
        local name of the parent class, with or without a leading colon
        (`'Task'` and `':Task'` are equivalent).
    prefix:
        IRI bound to the default `:` prefix of the SPARQL query. When omitted,
        the notebook-level `owl_prefix` is used.

    ## Returns
    A dictionary that maps each label to its PubMed query (both converted to
    python values). If several rows share a label, the last row wins.
    """

    if prefix is None:
        prefix = owl_prefix

    cls_name = parent_cls[1:] if parent_cls.startswith(":") else parent_cls

    # NOTE(review): the `rdfs:` prefix is not declared in the query itself; presumably
    # it is resolved from the namespace bindings of the graph -- confirm when changing graphs.
    sparql = f"""
    prefix : <{prefix}>

    SELECT ?label ?pubmed_query
    WHERE {{
    ?task rdfs:subClassOf* :{cls_name};
          :pubmedQuery ?pubmed_query;
          rdfs:label ?label
    }}
    """

    # each row is a (label, pubmed_query) pair of RDF terms; convert both to python values
    return {row[0].toPython(): row[1].toPython() for row in graph.query(sparql)}


In [4]:
# Collect the PubMed query of `BaggettaTask` and every subclass of it that has
# both a label and a pubmedQuery value in the ontology.
pubmed_queries = query(GRAPH, 'BaggettaTask')   # dictionary of task_name -> pubmed_query

print(f'{len(pubmed_queries)} tasks found in the EF ontology (baggetta2016)')

106 tasks found in the EF ontology (baggetta2016)


In [5]:
# from pybliometrics.scopus import ScopusSearch
# s = ScopusSearch('Digit Span')
# print(s)
# fetch = PubMedFetcher(cachedir='../data/cache/pubmed/')

# search and cache hits

Let's search PubMed for each task query (`ef:pubmedQuery`), then clean up the XML results and convert them into CSV files with the following columns: pmid (a reference to the PubMed metadata), year, title, abstract, and mesh.


In [42]:
# Search PubMed for every task query and keep the result as one XML file per task.
# Tasks that already have a cached XML file are skipped.
for task_label, pubmed_query in pubmed_queries.items():
    # drop '/' so the label can be used as a file name (e.g., "Bear/Aligator Task")
    task_label = task_label.replace('/', '')
    fname = Path('../data/cache', f'{task_label}.xml')

    if fname.exists():
        continue

    search_and_store(pubmed_query, fname)


def find_mesh(mesh_list):
    """Extracts MeSH names from the XML MedlineCitation.MeshHeadingList.MeshHeading tags of an article.

    ## Parameters
    mesh_list:
        parsed `MeshHeading` value of a single article: a list of headings, a
        single heading (dict), or anything else (e.g., NaN/None) when the
        article has no MeSH headings.

    ## Returns
    A list of descriptor names, one per heading, in their original order.
    Major and non-major topics are both included. Returns an empty list when
    there is no heading.
    """
    # xmltodict represents a single child tag as a dict instead of a one-item list;
    # wrap it so an article with exactly one MeshHeading keeps that heading.
    if isinstance(mesh_list, dict):
        mesh_list = [mesh_list]

    if not isinstance(mesh_list, list):
        return []

    mesh_names = []
    for heading in mesh_list:
        descriptor = heading['DescriptorName']
        # a tag with attributes is parsed into a dict with its text under '#text';
        # a tag without attributes is parsed into a plain string
        name = descriptor['#text'] if isinstance(descriptor, dict) else descriptor
        mesh_names.append(name)
    return mesh_names


# now cleanup and convert all abstracts into CSV files (one CSV per task)
for task_label in pubmed_queries.keys():
    print(f'[XML2CSV] converting "{task_label}" dataset...')

    # drop '/' so the label can be used as a file name (e.g., "Bear/Aligator Task")
    task_label = task_label.replace('/','')

    xml_file = Path('../data/cache') / (task_label + '.xml')
    csv_file = Path('../data/pubmed') / (task_label + '.csv')

    # NOTE(review): the file is read with the platform default encoding; confirm this
    # matches how the XML was written before pinning an explicit encoding.
    with open(xml_file, 'r') as f:
        xml_content = xmltodict.parse(f.read())

    # skip result sets without any article (missing or empty PubmedArticleSet)
    article_set = xml_content.get('PubmedArticleSet')
    articles = article_set.get('PubmedArticle') if isinstance(article_set, dict) else None
    if not articles:
        continue

    # one row per article; nested XML tags become dot-separated column names
    df = pd.json_normalize(articles)

    abstract_col = 'MedlineCitation.Article.Abstract.AbstractText'
    title_col = 'MedlineCitation.Article.ArticleTitle'
    year_col = 'MedlineCitation.Article.Journal.JournalIssue.PubDate.Year'
    mesh_col = 'MedlineCitation.MeshHeadingList.MeshHeading'

    # a field that none of the articles provides has no column at all;
    # add it as all-missing instead of failing with a KeyError below
    for column in (abstract_col, title_col, year_col):
        if column not in df.columns:
            df[column] = float('nan')

    # pmid, title, and abstract
    df['pmid'] = df['MedlineCitation.PMID.#text']
    df['title'] = df[title_col]
    df['abstract'] = df[abstract_col].apply(cleanup_abstract)

    # publication year
    df['year'] = df[year_col]

    # MeSh topics (some datasets do not contain MeshHeading, e.g., Spin The Pots)
    if mesh_col in df.columns:
        df['mesh'] = df[mesh_col].apply(find_mesh)
    else:
        df['mesh'] = None

    # fill missing abstracts and titles with their #text value; the result is assigned
    # back because a chained `df[col].fillna(..., inplace=True)` works on a temporary
    # object and does not update `df` under pandas Copy-on-Write
    if abstract_col + '.#text' in df.columns:
        df['abstract'] = df['abstract'].fillna(df[abstract_col + '.#text'])

    if title_col + '.#text' in df.columns:
        df['title'] = df['title'].fillna(df[title_col + '.#text'])

    # make sure the output folder exists before writing
    csv_file.parent.mkdir(parents=True, exist_ok=True)
    df[['pmid', 'year', 'title','abstract', 'mesh']].to_csv(csv_file, index=False)

print('Done!')

[XML2CSV] converting "Analogy Making Task" dataset...
[XML2CSV] converting "Animal Shifting" dataset...
[XML2CSV] converting "Anti-Saccade Task" dataset...
[XML2CSV] converting "ANT (Attention Network Test)" dataset...
[XML2CSV] converting "Auditory Attention" dataset...
[XML2CSV] converting "AOSpan (Automated Operation Span Task)" dataset...
[XML2CSV] converting "Backward Span Task" dataset...
[XML2CSV] converting "Backward Color Recall Task" dataset...
[XML2CSV] converting "Balance Beam Task" dataset...
[XML2CSV] converting "Bear/Aligator Task" dataset...
[XML2CSV] converting "Block Design Subtest" dataset...
[XML2CSV] converting "Block Span" dataset...
[XML2CSV] converting "Box Crossing Dual Task" dataset...
[XML2CSV] converting "Boxes Task" dataset...
[XML2CSV] converting "CATT (Controlled Attention Task)" dataset...
[XML2CSV] converting "CNT (Contingency Naming Task)" dataset...
[XML2CSV] converting "CPT (Continuous Performance Task)" dataset...
[XML2CSV] converting "Category Flue

## Data cleansing

Here, we aim to preprocess the PubMed corpora and keep only the relevant metadata. Outputs of this pipeline are stored in the `data/pubmed` folder as CSV files, one per task corpus.

The following metadata will be stored in the processed csv files:

- pmid: unique PubMed identifier of the article.
- title: escaped title in string format.
- abstract: escaped and cleaned-up abstract in string format.
- year: publication year in YYYY format.
- mesh: a list of Medical Subject Headings (MeSH) that describe the field of research and other topics of the article. We only keep major topics.