In [1]:
from rdflib import OWL, Graph
from rdflib.namespace import RDFS
from owlready2 import get_ontology, default_world
import xmltodict
from collections import OrderedDict 
import time
from rdflib import URIRef
import random
import pandas as pd
import os, sys

# one liner to import cogtext package from `../python` folder.
if '../python' not in sys.path: sys.path.append('../python'); from cogtext import *




In [2]:
# Relative path to the OWL file that holds the ontology.
owl_file = "../data/ontologies/efo.owl"
# Namespace IRI of the ontology; `query` below binds it to the empty (`:`) SPARQL prefix.
owl_prefix = "http://www.semanticweb.org/morteza/ontologies/2019/11/executive-functions-ontology#"

# Load the ontology with owlready2, then obtain an rdflib graph object from
# owlready2's default world; that graph is what gets passed to `query`.
ONTOLOGY = get_ontology(owl_file).load()
GRAPH = default_world.as_rdflib_graph()

def query(graph, parent_cls='Task'):
    """Collect the `rdfs:label` values of an ontology class and its subclasses.

    The class is looked up in the namespace given by the module-level
    `owl_prefix`. Because the SPARQL path is `rdfs:subClassOf*`, the parent
    class itself is matched as well as every transitive subclass.

    ## Parameters
    graph: object exposing a SPARQL `query(str)` method (e.g. an rdflib graph).
    parent_cls: local name of the parent class, with or without a leading ':'.

    ## Returns
    A flat list of labels converted to plain Python values.
    """

    # accept both ':Task' and 'Task'
    if parent_cls.startswith(":"):
        cls_name = parent_cls[1:]
    else:
        cls_name = parent_cls

    sparql = f"""
    prefix : <{owl_prefix}>

    SELECT ?label
    WHERE {{
    ?task rdfs:subClassOf* :{cls_name};
            rdfs:label ?label
    }}
    """

    # every result row is a tuple of selected variables; unpack each row and
    # turn its RDF terms into native Python objects
    found = []
    for row in graph.query(sparql):
        for term in row:
            found.append(term.toPython())
    return found


In [3]:
# Each label found under the `CognitiveTask` class becomes one search term.
task_terms = query(GRAPH, parent_cls='CognitiveTask')

print(f'{len(task_terms)} tasks found in the ontology (enkavi, baggetta, dimond, etc).')

134 tasks found in the ontology (enkavi, baggetta, dimond, etc).


The following cell uses a custom-made PubMed client to search, cache, and fetch results. This client uses the NCBI history API to make searching scalable, so there is no limit on the number of results.

In [4]:
# from pybliometrics.scopus import ScopusSearch
# s = ScopusSearch('Digit Span')
# print(s)
# fetch = PubMedFetcher(cachedir='../data/cache/pubmed/')

# Search PubMed for every task term and cache the raw XML on disk.
# Terms whose cache file already exists are skipped, so re-running this cell
# only fetches what is still missing.
for term in task_terms:
    term = term.lower()
    # to avoid '/' in terms turning into path separator
    fpath = Path('../data/cache') / (term.replace('/','') + '.xml')
    if not fpath.exists():
        # time each search so slow terms are visible in the cell output
        t0 = time.time()
        search_and_store(term, fpath, suffix=['task', 'test'])
        print(f'cached "{term}" in {time.time() - t0:.1f}s')

Now, let's clean up the XML mess and keep only the abstract, the title, and some references to the original data.

In [3]:
def cleanup_abstract(abstract_text):
    """Strip PubMed's semantic tags from a parsed abstract and keep only the text.

    After XML-to-dict parsing, an abstract can be a plain string, a mapping
    with the text under the '#text' key, or a list mixing both.

    ## Parameters
    abstract_text: a string, a dict/OrderedDict, a list of those, or a missing
        value such as NaN.

    ## Returns
    The text as a single string (list items are joined with a space). A mapping
    without '#text' yields ''. Any other input, including an existing string or
    NaN, is returned unchanged so that callers can still fill missing values.
    """

    def select_content(part):
        """Return the text carried by one abstract part, or '' when it has none."""
        if isinstance(part, str):
            return part
        if isinstance(part, dict):
            # `or ''` guards against an explicit None under '#text'
            return part.get('#text') or ''
        return ''

    if isinstance(abstract_text, list):
        return ' '.join([select_content(a) for a in abstract_text])
    # `dict` also covers OrderedDict; newer xmltodict versions return plain dicts
    elif isinstance(abstract_text, dict):
        return select_content(abstract_text)
    return abstract_text    # when the abstract is a string (or missing)


# now cleanup and convert all abstracts into CSV files
# (one CSV per term, with the columns: title, abstract, pmid)
for term in task_terms:
    print(f'converting "{term}" dataset...')

    # The search cell lower-cases each term before naming its cache file, so the
    # same normalization is needed here to find it on case-sensitive filesystems.
    in_path = Path('../data/cache') / (term.lower().replace('/','') + '.xml')
    out_path = Path('../data/pubmed') / (term.replace('/','') + '.csv')

    # NOTE(review): the file is read with the platform default encoding, as before;
    # confirm it matches the encoding `search_and_store` writes with.
    with open(in_path, 'r') as f:
        cache = xmltodict.parse(f.read())

    # An empty <PubmedArticleSet/> parses to None, and a result set may hold no
    # <PubmedArticle> entries at all; there is nothing to convert in either case.
    article_set = cache.get('PubmedArticleSet')
    articles = article_set.get('PubmedArticle') if isinstance(article_set, dict) else None
    if not articles:
        continue

    df = pd.json_normalize(articles)

    abstract_col = 'MedlineCitation.Article.Abstract.AbstractText'
    title_col = 'MedlineCitation.Article.ArticleTitle'
    # json_normalize flattens dict-valued fields into '<col>.#text', so the plain
    # column is absent when no record stores a string/list there; fall back to an
    # all-missing column instead of raising KeyError.
    missing = pd.Series(index=df.index, dtype='object')

    # df[df['MedlineCitation.PMID.#text'] == "33686858"]['MedlineCitation.Article.Abstract.AbstractText']
    df['abstract'] = df.get(abstract_col, missing).apply(cleanup_abstract)
    df['pmid'] = df['MedlineCitation.PMID.#text']
    df['title'] = df.get(title_col, missing)

    # fill missing abstracts/titles with #text
    # (assign the result back: chained `fillna(..., inplace=True)` does not
    # update the frame under pandas Copy-on-Write)
    if abstract_col + '.#text' in df.columns:
        df['abstract'] = df['abstract'].fillna(df[abstract_col + '.#text'])

    if title_col + '.#text' in df.columns:
        df['title'] = df['title'].fillna(df[title_col + '.#text'])

    df[['title','abstract','pmid']].to_csv(out_path, index=False)


print('Done!')

converting "Cognitive Control" dataset...
converting "Executive Function" dataset...
Done!
