In [1]:
from rdflib import OWL, Graph
from rdflib.namespace import RDFS
from owlready2 import get_ontology, default_world
import xmltodict
import time
from rdflib import URIRef
import random
import pandas as pd
import os, sys

# one liner to import cogtext package from `../python` folder.
if '../python' not in sys.path: sys.path.append('../python'); from cogtext import *




In [2]:
# Location of the OWL file and the IRI prefix of its terms; `owl_prefix` is
# interpolated into the SPARQL text built by `query()`.
owl_file = "../data/ontologies/efo.owl"
owl_prefix = "http://www.semanticweb.org/morteza/ontologies/2019/11/executive-functions-ontology#"

# Load the ontology with owlready2, then expose owlready2's default world as an
# rdflib graph so that it can be queried (see `query()`).
ONTOLOGY = get_ontology(owl_file).load()
GRAPH = default_world.as_rdflib_graph()

def query(graph, parent_cls='Task', prefix=None):
    """Look up the PubMed search string of every class under `parent_cls`.

    Runs a SPARQL query selecting each resource that is linked to `parent_cls`
    through zero or more `rdfs:subClassOf` steps and that carries both an
    `rdfs:label` and a `:pubmedQuery` value.

    ## Parameters
    graph
        Object with a `query(sparql_text)` method yielding two-item rows
        `(label, pubmed_query)` whose items provide `toPython()`.
    parent_cls
        Local name of the parent class; a leading ":" is accepted and dropped.
    prefix
        IRI bound to the empty SPARQL prefix (`:`). When `None` (the default),
        the module-level `owl_prefix` is used.

    ## Returns
    A dictionary mapping each label to its PubMed query string. When the same
    label appears in several rows, the last row wins.
    """

    cls_name = parent_cls[1:] if parent_cls.startswith(":") else parent_cls

    if prefix is None:
        prefix = owl_prefix

    # NOTE(review): `rdfs:` is not declared in the query text; presumably the
    # graph's own namespace bindings resolve it -- confirm.
    sparql = f"""
    prefix : <{prefix}>

    SELECT ?label ?pubmed_query
    WHERE {{
    ?task rdfs:subClassOf* :{cls_name};
          :pubmedQuery ?pubmed_query;
          rdfs:label ?label
    }}
    """

    # each row is (label, pubmed_query); convert both terms to plain Python values
    return {
        label.toPython(): pubmed_query.toPython()
        for label, pubmed_query in graph.query(sparql)
    }


In [3]:
# Collect the `:pubmedQuery` string of every labelled class found under
# `BaggettaTask` (via `rdfs:subClassOf*`) in the loaded ontology graph.
pubmed_queries = query(GRAPH, 'BaggettaTask')   # dictionary of task_name -> pubmed_query

print(f'{len(pubmed_queries)} tasks found in the EF ontology (baggetta2016)')

106 tasks found in the EF ontology (baggetta2016)


In [4]:
# from pybliometrics.scopus import ScopusSearch
# s = ScopusSearch('Digit Span')
# print(s)
# fetch = PubMedFetcher(cachedir='../data/cache/pubmed/')

# Search and cache hits

Let's search PubMed with each task's query (`ef:pubmedQuery`) and cache the raw XML results, then clean them up into one CSV file per task with the following columns: `pmid` (a reference to the article's metadata), `title`, and `abstract`.


In [11]:
# Step 1: run each PubMed query and cache the raw XML response on disk.
for task_label, pubmed_query in pubmed_queries.items():
    task_label = task_label.replace('/','')   # '/' would be read as a path separator
    fname = Path('../data/cache') / (task_label + '.xml')

    # an existing cache file means this task was already fetched; skip the search
    if not fname.exists():
        search_and_store(pubmed_query, fname)


# Step 2: clean up the cached XML and convert all abstracts into CSV files.
for task_label in pubmed_queries.keys():
    print(f'[XML2CSV] converting "{task_label}" dataset...')

    task_label = task_label.replace('/','')

    xml_file = Path('../data/cache') / (task_label + '.xml')
    csv_file = Path('../data/pubmed') / (task_label + '.csv')

    # Read with an explicit encoding instead of the platform default
    # (assumes the cached PubMed XML is UTF-8 -- TODO confirm), and release the
    # file handle before the DataFrame work starts.
    with open(xml_file, 'r', encoding='utf-8') as f:
        xml_content = xmltodict.parse(f.read())

    # A query without hits may yield an empty <PubmedArticleSet>, which
    # xmltodict parses to None; skip such tasks instead of failing on the lookup.
    article_set = xml_content.get('PubmedArticleSet')
    articles = article_set.get('PubmedArticle') if isinstance(article_set, dict) else None
    if not articles:
        print(f'[XML2CSV] no articles found for "{task_label}", skipping.')
        continue

    df = pd.json_normalize(articles)

    # Plain-text abstracts/titles land in the base column, while entries that
    # carry nested markup are flattened into a sibling `...#text` column.
    # Either column may be absent when no record uses that form.
    abstract_col = 'MedlineCitation.Article.Abstract.AbstractText'
    title_col = 'MedlineCitation.Article.ArticleTitle'

    raw_abstracts = df.get(abstract_col)
    df['abstract'] = raw_abstracts.apply(cleanup_abstract) if raw_abstracts is not None else None
    df['pmid'] = df['MedlineCitation.PMID.#text']
    df['title'] = df.get(title_col)

    # Fill missing abstracts/titles with #text. Assign the result back: a chained
    # `df[col].fillna(..., inplace=True)` operates on a temporary object and
    # leaves `df` unchanged under pandas Copy-on-Write.
    if abstract_col + '.#text' in df.columns:
        df['abstract'] = df['abstract'].fillna(df[abstract_col + '.#text'])

    if title_col + '.#text' in df.columns:
        df['title'] = df['title'].fillna(df[title_col + '.#text'])

    df[['pmid', 'title','abstract']].to_csv(csv_file, index=False)

print('Done!')

[PubMed] ("Analogy Making"[TIAB])
[PubMed] Succesfully stored 17 hits on NCBI history server.
[PubMed] Succesfully stored hits in ../data/cache/Analogy Making Task.xml.
[PubMed] ("Animal Shifting"[TIAB])
[PubMed] Succesfully stored 0 hits on NCBI history server.
[PubMed] Succesfully stored hits in ../data/cache/Animal Shifting.xml.
[PubMed] ("Anti-saccade task"[TIAB]) OR ("anti-saccade test"[TIAB])
[PubMed] Succesfully stored 90 hits on NCBI history server.
[PubMed] Succesfully stored hits in ../data/cache/Anti-Saccade Task.xml.
[PubMed] ("Attention Network test"[TIAB]) OR ("("Attention Network task"[TIAB])") OR ("ANT test"[TIAB]) OR ("ANT task"[TIAB])
[PubMed] Succesfully stored 422 hits on NCBI history server.
[PubMed] Succesfully stored hits in ../data/cache/ANT (Attention Network Test).xml.
[PubMed] ("Auditory Attention test"[TIAB]) OR ("Auditory Attention task"[TIAB])
[PubMed] Succesfully stored 42 hits on NCBI history server.
[PubMed] Succesfully stored hits in ../data/cache/Audi