# search and cache hits

This notebook searches PubMed for all the queries defined in the ontology (i.e., properties of type `efo:pubmedQuery`), cleans up and aggregates the XML results, and stores all the search hits in a single CSV.

**NOTE**: notebook should be executed from the project root folder



## Output

Aggregated abstracts are stored as a single compressed file: `data/pubmed_abstracts.csv.gz`. This output file contains the following features:

- `category` (`str`): either `CognitiveTask` or `CognitiveConstruct` (the name of the folder under `data/pubmed/` that the hit was stored in)
- `subcategory` (`str`): task or construct name
- `pmid` (`long`): PubMed Identifier
- `doi` (`str`): DOI
- `year` (`int`): publication year in `yyyy` format
- `title` (`str`): publication title
- `abstract` (`str`): publication abstract
- `journal_title` (`str`): full journal title
- `journal_iso_abbreviation` (`str`): Abbreviated journal title
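- `mesh` (`str`, deprecated): A list of Medical Subject Headings which indicates the field of research and other topics. We only keep major topics. The extraction is currently disabled in the conversion code, so this column is not written to the output file.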

In [1]:
# install requirements
!pip install owlready2 pandas dask[dataframe]

# if you want to perform inference on the ontology (e.g., using hermit), then make sure to install java.

zsh:1: no matches found: dask[dataframe]


In [2]:
from owlready2 import get_ontology
import xmltodict
import pandas as pd
from pathlib import Path

from python.cogtext import cleanup_abstract, search_and_store, extract_doi, parse_publication_year



In [3]:
# when True, convert_xml_to_csv prints one progress line per converted dataset
DEBUG = True

# collect data for the following categories
# (each name is looked up in ONTOLOGY and is also used as the folder name under data/pubmed/)
CATEGORIES = ['CognitiveTask', 'CognitiveConstruct']

# destination of the aggregated, gzip-compressed corpus (relative to the project root)
OUTPUT_PATH = 'data/pubmed_abstracts.csv.gz'

# ontology file (relative to the project root); loaded once and reused by the cells below
OWL_FILE = 'data/ontologies/efo.owl'
ONTOLOGY = get_ontology(OWL_FILE).load()

In [4]:
# report how many ontology entities of each category carry a PubMed query
for category in CATEGORIES:
    # entity name -> first `pubmedQuery` value; entities without a query are left out
    pubmed_queries = {
        entity.name: entity.pubmedQuery[0]
        for entity in ONTOLOGY[category].descendants()
        if len(entity.pubmedQuery) > 0
    }
    print(f'EF ontology contains {len(pubmed_queries)} PubMed queries for {category}s.')

EF ontology contains 126 PubMed queries for CognitiveTasks.
EF ontology contains 72 PubMed queries for CognitiveConstructs.


In [5]:
def search_and_cache_xml(pubmed_queries, overwrite_existing=False):
    """Run the PubMed search for every query that has no cached XML file yet.

    Args:
        pubmed_queries (dict): maps a subcategory name to its PubMed query string.
        overwrite_existing (bool): when False (default), a subcategory whose cache
            file already exists is skipped; when True, it is searched again.

    For each subcategory, `search_and_store` is called with the query and the path
    `data/pubmed/.cache/<subcategory>.xml`; `/` is removed from the subcategory
    name so that it cannot introduce extra path components.
    """
    cache_dir = Path('data/pubmed/.cache')

    for subcategory, pubmed_query in pubmed_queries.items():
        fname = cache_dir / (subcategory.replace('/', '') + '.xml')

        if overwrite_existing or not fname.exists():
            # no other cell of this notebook creates the cache folder, so make sure
            # it exists before handing the target path to `search_and_store`
            cache_dir.mkdir(parents=True, exist_ok=True)
            search_and_store(pubmed_query, fname)


def _column_or_missing(df, column):
    """Return `df[column]` as an object column, or an all-NaN object column if it is absent."""
    if column in df.columns:
        return df[column].astype(object)
    return pd.Series(float('nan'), index=df.index, dtype=object)


def _drop_paragraph_separators(value):
    """Replace U+2029 with a space in strings; return any other value untouched."""
    return value.replace('\u2029', ' ') if isinstance(value, str) else value


def convert_xml_to_csv(pubmed_queries, category, overwrite_existing=False):
    """Clean up the cached XML search results and convert them into CSV files.

    For every subcategory in `pubmed_queries` (processed in sorted order) the file
    `data/pubmed/.cache/<subcategory>.xml` is parsed, flattened into a table and
    written to `data/pubmed/<category>/<subcategory>.csv` with the columns
    pmid, doi, year, journal_title, journal_iso_abbreviation, title and abstract.
    `/` is removed from the subcategory name when building both file names.

    Args:
        pubmed_queries (dict): only the keys (subcategory names) are used.
        category (str): name of the output folder under `data/pubmed/`.
        overwrite_existing (bool): when False (default), subcategories that already
            have a CSV file are skipped.

    Subcategories without a cached XML file, and XML files that contain no
    `PubmedArticle` entries, are skipped without writing anything.
    """
    article = 'MedlineCitation.Article'
    pub_date = f'{article}.Journal.JournalIssue.PubDate'
    output_columns = ['pmid', 'doi', 'year', 'journal_title', 'journal_iso_abbreviation', 'title', 'abstract']

    for subcategory in sorted(pubmed_queries.keys()):

        subcategory_fname = subcategory.replace('/', '')

        xml_file = Path('data/pubmed/.cache') / (subcategory_fname + '.xml')
        csv_file = Path(f'data/pubmed/{category}') / (subcategory_fname + '.csv')

        if not xml_file.exists():
            continue
        if csv_file.exists() and not overwrite_existing:
            continue

        if DEBUG:
            print(f'[XML2CSV] converting "{category}/{subcategory}" dataset...')

        # the file is only needed for parsing, so close it before the heavy lifting
        with open(xml_file, 'r') as f:
            xml_content = xmltodict.parse(f.read())

        # an empty <PubmedArticleSet/> is parsed to None, hence the `or {}`
        article_set = xml_content.get('PubmedArticleSet') or {}
        articles = article_set.get('PubmedArticle')
        if not articles:
            continue

        df = pd.json_normalize(articles)

        # pmid, doi, title, and abstract
        df['pmid'] = df['MedlineCitation.PMID.#text']
        df['doi'] = df['PubmedData.ArticleIdList.ArticleId'].apply(extract_doi)

        # Titles and abstracts that carry XML attributes are flattened into a separate
        # `...#text` column. The plain column may therefore be partly empty, or missing
        # altogether when every article of the dataset uses the attributed form.
        title = _column_or_missing(df, f'{article}.ArticleTitle')
        if f'{article}.ArticleTitle.#text' in df.columns:
            title = title.fillna(df[f'{article}.ArticleTitle.#text'])

        abstract = _column_or_missing(df, f'{article}.Abstract.AbstractText')
        abstract = abstract.apply(cleanup_abstract).astype(object)
        # fill missing abstracts with #text value
        if f'{article}.Abstract.AbstractText.#text' in df.columns:
            abstract = abstract.fillna(df[f'{article}.Abstract.AbstractText.#text'])

        # publication years: use PubDate.Year and fall back to the year parsed from
        # PubDate.MedlineDate where Year is missing
        year = _column_or_missing(df, f'{pub_date}.Year')
        if f'{pub_date}.MedlineDate' in df.columns:
            medline_year = df[f'{pub_date}.MedlineDate'].apply(parse_publication_year)
            year = year.fillna(medline_year)
        # int() raises if a year is still missing after the fallback
        df['year'] = year.apply(int)

        df['journal_title'] = df[f'{article}.Journal.Title']
        df['journal_iso_abbreviation'] = df[f'{article}.Journal.ISOAbbreviation']

        # MeSh topics (some datasets do not contain MeshHeading, e.g., Spin The Pots)
        # are currently not extracted.

        # workaround to discard unusual terminators in the text
        # (results are assigned back explicitly instead of using `fillna(inplace=True)`
        # on a column selection, which does not update the frame under Copy-on-Write)
        df['title'] = title.apply(_drop_paragraph_separators)
        df['abstract'] = abstract.apply(_drop_paragraph_separators)

        # do not rely on another cell having created the output folder
        csv_file.parent.mkdir(parents=True, exist_ok=True)
        df[output_columns].to_csv(csv_file, index=False)

In [6]:

# run the whole search -> cache -> convert pipeline, one category at a time
for category in CATEGORIES:

    # the per-category CSV files are written into this folder
    Path(f'data/pubmed/{category}').mkdir(parents=True, exist_ok=True)

    # entity name -> first `pubmedQuery` value, for every entity that defines one
    pubmed_queries = {
        entity.name: entity.pubmedQuery[0]
        for entity in ONTOLOGY[category].descendants()
        if len(entity.pubmedQuery) > 0
    }

    # query PubMed (cached hits are skipped), then turn the cached XML into CSV
    search_and_cache_xml(pubmed_queries)
    convert_xml_to_csv(pubmed_queries, category)

print('Done!')

[PubMed] query: ("Oddity Switch"[TIAB])
[PubMed] no article found.
[PubMed] query: ("Plus-Minus task"[TIAB]) OR ("Plus-Minus test"[TIAB])
[PubMed] no article found.
[PubMed] query: ("Mister X task"[TIAB])
[PubMed] no article found.
[PubMed] query: ("Delay Alteration task"[TIAB]) OR ("Delay Alteration test"[TIAB])
[PubMed] no article found.
[PubMed] query: ("Bear-Alligator task"[TIAB])
[PubMed] no article found.
[PubMed] query: ("Listening Recall"[TIAB])
[PubMed] no article found.
[PubMed] query: ("Pick the Picture game"[TIAB])
[PubMed] no article found.
[PubMed] query: ("Luria Hand Game"[TIAB])
[PubMed] no article found.
[PubMed] query: ("Nebraska Barnyard"[TIAB])
[PubMed] no article found.
[PubMed] query: ("matrix span"[TIAB])
[PubMed] no article found.
[PubMed] query: ("Digit-Shifting task"[TIAB])
[PubMed] no article found.
[PubMed] query: ("Expressive Attention task"[TIAB])
[PubMed] no article found.
[PubMed] query: ("Spatial Conflict Arrows"[TIAB])
[PubMed] no article found.
[PubMe

In [8]:
# aggregation: merge all per-subcategory CSV files into one compressed CSV
import dask.dataframe as dd
import re
from tqdm import tqdm

# glob order depends on the file system; sorting keeps the row order of the
# aggregated file reproducible (and lets tqdm show a total)
corpus_files = sorted(Path('data/pubmed/').glob('**/*.csv'))

dfs = []

for fname in tqdm(corpus_files):

  # find categories from the file name
  # (POSIX form of the path, so the `/` in the pattern also matches on Windows)
  cats = re.findall('.*/pubmed/(.*)/(.*)\\.csv', fname.as_posix())

  # ignore other csv files
  if len(cats) == 0:
    continue

  category = cats[0][0]
  subcategory = cats[0][1]

  df = dd.read_csv(fname)
  df['category'] = category
  df['subcategory'] = subcategory
  dfs.append(df)

# fail with a clear message instead of an opaque error from dd.concat
if len(dfs) == 0:
  raise FileNotFoundError('no CSV files found under data/pubmed/<category>/; run the search and conversion cells first')

# now aggregate all the corpora and store the compressed csv output (takes 85s on MacBook Pro 2018).
dd.concat(dfs).to_csv(OUTPUT_PATH, single_file=True, index=False, compression='gzip')

173it [00:02, 62.52it/s]


['/Users/morteza/workspace/cogtext/data/pubmed_abstracts.csv.gz']