In [87]:
%reload_ext autoreload
%autoreload 3

from owlready2 import get_ontology
from pathlib import Path
from dotenv import load_dotenv; load_dotenv()  # to load NCBI_API_KEY env variable
import pandas as pd
import re
from tqdm import tqdm
from IPython.display import clear_output

from src.cogtext.datasets.pubmed import convert_xml_to_csv

In [88]:
# Names of the ontology classes to collect data for; each one is looked up
# in ONTOLOGY and its descendants supply the PubMed queries.
CATEGORIES = ['CognitiveTask', 'CognitiveConstruct']

# NOTE(review): presumably the destination of the combined abstracts dataset;
# it is not referenced in the cells visible here, so confirm against later cells.
OUTPUT_PATH = 'data/pubmed/abstracts_2023.csv.gz'

# Load the ontology from the local OWL file with owlready2.
ONTOLOGY = get_ontology('data/ontologies/efo.owl').load()

In [89]:
# Maps category name -> {ontology entity name -> escaped PubMed query string}.
all_queries = {}

for category in CATEGORIES:
  # Keep only entities that carry at least one `pubmedQuery` annotation and
  # take the first one. Double quotes are backslash-escaped so the query can
  # later be embedded inside a double-quoted command-line argument.
  category_queries = dict(
    (term.name, str(term.pubmedQuery[0]).replace('"', '\\"'))
    for term in ONTOLOGY[category].descendants()
    if len(term.pubmedQuery) > 0
  )
  all_queries[category] = category_queries
  print(f'EF ontology contains {len(category_queries)} PubMed queries for {category}s.')

EF ontology contains 126 PubMed queries for CognitiveTasks.
EF ontology contains 72 PubMed queries for CognitiveConstructs.


In [93]:
import re

def convert_to_csv(category, subcategory, pubmed_file):
    """Convert a MEDLINE-formatted PubMed dump into a per-subcategory CSV file.

    The dump is split into records on blank lines, after wrapped field values
    (continuation lines indented by six spaces) have been joined back onto
    their field. For every record the PMID, DOI, publication year, title,
    journal title, journal ISO abbreviation and abstract are extracted.
    Records whose publication type starts with ``Book`` take their title from
    the ``BTI`` field and have no journal fields.

    A field that is absent from a record is stored as a missing value instead
    of raising, and chunks without a ``PMID`` line (for example the empty
    chunk after the final record) are skipped.

    Args:
        category: Category name; written to the ``category`` column and used
            as the output sub-directory.
        subcategory: Subcategory name; written to the ``subcategory`` column
            and used as the output file stem.
        pubmed_file: Path of the MEDLINE text file to read.

    Returns:
        None. The table is written to
        ``data/pubmed/{category}/{subcategory}.csv``.
    """

    def field(doc, tag, value=r'(.*)'):
        # MEDLINE tags sit in a 4-character, left-aligned column followed by
        # "- ". Anchoring on line boundaries stops one field from matching
        # inside another field's text.
        pattern = rf'^{re.escape(tag.ljust(4))}- {value}$'
        match = re.search(pattern, doc, re.MULTILINE)
        return match.group(1).strip() if match else None

    # Non-greedy and single-line on purpose: a record usually has several AID
    # lines ([pii], [doi], ...), and only the value on the [doi] line is wanted.
    doi_value = r'(.*?)[ \t]*\[doi\][ \t]*'

    with open(pubmed_file, 'r', encoding='utf-8') as f:
        docs = f.read().replace('\n      ', ' ').split('\n\n')

    records = []

    for doc in docs:
        pmid = field(doc, 'PMID')
        if pmid is None:
            # Not a record (blank or trailing chunk).
            continue

        # Fall back to the "other abstract" field when there is no AB field.
        abstract = field(doc, 'AB')
        if abstract is None:
            abstract = field(doc, 'OAB')

        if 'PT  - Book' in doc:
            title = field(doc, 'BTI')
            journal_title = None
            journal_iso = None
        else:
            title = field(doc, 'TI')
            journal_title = field(doc, 'JT')
            journal_iso = field(doc, 'TA')

        year = field(doc, 'DP', r'(\d{4}).*')

        # Some records carry the DOI only on a LID line.
        doi = field(doc, 'AID', doi_value) or field(doc, 'LID', doi_value)

        records.append((pmid, doi, year, title, journal_title, journal_iso, abstract))

    records = pd.DataFrame(records, columns=['pmid', 'doi', 'year', 'title', 'journal', 'journal_iso_abbreviation', 'abstract'])
    records['category'] = category
    records['subcategory'] = subcategory

    out_path = Path(f'data/pubmed/{category}/{subcategory}.csv')
    out_path.parent.mkdir(parents=True, exist_ok=True)
    records.to_csv(out_path, index=False)


In [94]:
from joblib import Parallel, delayed

import os
import subprocess
import shlex
import xmltodict

def pubmed_pipeline(category, subcategory, query, overwrite=True,
                    edirect_dir='/Users/morteza/edirect'):
    """Search PubMed with EDirect, cache the MEDLINE dump, and convert it to CSV.

    The query is run through ``esearch`` and, when it has hits, the results
    are fetched with ``efetch -format medline`` and stored under
    ``data/pubmed/.cache/{subcategory}.txt``. The cached file is then passed
    to ``convert_to_csv``.

    Args:
        category: Category name, forwarded to ``convert_to_csv``.
        subcategory: Subcategory name; any ``/`` is removed before it is used
            as the cache file stem and forwarded to ``convert_to_csv``.
        query: PubMed query. It is placed inside double quotes and tokenized
            with ``shlex``, so embedded double quotes must already be
            backslash-escaped by the caller.
        overwrite: When true, always query PubMed again; when false, reuse
            an existing cache file.
        edirect_dir: Directory containing the EDirect executables; it is
            prepended to ``PATH`` for the child processes.

    Returns:
        None. Returns early, without writing anything, when the search has
        no results.

    Raises:
        subprocess.CalledProcessError: If ``esearch`` or ``efetch`` exits
            with a non-zero status.
        RuntimeError: If ``efetch`` produces no output for a non-empty search.
        Exception: Whatever ``convert_to_csv`` raises is re-raised after a
            log line.
    """
    subcategory = subcategory.replace('/', '')
    fname = Path('data/pubmed/.cache') / (subcategory + '.txt')
    fname.parent.mkdir(parents=True, exist_ok=True)

    if overwrite or not fname.exists():
        edirect_env = os.environ.copy()
        edirect_env['PATH'] = f"{edirect_dir}{os.pathsep}{edirect_env.get('PATH', '')}"

        esearch_cmd = f'esearch -db pubmed -query "{query}" -pub abstract'
        efetch_cmd = 'efetch -format medline'

        # check=True surfaces a failed search here instead of as an XML
        # parsing error on empty output further down.
        esearch = subprocess.run(shlex.split(esearch_cmd),
                                 stdout=subprocess.PIPE, env=edirect_env,
                                 check=True)

        n_results = xmltodict.parse(esearch.stdout)['ENTREZ_DIRECT']['Count']
        if n_results == '0':
            print(f'[EDirect] No results for {subcategory}.')
            return

        print(f'[EDirect] Fetching {n_results} articles...')

        # efetch reads the esearch result (an XML message) from stdin.
        efetch = subprocess.run(shlex.split(efetch_cmd),
                                input=esearch.stdout,
                                stdout=subprocess.PIPE, env=edirect_env,
                                check=True)

        if not efetch.stdout.strip():
            raise RuntimeError(
                f'[EDirect] efetch returned no data for {subcategory}.')

        # Write to a temporary file and rename it, so an interrupted or failed
        # download never leaves a truncated cache file for later
        # overwrite=False runs to pick up.
        tmp_fname = fname.with_name(fname.name + '.tmp')
        tmp_fname.write_bytes(efetch.stdout)
        tmp_fname.replace(fname)

    print(f'[EDirect] Finished {fname}.')
    try:
        convert_to_csv(category, subcategory, fname)
    except Exception:
        print(f'[CSV] Failed to convert {subcategory}.')
        # Bare raise keeps the original traceback intact.
        raise

# Run the pipeline for every (category, subcategory) query, one batch per category.
for category, queries in all_queries.items():

    # One delayed call per subcategory; overwrite is disabled so existing
    # cache files are reused.
    jobs = [
        delayed(pubmed_pipeline)(category, name, pubmed_query, overwrite=False)
        for name, pubmed_query in queries.items()
    ]

    # Per-category folder that receives the generated CSV files.
    Path(f'data/pubmed/{category}/').mkdir(parents=True, exist_ok=True)

    # Execute the whole batch on seven workers.
    Parallel(n_jobs=7)(jobs)

# Drop the accumulated progress log and leave only a completion message.
clear_output()
print('Done!')

[EDirect] Finished data/pubmed/.cache/BART.txt.
[EDirect] Finished data/pubmed/.cache/OSpan_-_Operating_Span.txt.
[EDirect] Finished data/pubmed/.cache/TMT_-_Trail_Making_Task.txt.
[EDirect] Finished data/pubmed/.cache/ShapeSchool.txt.
[EDirect] Finished data/pubmed/.cache/OddOneOutTask.txt.
[EDirect] Finished data/pubmed/.cache/WCST_-_Wisconsin_Card_Sort_Test.txt.
[EDirect] Finished data/pubmed/.cache/TowerOfLondon.txt.
[EDirect] Finished data/pubmed/.cache/RunningSpan.txt.
[EDirect] Finished data/pubmed/.cache/GiftDelay.txt.
[EDirect] Finished data/pubmed/.cache/PVT_-_Psychomotor_Vigilance_task.txt.
[EDirect] Finished data/pubmed/.cache/DelayChoiceTask.txt.
[EDirect] Finished data/pubmed/.cache/BoxesTask.txt.
[EDirect] Finished data/pubmed/.cache/LMT_-_Letter_Memory_task.txt.
[EDirect] Finished data/pubmed/.cache/Sorting_task.txt.
[EDirect] Finished data/pubmed/.cache/StimSSS.txt.
[EDirect] Finished data/pubmed/.cache/DietaryDecisionsTask.txt.
[EDirect] Finished data/pubmed/.cache/Hi

..

[EDirect] Fetching 40 articles...
[EDirect] Fetching 139 articles...


.

[EDirect] Fetching 11967 articles...
[EDirect] Fetching 8472 articles...


.

[EDirect] Fetching 33445 articles...


...

[EDirect] Finished data/pubmed/.cache/Updating.txt.


..

[EDirect] Finished data/pubmed/.cache/RelationalReasoning.txt.


.

[EDirect] Fetching 961 articles...


..

[EDirect] Fetching 4633 articles...


......
.........

[EDirect] Finished data/pubmed/.cache/BehavioralRegulation.txt.


.

[EDirect] Fetching 94 articles...


.....

[EDirect] Finished data/pubmed/.cache/VisuospatialSketchpad.txt.


.

[EDirect] Fetching 9672 articles...


......
..
.......

[EDirect] Finished data/pubmed/.cache/ExecutiveControl.txt.


.

[EDirect] Fetching 3155 articles...


............

[EDirect] Finished data/pubmed/.cache/InhibitoryControl.txt.
[EDirect] Fetching 25 articles...


.

[EDirect] Finished data/pubmed/.cache/SelfMonitoring.txt.


.

[EDirect] Fetching 3634 articles...
[EDirect] Finished data/pubmed/.cache/HigherOrderExecutiveFunction.txt.
[EDirect] Fetching 28 articles...


.

[EDirect] Finished data/pubmed/.cache/DualTaskCoordination.txt.
[EDirect] Fetching 8682 articles...


../Users/morteza/edirect/join-into-groups-of: line 8: 94065 Killed: 9               xargs -n "$@" echo
     94066                       | sed 's/ /,/g'
     94068                       | grep '.'


KeyboardInterrupt: 