# Data Collection (2023)

This is the code we used for our 2023 data collection. It retrieves PubMed abstracts using EDirect. For the 2021 data collection, see the [Data Collection (2021) notebook](./1%20Data%20Collection%20(2021).ipynb).

The code is divided into 3 parts:
1. Load the EFO ontology and extract all task and construct names and queries.
2. Retrieve the matching articles from PubMed in MEDLINE format.
3. Parse the MEDLINE files and extract the PMID, DOI, year, title, journal title, and abstract of each article. The output dataset is stored in `data/pubmed/abstracts_2023.csv.gz`.

## Requirements

-  Before running the notebook, make sure you have EDirect installed. You can find instructions on how to install it here: https://www.ncbi.nlm.nih.gov/books/NBK179288/

- You also need to set the `NCBI_API_KEY` environment variable to your NCBI API key. You can get one here: https://www.ncbi.nlm.nih.gov/account/settings/

- This notebook uses task and construct names from the EFO ontology. Make sure the latest version of the ontology is available at `data/ontologies/efo.owl`: download the OWL file from https://huggingface.co/datasets/morteza/cogtext/blob/main/ontologies/efo.owl and put it in the `data/ontologies/` directory.

- Activate the `cogtext` conda environment: `mamba activate cogtext`

In [1]:
%reload_ext autoreload
%autoreload 3

import os
import re
import shlex
import subprocess
from pathlib import Path

import pandas as pd
import xmltodict
from IPython.display import clear_output
from joblib import Parallel, delayed
from owlready2 import get_ontology
from tqdm import tqdm

In [3]:
# destination of the aggregated, gzip-compressed corpus
OUTPUT_PATH = 'data/pubmed/abstracts_2023.csv.gz'

# ontology classes whose descendants are searched on PubMed
CATEGORIES = ['CognitiveTask', 'CognitiveConstruct']

# parse the local OWL file of the ontology
ONTOLOGY = get_ontology('data/ontologies/efo.owl').load()

In [3]:
# category name -> {entity name -> PubMed query}
all_queries = {}

for category in CATEGORIES:
    # Only descendants that carry at least one `pubmedQuery` annotation are
    # kept, and only the first query of each is used. Double quotes are
    # backslash-escaped because the query is later embedded in a double-quoted
    # command string that is tokenized with `shlex.split`.
    category_queries = dict(
        (entity.name, str(entity.pubmedQuery[0]).replace('"', '\\"'))
        for entity in ONTOLOGY[category].descendants()
        if len(entity.pubmedQuery) > 0
    )
    all_queries[category] = category_queries
    print(f'EF ontology contains {len(category_queries)} PubMed queries for {category}s.')

EF ontology contains 126 PubMed queries for CognitiveTasks.
EF ontology contains 72 PubMed queries for CognitiveConstructs.


In [13]:
def _medline_field(pattern, doc, label=None):
    """Return group 1 of the first match of `pattern` in one MEDLINE record.

    `pattern` is searched with `re.MULTILINE`, so `^` and `$` anchor it to a
    single line of the record.

    Args:
        pattern: regular expression with one capturing group.
        doc: text of a single MEDLINE record.
        label: when given, the field is required and its absence is an error.

    Returns:
        The captured text, or `None` when an optional field is absent.

    Raises:
        ValueError: if `label` is given and the pattern does not match.
    """
    match = re.search(pattern, doc, re.MULTILINE)
    if match is not None:
        return match.group(1)
    if label is not None:
        raise ValueError(f'MEDLINE record has no {label} field:\n{doc}')
    return None


def convert_medline_to_csv(category, subcategory, pubmed_file):
    """Convert a MEDLINE-formatted text file into a per-subcategory CSV file.

    The file is split into records on blank lines, one row is extracted per
    record, and the table is written to
    `data/pubmed/{category}/{subcategory}.csv` with the columns `pmid`, `doi`,
    `year`, `title`, `journal_title`, `journal_iso_abbreviation`, `abstract`,
    `category` and `subcategory`.

    Args:
        category: value of the `category` column and name of the output folder.
        subcategory: value of the `subcategory` column and output file stem.
        pubmed_file: path of the MEDLINE text file to parse.

    Raises:
        ValueError: if a record lacks a PMID, a 4-digit publication year, a
            title, or an abstract (neither `AB` nor `OAB`).
    """

    # Decode explicitly instead of relying on the platform's default encoding
    # (assumes efetch emits UTF-8 -- TODO confirm).
    with open(pubmed_file, 'r', encoding='utf-8') as f:
        # continuation lines are indented by six spaces: fold them back into
        # the field they belong to, then split the records on blank lines
        docs = f.read().replace('\n      ', ' ').split('\n\n')

    records = []

    for doc in docs:
        # leading/trailing blank lines produce empty chunks; they are not records
        if not doc.strip():
            continue

        # prefer the regular abstract and fall back to the `OAB` field
        abstract = _medline_field(r'^AB\s\s-\s(.*)$', doc)
        if abstract is None:
            abstract = _medline_field(r'^OAB\s-\s(.*)$', doc, label='AB/OAB')

        if 'PT  - Book' in doc:
            # book records carry their title in `BTI` and have no journal
            title = _medline_field(r'^BTI\s-\s(.*)$', doc, label='BTI')
            journal_title = None
            journal_iso = None
        else:
            title = _medline_field(r'^TI\s\s-\s(.*)$', doc, label='TI')
            # the journal fields are optional and looked up independently, so
            # a missing abbreviation does not discard an available title
            journal_title = _medline_field(r'^JT\s\s-\s(.*)$', doc)
            journal_iso = _medline_field(r'^TA\s\s-\s(.*)$', doc)

        pmid = _medline_field(r'^PMID-\s(.*)$', doc, label='PMID')
        year = _medline_field(r'^DP\s\s-\s(\d{4})', doc, label='DP (year)')

        # The DOI is the value of the first `AID`/`LID` line tagged `[doi]`.
        # The match is confined to one line and is `None` when there is no
        # such line, so a value can never leak in from a previous record.
        doi = _medline_field(r'^(?:AID|LID)\s-\s(.*?)\s*\[doi\]', doc)

        records.append((pmid, doi, year, title, journal_title, journal_iso, abstract))

    records = pd.DataFrame(records, columns=['pmid',
                                             'doi', 'year', 'title', 'journal_title', 'journal_iso_abbreviation', 'abstract'])
    records['category'] = category
    records['subcategory'] = subcategory

    # make sure the per-category folder exists before writing into it
    output_file = Path(f'data/pubmed/{category}/{subcategory}.csv')
    output_file.parent.mkdir(parents=True, exist_ok=True)
    records.to_csv(output_file, index=False)

In [14]:
def pubmed_pipeline(category, subcategory, query, overwrite=True,
                    edirect_dir='~/edirect'):
    """Fetch the PubMed records matching `query` and convert them to CSV.

    The records are retrieved with EDirect (`esearch` piped into `efetch`) in
    MEDLINE format and cached in `data/pubmed/.cache/{subcategory}.txt`; the
    cached file is then handed to `convert_medline_to_csv`.

    Args:
        category: category the query belongs to (forwarded to the converter).
        subcategory: name of the queried entity; any `/` is removed before it
            is used as a file name.
        query: PubMed query. It is embedded in a double-quoted command string
            that is tokenized with `shlex.split`, so any double quote inside
            it must already be backslash-escaped.
        overwrite: when false, an existing cache file is reused instead of
            querying PubMed again.
        edirect_dir: folder containing the EDirect executables; it is
            prepended to `PATH` for the subprocesses only.

    Returns:
        None. Nothing is written when the search has no results.

    Raises:
        RuntimeError: if `esearch` produces no output, or `efetch` fails or
            produces no output. Nothing is cached in that case, so a later
            run retries the download.
        Exception: whatever `convert_medline_to_csv` raises is re-raised.
    """
    subcategory = subcategory.replace('/', '')
    fname = Path('data/pubmed/.cache') / (subcategory + '.txt')
    fname.parent.mkdir(parents=True, exist_ok=True)

    if overwrite or not fname.exists():
        edirect_dir = os.path.expanduser(edirect_dir)

        # create a temporary environment with edirect in the PATH
        edirect_env = os.environ.copy()
        edirect_env['PATH'] = os.pathsep.join(
            part for part in (edirect_dir, edirect_env.get('PATH', '')) if part)

        # define esearch and efetch commands
        esearch_cmd = f'esearch -db pubmed -query "{query}" -pub abstract'
        efetch_cmd = 'efetch -format medline'

        # run esearch to retrieve the list of PMIDs
        esearch = subprocess.run(shlex.split(esearch_cmd),
                                 stdout=subprocess.PIPE, env=edirect_env)
        if not esearch.stdout.strip():
            raise RuntimeError(
                f'[EDirect] esearch returned no output for {subcategory} '
                f'(exit code {esearch.returncode})')

        n_results = xmltodict.parse(esearch.stdout)['ENTREZ_DIRECT']['Count']
        if n_results == '0':
            print(f'[EDirect] No results for {subcategory}')
            return

        print(f'[EDirect] Fetching {n_results} articles for {subcategory}...')

        # run efetch to retrieve the abstracts in medline format
        efetch = subprocess.run(shlex.split(efetch_cmd),
                                input=esearch.stdout,
                                stdout=subprocess.PIPE, env=edirect_env)
        if efetch.returncode != 0 or not efetch.stdout.strip():
            raise RuntimeError(
                f'[EDirect] efetch failed for {subcategory} '
                f'(exit code {efetch.returncode})')

        # Store the results through a temporary file that is renamed once it
        # is complete: an interrupted or failed download must not leave a
        # cache file behind that `overwrite=False` would silently reuse.
        tmp_fname = fname.with_name(fname.name + '.tmp')
        tmp_fname.write_bytes(efetch.stdout)
        tmp_fname.replace(fname)

    print(f'[EDirect] Finished {subcategory}')
    try:
        convert_medline_to_csv(category, subcategory, fname)
    except Exception:
        print(f'[CSV] Failed to convert {subcategory}.')
        # bare raise keeps the original traceback
        raise

# Download and convert every subcategory, one category after the other. The
# subcategories of a category are processed by parallel workers: make sure
# n_jobs does not exceed the NCBI API limit.
for category, queries in all_queries.items():

    # folder that receives the per-subcategory CSV files of this category
    os.makedirs(f'data/pubmed/{category}/', exist_ok=True)

    # one deferred call per (entity name, PubMed query) pair; cached downloads
    # are reused because overwrite is disabled
    jobs = [
        delayed(pubmed_pipeline)(category, name, pubmed_query, overwrite=False)
        for name, pubmed_query in queries.items()
    ]

    Parallel(n_jobs=7)(jobs)

# drop the progress messages printed by the workers and report completion
clear_output()
print('Done!')

Done!


In [4]:
# aggregate all the data into a single file

# The per-subcategory files are stored as data/pubmed/<category>/<subcategory>.csv.
# Hidden folders (names starting with a dot) are skipped, and the paths are
# sorted so that the row order of the output is the same on every run.
corpus_files = sorted(
    path for path in Path('data/pubmed/').glob('*/*.csv')
    if not path.parent.name.startswith('.')
)

dfs = []

for fname in tqdm(corpus_files):

    # find categories from the path components, which works with any path
    # separator (a regex on the string form would assume '/')
    category = fname.parent.name
    subcategory = fname.stem

    # load the data and add category and subcategory columns
    df = pd.read_csv(fname)
    df['category'] = category
    df['subcategory'] = subcategory
    dfs.append(df)

if not dfs:
    raise ValueError('No CSV files found in data/pubmed/<category>/; run the retrieval step first.')

# now aggregate all the data and store the compressed csv output (takes ~ 2min).
pd.concat(dfs, ignore_index=True).to_csv(OUTPUT_PATH, index=False, compression='gzip')

174it [00:28,  6.08it/s]


In [2]:
# load both corpora and compare their sizes as (rows, columns)
pubmed2021, pubmed2023 = map(pd.read_csv, (
    'data/pubmed/abstracts_2021.csv.gz',
    'data/pubmed/abstracts_2023.csv.gz',
))
pubmed2021.shape, pubmed2023.shape


((531748, 9), (635601, 9))