In [1]:
%reload_ext autoreload
%autoreload 3

from owlready2 import get_ontology
from pathlib import Path
from dotenv import load_dotenv; load_dotenv()  # to load NCBI_API_KEY env variable
import pandas as pd
import re
from tqdm import tqdm
from IPython.display import clear_output

from src.cogtext.datasets.pubmed import convert_xml_to_csv

In [3]:
# collect data for the following categories
# (each name is used as a key into ONTOLOGY when the PubMed queries are gathered)
CATEGORIES = ['CognitiveTask', 'CognitiveConstruct']

# destination of the aggregated corpus; the aggregation step writes it gzip-compressed
OUTPUT_PATH = 'data/pubmed/abstracts_2023.csv.gz'

# ontology loaded from a local OWL file; its entities carry the `pubmedQuery` values used below
ONTOLOGY = get_ontology('data/ontologies/efo.owl').load()

In [3]:
# Build a nested mapping {category: {entity name: PubMed query}} from the ontology.
all_queries = {}

for category in CATEGORIES:
  category_queries = {}
  for e in ONTOLOGY[category].descendants():
    # entities without any pubmedQuery value contribute nothing
    if len(e.pubmedQuery) == 0:
      continue
    # only the first query is used; double quotes are backslash-escaped because
    # the query is later embedded inside a double-quoted command-line argument
    category_queries[e.name] = str(e.pubmedQuery[0]).replace('"', '\\"')
  all_queries[category] = category_queries
  print(f'EF ontology contains {len(category_queries)} PubMed queries for {category}s.')

EF ontology contains 126 PubMed queries for CognitiveTasks.
EF ontology contains 72 PubMed queries for CognitiveConstructs.


In [13]:
import re

def _medline_field(doc, tag, value_pattern=r'(.*)'):
    """Return the first value of MEDLINE field `tag` in `doc`, or None if absent.

    A MEDLINE line is the tag padded to four characters followed by ``- `` and
    the value. The pattern is anchored to the start of a line so that a tag
    cannot match inside a longer tag or inside free text of another field.

    Args:
        doc: text of one record, with continuation lines already joined.
        tag: field tag without padding, e.g. ``'AB'`` or ``'PMID'``.
        value_pattern: regex with exactly one capturing group for the value.
    """
    found = re.search(rf'^{re.escape(tag.ljust(4))}- {value_pattern}$', doc, re.MULTILINE)
    return found.group(1) if found else None


def _parse_medline_record(doc):
    """Convert one MEDLINE record into the tuple of columns stored in the CSV.

    Returns:
        (pmid, doi, year, title, journal_title, journal_iso, abstract)

    Raises:
        ValueError: if PMID, publication year, title or abstract is missing.
    """
    def required(tag, value_pattern=r'(.*)'):
        value = _medline_field(doc, tag, value_pattern)
        if value is None:
            raise ValueError(f'MEDLINE record has no usable {tag} field:\n{doc}')
        return value

    pmid = required('PMID')

    # fall back to the "other abstract" field when there is no regular abstract
    abstract = _medline_field(doc, 'AB')
    if abstract is None:
        abstract = required('OAB')

    if 'PT  - Book' in doc:
        # book records carry their title in BTI and have no journal
        title = required('BTI')
        journal_title = None
        journal_iso = None
    else:
        title = required('TI')
        # journal fields are optional and looked up independently, so a missing
        # abbreviation no longer discards a journal title that is present
        journal_title = _medline_field(doc, 'JT')
        journal_iso = _medline_field(doc, 'TA')

    year = required('DP', r'(\d{4}).*')

    # Match a single AID/LID line tagged [doi]. The match must stay on one line:
    # a record usually lists several identifiers ([pii], [doi]) on separate lines,
    # and a greedy multi-line match would capture all of them as one "DOI".
    doi_match = re.search(r'^(?:AID|LID) - (.+?)[ \t]*\[doi\][ \t]*$', doc, re.MULTILINE)
    doi = doi_match.group(1) if doi_match else None

    return (pmid, doi, year, title, journal_title, journal_iso, abstract)


def convert_to_csv(category, subcategory, pubmed_file, output_dir='data/pubmed'):
    """Convert a MEDLINE-format text file into ``{output_dir}/{category}/{subcategory}.csv``.

    Args:
        category: value of the `category` column and name of the output folder.
        subcategory: value of the `subcategory` column and stem of the output file.
        pubmed_file: path of the MEDLINE text file to read (UTF-8).
        output_dir: root folder for the CSV files; created if necessary.

    Raises:
        ValueError: if a record lacks a PMID, year, title or abstract.
    """
    with open(pubmed_file, 'r', encoding='utf-8') as f:
        # continuation lines are indented by six spaces; records are separated by blank lines
        docs = f.read().replace('\n      ', ' ').split('\n\n')

    # blank chunks (e.g. produced by trailing newlines) are not records
    records = [_parse_medline_record(doc) for doc in docs if doc.strip()]

    records = pd.DataFrame(records, columns=['pmid', 'doi', 'year', 'title', 'journal', 'journal_iso_abbreviation', 'abstract'])
    records['category'] = category
    records['subcategory'] = subcategory

    output_file = Path(output_dir) / category / f'{subcategory}.csv'
    output_file.parent.mkdir(parents=True, exist_ok=True)
    records.to_csv(output_file, index=False)

# Smoke-test the converter on one cached download. The cache file is only
# produced by the download step, so skip instead of failing when it is absent
# (e.g. on a fresh checkout with a fresh kernel).
sample_file = Path('data/pubmed/.cache/Attention.txt')
if sample_file.exists():
    Path('data/pubmed/CognitiveConstruct').mkdir(parents=True, exist_ok=True)
    convert_to_csv('CognitiveConstruct', 'Attention', sample_file)
else:
    print(f'[CSV] Skipping smoke test: {sample_file} does not exist yet.')

In [14]:
from joblib import Parallel, delayed

import os
import subprocess
import shlex
import xmltodict

def pubmed_pipeline(category, subcategory, query, overwrite=True,
                    edirect_dir='/Users/morteza/edirect'):
    """Download the PubMed hits of one query with EDirect and convert them to CSV.

    The MEDLINE output is cached in ``data/pubmed/.cache/{subcategory}.txt`` and
    then passed to `convert_to_csv`.

    Args:
        category: category name forwarded to `convert_to_csv`.
        subcategory: subcategory name; any ``/`` is removed so that it can be
            used as a file name.
        query: PubMed query. It is embedded in a double-quoted shell-style
            argument, so double quotes inside it must already be
            backslash-escaped.
        overwrite: when False, an existing cache file is reused instead of
            querying PubMed again.
        edirect_dir: folder containing the `esearch` / `efetch` executables; it
            is prepended to PATH for the subprocesses.
            NOTE(review): the default is a machine-specific absolute path.

    Raises:
        subprocess.CalledProcessError: if `esearch` or `efetch` exits non-zero.
        RuntimeError: if `efetch` returns no data although hits were reported.
        Exception: whatever `convert_to_csv` raises is re-raised unchanged.
    """
    subcategory = subcategory.replace('/', '')
    fname = Path('data/pubmed/.cache') / (subcategory + '.txt')
    fname.parent.mkdir(parents=True, exist_ok=True)

    if overwrite or not fname.exists():
        edirect_env = os.environ.copy()
        edirect_env['PATH'] = f"{edirect_dir}{os.pathsep}{edirect_env.get('PATH', '')}"

        esearch_cmd = f'esearch -db pubmed -query "{query}" -pub abstract'
        efetch_cmd = 'efetch -format medline'

        # check=True: a failed search must not be mistaken for an empty result
        esearch = subprocess.run(shlex.split(esearch_cmd),
                                 stdout=subprocess.PIPE, env=edirect_env,
                                 check=True)

        n_results = xmltodict.parse(esearch.stdout)['ENTREZ_DIRECT']['Count']
        if n_results == '0':
            print(f'[EDirect] No results for {subcategory}')
            return

        print(f'[EDirect] Fetching {n_results} articles for {subcategory}...')

        # efetch reads the esearch XML from stdin
        efetch = subprocess.run(shlex.split(efetch_cmd),
                                input=esearch.stdout,
                                stdout=subprocess.PIPE, env=edirect_env,
                                check=True)

        if not efetch.stdout.strip():
            raise RuntimeError(
                f'[EDirect] efetch returned no data for {subcategory} '
                f'although {n_results} results were reported')

        # Write to a temporary file and rename it afterwards, so that an
        # interrupted or failed download never leaves a truncated cache file
        # that later runs with overwrite=False would silently reuse.
        partial = fname.with_name(fname.name + '.part')
        partial.write_bytes(efetch.stdout)
        partial.replace(fname)

    print(f'[EDirect] Finished {subcategory}')
    try:
        convert_to_csv(category, subcategory, fname)
    except Exception:
        print(f'[CSV] Failed to convert {subcategory}.')
        raise

# Run the download + conversion pipeline for every query, one category at a time.
for category, queries in all_queries.items():

    # make sure the per-category output folder exists before the jobs start
    category_dir = Path(f'data/pubmed/{category}/')
    category_dir.mkdir(parents=True, exist_ok=True)

    # one delayed job per subcategory; cached downloads are reused (overwrite=False)
    jobs = []
    for subcategory, query in queries.items():
        jobs.append(delayed(pubmed_pipeline)(category, subcategory, query, overwrite=False))

    Parallel(n_jobs=7)(jobs)

# drop the per-job progress messages and leave a single completion line
clear_output()
print('Done!')

Done!


In [4]:
# aggregation: merge all per-subcategory csv files under data/pubmed/ into one corpus file

corpus_files = Path('data/pubmed/').glob('**/*.csv')

dfs = []

for fname in tqdm(corpus_files):

  # the parent folder and the file stem give the category and the subcategory
  path_match = re.search('.*/pubmed/(.*)/(.*)\\.csv', str(fname))

  # csv files that are not inside a category folder do not belong to the corpus
  if path_match is None:
    continue

  category, subcategory = path_match.groups()

  # the labels derived from the path replace whatever the file itself contains
  df = pd.read_csv(fname).assign(category=category, subcategory=subcategory)
  dfs.append(df)

# concatenate everything and write the gzip-compressed csv output (takes ~ 2min).
pd.concat(dfs).to_csv(OUTPUT_PATH, index=False, compression='gzip')

174it [00:28,  6.08it/s]


In [5]:
# Load the 2021 abstracts file (it is not written by any cell shown in this
# notebook) and display its (rows, columns) shape.
pubmed2021 = pd.read_csv('data/pubmed/abstracts_2021.csv.gz')
pubmed2021.shape

(531748, 9)

In [6]:
# Load the 2023 abstracts file (the same path as OUTPUT_PATH, which the
# aggregation step writes) and display its (rows, columns) shape.
pubmed2023 = pd.read_csv('data/pubmed/abstracts_2023.csv.gz')
pubmed2023.shape

(635601, 9)