# Testing EBC

In [1]:
import numpy
import os
import json
from os.path import abspath
from pathlib import Path
from wasabi import Printer
from metapub import PubMedFetcher
from dotenv import load_dotenv
from tqdm import tqdm
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff



In [2]:
# Shared wasabi console printer; used in this notebook for status messages (msg.info).
msg = Printer()

In [3]:
# Pull variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Fail fast with an actionable message when the key is missing: assigning None
# into os.environ would otherwise raise an opaque
# "TypeError: str expected, not NoneType".
ncbi_api_key = os.getenv('NCBI_API_KEY')
if ncbi_api_key is None:
    raise RuntimeError(
        'NCBI_API_KEY is not set. Add it to a .env file next to this notebook '
        'or export it in the environment before starting the kernel.'
    )
os.environ['NCBI_API_KEY'] = ncbi_api_key

In [4]:
# Resolve every data path relative to the directory the notebook is run from.
home_dir = Path(abspath(''))
msg.info(f'home directory: {home_dir}')

# Input: parsed AOP wiki references. Output: abstracts fetched for those references.
aop_wiki_parse_path = home_dir / 'article_data/aop_wiki_references.jsonl'
aop_wiki_abstracts_path = home_dir / 'article_data/aop_wiki_abstracts.jsonl'

[38;5;4mℹ home directory: /Users/lars/Documents/GitHub/ebc_test[0m


## Get a test corpus
For this test, the abstracts from a parse of the AOP wiki are used.

In [5]:
# Load data: the references file is JSON Lines, i.e. one JSON document per line.
with aop_wiki_parse_path.open('r') as jsonl_file:
    aop_wiki_parse = [json.loads(raw_line) for raw_line in jsonl_file]

In [6]:
# Single metapub PubMedFetcher instance, reused by fetch_with_backoff for every lookup.
fetch = PubMedFetcher()

In [7]:
@retry(
    stop=stop_after_attempt(6),
    wait=wait_random_exponential(min=1, max=60),
)
def fetch_with_backoff(**lookup_kwargs):
    """Look up one article through the shared fetcher, retrying on failure.

    All keyword arguments are forwarded unchanged to ``fetch.article_by_pmid``.
    The ``retry`` decorator re-invokes the call when it raises, waiting a
    randomised exponential interval (configured with min=1, max=60) between
    attempts and giving up after 6 attempts.

    Returns:
        Whatever ``fetch.article_by_pmid`` returns for the given arguments.
    """
    article = fetch.article_by_pmid(**lookup_kwargs)
    return article

In [11]:
# Retrieving abstracts from NCBI
aop_wiki_abstracts = []
for doc in tqdm(aop_wiki_parse):
    # Drop missing PMIDs *before* fetching. Pairing an unfiltered PMID list with
    # a filtered article list via zip() would shift every pairing after the
    # first None and silently drop the trailing articles.
    pmids = [pmid for pmid in doc['pmids'] if pmid is not None]

    for pmid in pmids:
        # Fetching inside the loop keeps each article tied to the PMID it was
        # requested with.
        article = fetch_with_backoff(pmid=pmid)

        # Store data
        aop_wiki_abstracts.append({
            'pmid': pmid,
            'title': article.title,
            'abstract': article.abstract,
            'aop_wiki_id': doc['id'],
            'aop_title': doc['title'],
        })

100%|██████████| 159/159 [1:33:19<00:00, 35.22s/it]   


In [14]:
# Save results as JSON Lines: one serialised record per line.
with aop_wiki_abstracts_path.open('w') as jsonl_out:
    jsonl_out.writelines(
        json.dumps(record) + '\n' for record in aop_wiki_abstracts
    )


## Get Named Entities

## Get shortest dependency path between Entities

## Create sparse matrix

## Run EBC

## Analyse results