# Testing EBC

In [148]:
from collections import defaultdict
import numpy
import openai
import os
import json
import itertools
import pandas as pd
from os.path import abspath
from pathlib import Path
from wasabi import Printer
from tqdm import tqdm
import spacy
from dotenv import load_dotenv
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

In [6]:
msg = Printer()

In [7]:
load_dotenv() # load enviroment variables
openai.api_key = os.getenv("OPENAI_API_KEY")

In [8]:
home_dir = Path(abspath(''))
msg.info(f'home directory: {home_dir}')

aop_wiki_abstracts_path = home_dir.joinpath('article_data/aop_wiki_abstracts.jsonl')

[38;5;4mℹ home directory: /Users/lars/Documents/GitHub/ebc_test[0m


## Load data

In [9]:
aop_wiki_abstracts = []
with aop_wiki_abstracts_path.open('r') as file:
    for line in file:
        aop_wiki_abstracts.append(json.loads(line))

## Get shortest dependecy path between Entities

In [None]:
def get_ents_with_gpt(doc, model, prompt_path):
    pass

    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
    def chatCompletion_with_backoff(**kwargs):
        return openai.ChatCompletion.create(**kwargs)
    
    def read_response():
        pass
    
    # loading prompt file
    with open(prompt_path, 'r') as file:
        prompt = file.read()
        prompt = prompt.split('\n')
        
    # Insert sentences into new prompt
    for indx, sent in enumerate(doc.sents):
    prompt.append(f'sentence {indx}:\n'+'"""\n'+f'{sent.text}\n'+'"""')
    
    # convert prompt back to string
    prompt = '\n'.join(prompt)
    
    # query gpt
    message=[
            {'role':'system','content':system_message},
            {'role':'user','content':prompt.format(text=abstract)}
        ]
    
    # Get response
    response = chatCompletion_with_backoff(
        model=engine,
        messages=message,
        temperature=temp,
        n=n
    )
    
    
    
    
    

In [88]:
def get_shortest_dependency_path(ent1, ent2):
    
    def go_up_tree(token):
        path=[token.text, token.dep_]
        if token == token.head:
            # print(0, token, token.head)
            return [token.text]
        else:
            # print(1, path, token.head)
            return path+go_up_tree(token.head)
        
    path1 = go_up_tree(ent1.root)
    path2 = go_up_tree(ent2.root)
    
    return path1 + path2[:-1][::-1]

In [141]:
nlp = spacy.load('en_tox')

docs = nlp.pipe([doc['abstract'] for doc in aop_wiki_abstracts if type(doc['abstract'])==str])
pmids = [doc['pmid'] for doc in aop_wiki_abstracts if type(doc['abstract'])==str]
ent_labels_of_interest = ('PHENOTYPE', 'COMPOUND')

data_dict = {
    'entity A':[],
    'entity B':[],
    'label A':[],
    'label B':[],
    'shortest dep path':[],
    'sentence':[],
    'sentence id':[],
    'pmid':[],
}

sentence_id = 0
for doc, pmid in tqdm(zip(docs,pmids), total=len(pmids)):
    for sent in doc.sents:
        
        # Is there a molecule and a phenotype in the sentence
        if not set(ent_labels_of_interest).issubset(set([ent.label_ for ent in sent.ents])):
            continue

        # get the shortest dependency path of all combinations of molecule and phenotype
        ents_of_interest = [ent for ent in sent.ents if ent.label_ in ent_labels_of_interest]
        mirrors = []
        for ent1, ent2 in itertools.product(ents_of_interest, ents_of_interest):
            
            ### Skips
            if ent1 == ent2: # skip if ent1 is the same as ent2
                continue
                
            if ent1.label == ent2.label: #skip if labels are the same
                continue
                
            if (ent1, ent2) in mirrors: #SKIP mirrors (ent1+ent2 vs ent2+ent1)
                continue
            
            # Store mirrors
            mirrors.append((ent2, ent1))
            
            # get dependency path
            shortest_dep_path = get_shortest_dependency_path(ent1, ent2) # Get shortest dependecy path
            ent1_root = shortest_dep_path.pop(0)
            ent2_root = shortest_dep_path.pop(-1)
            
            ### Save data 
            data_dict['entity A'].append(ent1.text)
            data_dict['entity B'].append(ent2.text)
            data_dict['label A'].append(ent1.label_)
            data_dict['label B'].append(ent2.label_)
            data_dict['shortest dep path'].append(str(shortest_dep_path))
            data_dict['sentence'].append(sent.text)
            data_dict['sentence id'].append(sentence_id)
            data_dict['pmid'].append(pmid)
            data_dict['entity A root'].append(ent1_root)
            data_dict['entity B root'].append(ent2_root)
        
        # Increment sentence id
        sentence_id = sentence_id + 1

100%|██████████| 4945/4945 [04:37<00:00, 17.80it/s]


##### Small analysis of dependency paths

In [146]:
print('sentences in which at least one molecule name and at least one phenotype are present', len(set(data_dict['sentence id'])))
print('total number of unique molecule-phenotype-sentence combinations', len(set([(entityA, entityB, sentence_id) for entityA, entityB, sentence_id in zip(data_dict['entity A'], data_dict['entity B'], data_dict['sentence id'])])))
print('total number of unique dependency paths (not accounting for mirrors):', len(set(data_dict['shortest dep path'])))

sentences in which at least one molecule name and at least one phenotype are present 5118
total number of unique molecule-phenotype-sentence combinations 11054
total number of unique dependency paths (not accounting for mirrors): 9542


In [149]:
df = pd.DataFrame.from_dict(data_dict)
df.head()

Unnamed: 0,entity A,entity B,label A,label B,shortest dep path,sentence,sentence id,pmid
0,dioxane,mortality,COMPOUND,PHENOTYPE,"['nmod', 'effect', 'nmod', 'interest', 'nmod',...",As a result of recent interest in the carcinog...,0,
1,Vildagliptin,performance,COMPOUND,PHENOTYPE,"['nsubj', 'improved', 'dobj']",Vildagliptin markedly improved the motor perfo...,1,25752913.0
2,Vildagliptin,reduction,COMPOUND,PHENOTYPE,"['nsubj', 'improved', 'dep', 'effects', 'acl:r...",Vildagliptin markedly improved the motor perfo...,1,25752913.0
3,product,vildagliptin,PHENOTYPE,COMPOUND,"['nmod', 'Normalization', 'nsubj', 'finding', ...",Normalization of receptor for advanced glycate...,2,25752913.0
4,vildagliptin,molecule-1,COMPOUND,PHENOTYPE,"['nmod', 'effects', 'dobj', 'justifies', 'acl:...",Normalization of receptor for advanced glycate...,2,25752913.0


## Create sparse matrix

## Run EBC

## Analyse results