# Testing EBC

In [32]:
import numpy
import os
import json
import itertools
import pandas as pd
from os.path import abspath
from pathlib import Path
from wasabi import Printer
from tqdm import tqdm
import spacy
from dotenv import load_dotenv
from ebc2d import EBC2D
import ebc2d
from collections import defaultdict

In [2]:
msg = Printer()

In [3]:
home_dir = Path(abspath(''))
msg.info(f'home directory: {home_dir}')

aop_wiki_abstracts_path = home_dir.joinpath('article_data/aop_wiki_abstracts.jsonl')

[38;5;4mℹ home directory: /Users/lars/Documents/GitHub/ebc_test[0m


## Load data

In [4]:
aop_wiki_abstracts = []
with aop_wiki_abstracts_path.open('r') as file:
    for line in file:
        aop_wiki_abstracts.append(json.loads(line))

## Get shortest dependecy path between Entities

In [5]:
def get_shortest_dependency_path(ent1, ent2):
    
    def go_up_tree(token):
        path=[token.text, token.dep_]
        if token == token.head:
            # print(0, token, token.head)
            return [token.text]
        else:
            # print(1, path, token.head)
            return path+go_up_tree(token.head)
        
    path1 = go_up_tree(ent1.root)
    path2 = go_up_tree(ent2.root)
    
    return path1 + path2[:-1][::-1]

In [6]:
# Load model for ner and dependency parsing
nlp = spacy.load('en_tox')

# Run generator pipeline
docs = nlp.pipe([doc['abstract'] for doc in aop_wiki_abstracts if type(doc['abstract'])==str])
pmids = [doc['pmid'] for doc in aop_wiki_abstracts if type(doc['abstract'])==str]
ent_labels_of_interest = ('PHENOTYPE', 'COMPOUND')

# This dictionary will hold the data that will be transformed into a dataframe
data_dict = {
    'entity A':[],
    'entity B':[],
    'label A':[],
    'label B':[],
    'shortest dep path':[],
    'sentence':[],
    'sentence id':[],
    'pmid':[],
    'entity A root':[],
    'entity B root':[]
}

sentence_id = 0
for doc, pmid in tqdm(zip(docs,pmids), total=len(pmids)):
    for sent in doc.sents:
        
        # Is there a molecule and a phenotype in the sentence
        if not set(ent_labels_of_interest).issubset(set([ent.label_ for ent in sent.ents])):
            continue

        # get the shortest dependency path of all combinations of molecule and phenotype
        ents_of_interest = [ent for ent in sent.ents if ent.label_ in ent_labels_of_interest]
        mirrors = []
        for ent1, ent2 in itertools.product(ents_of_interest, ents_of_interest):
            
            ### Skips
            if ent1 == ent2: # skip if ent1 is the same as ent2
                continue
                
            if ent1.label == ent2.label: #skip if labels are the same
                continue
                
            if (ent1, ent2) in mirrors: #SKIP mirrors (ent1+ent2 vs ent2+ent1)
                continue
            
            # Store mirrors
            mirrors.append((ent2, ent1))
            
            # get dependency path
            shortest_dep_path = get_shortest_dependency_path(ent1, ent2) # Get shortest dependecy path
            ent1_root = shortest_dep_path.pop(0)
            ent2_root = shortest_dep_path.pop(-1)
            
            ### Save data 
            data_dict['entity A'].append(ent1.text)
            data_dict['entity B'].append(ent2.text)
            data_dict['label A'].append(ent1.label_)
            data_dict['label B'].append(ent2.label_)
            data_dict['shortest dep path'].append(str(shortest_dep_path))
            data_dict['sentence'].append(sent.text)
            data_dict['sentence id'].append(sentence_id)
            data_dict['pmid'].append(pmid)
            data_dict['entity A root'].append(ent1_root)
            data_dict['entity B root'].append(ent2_root)
        
        # Increment sentence id
        sentence_id = sentence_id + 1

100%|██████████| 4945/4945 [04:48<00:00, 17.12it/s]


##### Small analysis of dependency paths

In [7]:
print('sentences in which at least one molecule name and at least one phenotype are present', len(set(data_dict['sentence id'])))
print('total number of unique molecule-phenotype-sentence combinations', len(set([(entityA, entityB, sentence_id) for entityA, entityB, sentence_id in zip(data_dict['entity A'], data_dict['entity B'], data_dict['sentence id'])])))
print('total number of unique dependency paths (not accounting for mirrors):', len(set(data_dict['shortest dep path'])))

sentences in which at least one molecule name and at least one phenotype are present 5118
total number of unique molecule-phenotype-sentence combinations 11208
total number of unique dependency paths (not accounting for mirrors): 9533


In [8]:
df = pd.DataFrame.from_dict(data_dict)
df.head()

Unnamed: 0,entity A,entity B,label A,label B,shortest dep path,sentence,sentence id,pmid,entity A root,entity B root
0,dioxane,mortality,COMPOUND,PHENOTYPE,"['nmod', 'effect', 'nmod', 'interest', 'nmod',...",As a result of recent interest in the carcinog...,0,,dioxane,mortality
1,Vildagliptin,motor performance,COMPOUND,PHENOTYPE,"['nsubj', 'improved', 'dobj']",Vildagliptin markedly improved the motor perfo...,1,25752913.0,Vildagliptin,performance
2,Vildagliptin,reduction in striatal dopamine content,COMPOUND,PHENOTYPE,"['nsubj', 'improved', 'dep', 'effects', 'acl:r...",Vildagliptin markedly improved the motor perfo...,1,25752913.0,Vildagliptin,reduction
3,advanced glycated end product,vildagliptin,PHENOTYPE,COMPOUND,"['nmod', 'Normalization', 'nsubj', 'finding', ...",Normalization of receptor for advanced glycate...,2,25752913.0,product,vildagliptin
4,vildagliptin,intracellular adhesion molecule-1,COMPOUND,PHENOTYPE,"['nmod', 'effects', 'dobj', 'justifies', 'acl:...",Normalization of receptor for advanced glycate...,2,25752913.0,vildagliptin,molecule-1


## Create sparse matrix

In [37]:
### This is a snippit of the code from 'ebc2d.py' from the function 'get_matrix_from_data()', line 288
### This snippit creates a entity pair to row number and dep path to column number
### And thus is essential to finding out which entity pairs cluster together
### I don't know why this isn't part of the output of get_matrix_from_data

def get_feature_ids(data):
    feature_ids = defaultdict(lambda: defaultdict(int))
    for d in data:
        location = []
        for i in range(len(d) - 1):
            f_i = d[i]
            if f_i not in feature_ids[i]:
                feature_ids[i][f_i] = len(feature_ids[i])  # new index is size of dict
            location.append(feature_ids[i][f_i])
    return feature_ids

In [38]:
data = [
    [(ent1.lower(),ent2.lower()), dep_path, 1.0] 
    for ent1, ent2, dep_path 
    in zip(data_dict['entity A'], data_dict['entity B'], data_dict['shortest dep path'])
]
feature_ids = get_feature_ids(data)
matrix = ebc2d.get_matrix_from_data(data)

## Run EBC

In [10]:
row_clusters = 30
column_clusters = 125
ebc = EBC2D(matrix, n_clusters=[30, 125], max_iterations=10, jitter_max=1e-10, objective_tolerance=0.01)
cXY, objective, it = ebc.run()

Running EBC2D on a 2-d matrix with size (9245, 9533) ...
Randomly initializing clusters, with cluster number on each axis: [30, 125] ...
--> Running iteration 1 .. objective value = 5.774289
--> Running iteration 2 .. objective value = 5.731714
--> Running iteration 3 .. objective value = 5.734511
EBC2D finished in 3 iterations, with final objective value 5.7345


## Analyse results

In [47]:
### TEMP function (this can be way better but in the name of time)
def associated_dep_path(search_entity_pair,data):
    dep_paths = []

    for entity_pair, dep_path, _  in data:
        if entity_pair == search_entity_pair:
            dep_paths.append(dep_path)

    return dep_paths

In [88]:
inverted_row_features = {value: key for key, value in feature_ids[0].items()}
all_cluster_numbers=range(row_clusters)
X = cXY[0]

cluster_data = {}
for current_cluster_number in all_cluster_numbers:
    # get all row indicies of rows that are in a certain cluster
    indicies = [index for index, cluster_number in enumerate(X) if cluster_number==current_cluster_number]
    
    # get all entity pairs that link to that row index
    entity_pairs = [inverted_row_features[index] for index in indicies]
    associated_dependency_paths = [associated_dep_path(entity_pair, data) for entity_pair in entity_pairs]
    # print(set([len(i) for i in associated_dependency_paths]))
    associated_dependency_paths = itertools.chain(*associated_dependency_paths)
    # print(set([len(i) for i in associated_dependency_paths]))
    associated_dependency_paths = [eval(i) for i in associated_dependency_paths]

    cluster_data[current_cluster_number] = {
        'entity_pairs':entity_pairs,
        'dep_paths':associated_dependency_paths
    }

In [91]:
for key in cluster_data.keys():
    msg.divider(str(key))

    # for i in cluster_data[key]['entity_pair']: print(i)

    closest_common_ancestors = {}
    for i in cluster_data[key]['dep_paths']:
        closest_common_ancestors[i[len(i)//2]] = closest_common_ancestors.get(i[len(i)//2], 0) + 1

    list(closest_common_ancestors.values()).sort()
    


[1m
{'show': 7, 'nmod': 36, 'suggest': 1, 'acl': 5, 'related': 1, 'dobj': 17, 'neurons': 1, 'conj': 11, 'investigated': 3, 'Dnmts': 1, 'review': 2, 'nsubj': 26, 'advcl': 9, 'inhibited': 2, 'induces': 1, 'reduced': 1, 'increased': 3, 'similar': 1, 'suggests': 1, 'xcomp': 10, 'embryos': 4, 'damage': 2, 'abnormalities': 2, 'exposure': 4, 'doses': 3, 'organisms': 1, 'competition': 1, 'levels': 1, 'decreased': 2, 'fed': 1, 'ccomp': 13, 'mitochondria': 2, 'induce': 2, 'rats': 2, 'known': 2, 'cause': 3, 'mechanism': 2, 'used': 3, 'associated': 1, 'potential': 1, 'nsubjpass': 5, 'developed': 1, 'Studies': 1, 'investigate': 2, '=': 1, 'comparing': 1, 'compared': 2, 'demonstrates': 1, 'prevent': 1, 'acl:relcl': 2, 'contains': 1, 'required': 1, 'antibody': 2, 'mediate': 2, 'Carbaryl': 1, 'knockdown': 1, 'characterized': 1, 'identifies': 1, 'hypothesis': 1, 'dep': 2, 'support': 2, 'binds': 1, 'CONCLUSIONS': 3, 'proved': 1, 'effects': 2, 'spectrum': 1, 'generated': 1, 'database': 1, 'remain': 1, '