In [58]:
from seq2rel_ds.common import util
import pandas as pd
# from seq2rel_ds.common.util import pubtator_to_seq2rel

In [59]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [60]:

%autoreload 2

import itertools
from collections import defaultdict
from functools import lru_cache
from pathlib import Path
from typing import Dict, List, Optional, Tuple


import requests
import typer

from seq2rel_ds import msg
from seq2rel_ds.common import util
from seq2rel_ds.common.schemas import PubtatorAnnotation
from seq2rel_ds.common.util import EntityHinting
from seq2rel_ds.common.util import pubtator_to_seq2rel

# To reload modules (does not update after updating in other codes)
import importlib
importlib.reload(util)


# Typer application object. No commands are registered on it in the visible part of this notebook.
app = typer.Typer()

# Zip archive with the CDR corpus (hosted on the BioCreative site); read by `_download_corpus`.
CDR_URL = "https://biocreative.bioinformatics.udel.edu/media/store/files/2016/CDR_Data.zip"
# Tab-separated MeSH tree file; fetched and parsed by `_download_mesh_tree`.
MESH_TREE_URL = (
    "https://github.com/fenchri/edge-oriented-graph/raw/master/data_processing/2017MeshTree.txt"
)
# Directory inside the CDR archive; joined with the file names below to address each split.
PARENT_DIR = "CDR_Data/CDR.Corpus.v010516"
TRAIN_FILENAME = "CDR_TrainingSet.PubTator.txt"
VALID_FILENAME = "CDR_DevelopmentSet.PubTator.txt"
TEST_FILENAME = "CDR_TestSet.PubTator.txt"


@lru_cache()
def _download_mesh_tree() -> Dict[str, List[str]]:
    """Download the MeSH tree and map each MeSH unique ID to its tree numbers.

    The result is memoised with `lru_cache`, so the file is fetched at most once per process
    and every caller receives the same mapping object.

    Returns:
        A `defaultdict(list)` keyed by MeSH unique ID; each value is the list of tree numbers
        found for that ID, in file order.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within the timeout.
        ValueError: If a line does not consist of exactly three tab-separated fields.
    """
    # A timeout keeps a stalled connection from hanging the caller forever.
    response = requests.get(MESH_TREE_URL, timeout=60)
    # Fail on HTTP errors here rather than trying to parse an error page as TSV below.
    response.raise_for_status()

    parsed_mesh_tree: Dict[str, List[str]] = defaultdict(list)
    # The first line is skipped (presumably a header row).
    for line in response.text.strip().splitlines()[1:]:
        tree_number, mesh_unique_id, _ = line.split("\t")
        parsed_mesh_tree[mesh_unique_id].append(tree_number)
    return parsed_mesh_tree


def _download_corpus() -> Tuple[str, str, str]:
    """Fetch the CDR archive and return the raw text of its three splits.

    Returns:
        A `(train, valid, test)` tuple holding the decoded contents of the training,
        development and test files stored under `PARENT_DIR` in the archive.
    """
    # `util.download_zip` is expected to return an object whose `read(name)` yields bytes
    # (presumably a `zipfile.ZipFile` — verify against `seq2rel_ds.common.util`).
    archive = util.download_zip(CDR_URL)
    corpus_root = Path(PARENT_DIR)
    train, valid, test = (
        archive.read(str(corpus_root / filename)).decode()
        for filename in (TRAIN_FILENAME, VALID_FILENAME, TEST_FILENAME)
    )
    return train, valid, test


def _filter_hypernyms(pubtator_annotations: List[PubtatorAnnotation]) -> None:
    """Record, per document, the negative relations that are hypernyms of positive relations.

    Every annotation in `pubtator_annotations` gets a `filtered_relations` list (possibly
    empty). A negative relation `(chemical, disease, label)` is added to it when a positive
    relation of the same document has the same chemical and a disease whose MeSH tree number
    contains one of the negative disease's tree numbers.

    Args:
        pubtator_annotations: Parsed documents; modified in place.

    Returns:
        None. Results are stored on each annotation's `filtered_relations` attribute.
    """
    # Determine the entity and relation labels from the first document that has relations.
    labels = None
    for annotation in pubtator_annotations:
        if annotation.relations:
            chem_id, diso_id, rel_label = annotation.relations[0]
            labels = (
                annotation.entities[chem_id].label,
                annotation.entities[diso_id].label,
                rel_label,
            )
            break

    if labels is None:
        # No document has any relation, so there is nothing to compare against. Still add
        # the attribute so that all examples are formatted the same way, and skip the
        # MeSH download entirely.
        for annotation in pubtator_annotations:
            annotation.filtered_relations = []
        return
    chem_label, diso_label, rel_label = labels

    # Download the MeSH tree which allows us to determine hypernyms for disease entities.
    mesh_tree = _download_mesh_tree()

    for annotation in pubtator_annotations:
        # We add this attribute to each annotation, regardless of whether or not it has
        # relations to filter. This means that all examples in the dataset are formatted
        # the same way, which simplifies data loading.
        annotation.filtered_relations = []
        positive_relations = set(annotation.relations)
        chemicals = [
            ent_id for ent_id, ann in annotation.entities.items() if ann.label == chem_label
        ]
        diseases = [
            ent_id for ent_id, ann in annotation.entities.items() if ann.label == diso_label
        ]
        # Negative relations are all chemical/disease pairs that are not positive relations.
        # Walking the product in order (instead of iterating a set difference) keeps the
        # order of `filtered_relations` reproducible between runs.
        for neg_chem, neg_diso in itertools.product(chemicals, diseases):
            candidate = (neg_chem, neg_diso, rel_label)
            if candidate in positive_relations:
                continue
            # `.get` avoids inserting empty entries into the cached (default)dict.
            neg_tree_numbers = mesh_tree.get(neg_diso, [])
            if not neg_tree_numbers:
                continue
            # If the negative relation shares its chemical with a positive relation AND its
            # disease is a hypernym of that positive relation's disease, it is filtered.
            for pos_chem, pos_diso, _ in annotation.relations:
                if pos_chem != neg_chem:
                    continue
                pos_tree_numbers = mesh_tree.get(pos_diso, [])
                if any(
                    neg_tree_number in pos_tree_number
                    for pos_tree_number in pos_tree_numbers
                    for neg_tree_number in neg_tree_numbers
                ):
                    annotation.filtered_relations.append(candidate)
                    # One matching positive relation is enough; avoid duplicate entries.
                    break


def _preprocess(
    pubtator_content: str,
    sort_rels: bool = True,
    entity_hinting: Optional[EntityHinting] = None,
    filter_hypernyms: bool = False,
) -> List[str]:
    """Parse PubTator-formatted text and convert it to seq2rel-formatted examples.

    Args:
        pubtator_content: Raw PubTator text of one corpus split.
        sort_rels: Forwarded to `pubtator_to_seq2rel`.
        entity_hinting: Forwarded to `pubtator_to_seq2rel`; when truthy, the extra keyword
            arguments `concepts` and `skip_malformed` are forwarded as well.
        filter_hypernyms: If True, run `_filter_hypernyms` on the parsed annotations first.

    Returns:
        The value returned by `pubtator_to_seq2rel` (annotated here as a list of strings).
    """
    annotations = util.parse_pubtator(
        pubtator_content=pubtator_content,
        text_segment=util.TextSegment.both,
    )

    # This is unique to the CDR corpus, which contains many negative relations that are
    # actually valid, but are not annotated because they contain a disease entity which is
    # the hypernym of a disease entity in a positive relation. These need to be filtered out
    # before evaluation, so `_filter_hypernyms` finds all such cases and adds them to the
    # `filtered_relations` field of the annotations.
    # See https://arxiv.org/abs/1909.00228 for details.
    if filter_hypernyms:
        _filter_hypernyms(annotations)

    # NOTE(review): these extra options are only sent when entity hinting is requested;
    # confirm that `pubtator_to_seq2rel` accepts them.
    extra_kwargs = {}
    if entity_hinting:
        extra_kwargs = {"concepts": ["chemical", "disease"], "skip_malformed": True}

    return pubtator_to_seq2rel(
        annotations,
        sort_rels=sort_rels,
        entity_hinting=entity_hinting,
        **extra_kwargs,
    )


In [61]:
# Settings for this run.
# NOTE(review): `_preprocess` annotates `entity_hinting` as Optional[EntityHinting], but a
# plain string is passed here — confirm that downstream code accepts it.
entity_hinting = 'pipeline'
output_dir = 'custom_main_test_cdr'

# Download all three raw splits; only the test split is preprocessed in this cell.
train_raw, valid_raw, test_raw = _download_corpus()
test = _preprocess(test_raw, sort_rels=True, entity_hinting=entity_hinting)


# Q: What is D009270, and how is the entity ID created? A: It comes from the MeSH library.
# Q: Do we have to keep all the occurrences of a word (even if it is already recorded)? A: Yes, they should be kept.
# Q: How is the offset calculated - is it a word index per sentence or over the whole document? A: Over the whole document.

# From the MeSH library:
# 227508\t0\t8\tNaloxone\tChemical\tD009270
# 227508\t865\t873\tnaloxone\tChemical\tD009270
# The first has a capital N and the second a lowercase n, and both have the same uid. What is this uid?

# 227508\t49\t58\tclonidine\tChemical\tD003000
# 227508\t181\t190\tclonidine\tChemical\tD003000

# 227508\t563\t576\t[3H]-naloxone\tChemical\t-1
# 227508\t671\t695\t[3H]-dihydroergocryptine\tChemical\t-1
# 227508\t244\t252\tnalozone\tChemical\t-1

# These have a different uid - how is -1 different from D009270?
# Also, [3H]-naloxone is more closely related to Naloxone, but it has the same uid (-1) as nalozone.


ConnectionError: HTTPSConnectionPool(host='biocreative.bioinformatics.udel.edu', port=443): Max retries exceeded with url: /media/store/files/2016/CDR_Data.zip (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x2b253d9063d0>: Failed to establish a new connection: [Errno 110] Connection timed out'))

In [30]:
print('Total outputs recorded ', len(test))


Total outputs recorded  297


In [45]:
# From our NER output:
# Show the first four preprocessed test examples, each followed by two blank lines.
for example in test[:4]:
    print(example, end='\n\n\n')


histamine @CHEMICAL@ [SEP] Famotidine - associated delirium . A series of six cases . Famotidine is a histamine H2 - receptor antagonist used in inpatient settings for prevention of stress ulcers and is showing increasing popularity because of its low cost . Although all of the currently available H2 - receptor antagonists have shown the propensity to cause delirium , only two previously reported cases have been associated with famotidine . The authors report on six cases of famotidine - associated delirium in hospitalized patients who cleared completely upon removal of famotidine . The pharmacokinetics of famotidine are reviewed , with no change in its metabolism in the elderly population seen . The implications of using famotidine in elderly persons are discussed .	


sodium @CHEMICAL@ [SEP] Indomethacin induced hypotension in sodium and volume depleted rats . After a single oral dose of 4 mg / kg indomethacin ( IDM ) to sodium and volume depleted rats plasma renin activity ( PRA ) a

In [14]:
# From pipeline output:
# Show the first four preprocessed test examples, each followed by two blank lines.
for example in test[:4]:
    print(example, end='\n\n\n')


famotidine @CHEMICAL@ delirium @DISEASE@ stress ulcers @DISEASE@ [SEP] Famotidine-associated delirium. A series of six cases. Famotidine is a histamine H2-receptor antagonist used in inpatient settings for prevention of stress ulcers and is showing increasing popularity because of its low cost. Although all of the currently available H2-receptor antagonists have shown the propensity to cause delirium, only two previously reported cases have been associated with famotidine. The authors report on six cases of famotidine-associated delirium in hospitalized patients who cleared completely upon removal of famotidine. The pharmacokinetics of famotidine are reviewed, with no change in its metabolism in the elderly population seen. The implications of using famotidine in elderly persons are discussed.	famotidine @CHEMICAL@ delirium @DISEASE@ @CID@


indomethacin @CHEMICAL@ hypotension @DISEASE@ sodium @CHEMICAL@ prostaglandin @CHEMICAL@ [SEP] Indomethacin induced hypotension in sodium and volu

In [32]:
from pathlib import Path

# Write the first 20 preprocessed test examples, one per line, into the output directory.
output_dir = Path("custom_main_test_cdr")
output_dir.mkdir(parents=True, exist_ok=True)

# Encode explicitly as UTF-8: without `encoding`, `write_text` uses the locale's default,
# which can fail or produce a different file on machines with a non-UTF-8 locale.
(output_dir / "test_custom.tsv").write_text("\n".join(test[:20]), encoding="utf-8")

28839

In [None]:
# https://vsoch.github.io/lessons/sherlock-jobs/

In [33]:
!allennlp evaluate "https://github.com/JohnGiorgi/seq2rel/releases/download/pretrained-models/cdr.tar.gz" \
    "custom_main_test_cdr/test_custom.tsv" \
    --output-file "test_metrics_custom.jsonl" \
    --predictions-output-file "test_predictions_custom.jsonl" \
    --include-package "seq2rel"

# https://github.com/allenai/allennlp/blob/main/allennlp/commands/evaluate.py
# removed --cuda-device 0

2023-02-21 19:00:51.152153: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /N/soft/rhel7/python/3.9.8/lib:/N/soft/rhel7/intel/19.5/compilers_and_libraries_2019.5.281/linux/compiler/lib/intel64:/N/soft/rhel7/intel/19.5/compilers_and_libraries_2019.5.281/linux/ipp/lib/intel64:/N/soft/rhel7/intel/19.5/compilers_and_libraries_2019.5.281/linux/compiler/lib/intel64_lin:/N/soft/rhel7/intel/19.5/compilers_and_libraries_2019.5.281/linux/mkl/lib/intel64_lin:/N/soft/rhel7/intel/19.5/compilers_and_libraries_2019.5.281/linux/tbb/lib/intel64/gcc4.7:/N/soft/rhel7/intel/19.5/debugger_2019/iga/lib:/N/soft/rhel7/intel/19.5/debugger_2019/libipt/intel64/lib:/N/soft/rhel7/intel/19.5/compilers_and_libraries_2019.5.281/linux/daal/lib/intel64_lin:/N/soft/rhel7/gcc/12.1.0/lib64:/N/soft/rhel7/gcc/12.1.0/lib:/N/soft/rhel7/gcc/infrastructure/l

In [41]:
test[310]

'[SEP] Effects of the intracoronary infusion of cocaine on left ventricular systolic and diastolic function in humans. BACKGROUND: In dogs, a large amount of intravenous cocaine causes a profound deterioration of left ventricular (LV) systolic function and an increase in LV end-diastolic pressure. This study was done to assess the influence of a high intracoronary cocaine concentration on LV systolic and diastolic function in humans. METHODS AND RESULTS: In 20 patients (14 men and 6 women aged 39 to 72 years) referred for cardiac catheterization for the evaluation of chest pain, we measured heart rate, systemic arterial pressure, LV pressure and its first derivative (dP/dt), and LV volumes and ejection fraction before and during the final 2 to 3 minutes of a 15-minute intracoronary infusion of saline (n=10, control subjects) or cocaine hydrochloride 1 mg/min (n=10). No variable changed with saline. With cocaine, the drug concentration in blood obtained from the coronary sinus was 3.0+/

In [53]:
import pandas as pd

# Load the JSON-lines predictions file; only its first record is inspected here.
jsonObj = pd.read_json(path_or_buf='test_predictions_500.jsonl', lines=True)

# Look the three per-example lists up once instead of on every loop iteration.
predicted_strings = jsonObj['predicted_strings'].iloc[0]
target_strings = jsonObj['target_strings'].iloc[0]
metadata = jsonObj['metadata'].iloc[0]

for i, prediction in enumerate(predicted_strings):
    target = target_strings[i]
    # Skip examples with an empty target string (used for test_predictions_500.jsonl).
    if not target:
        continue
    print('\n Predictions: ', prediction)
    print('Target: ', target, i)
    print('For abstract: ', metadata[i])

# Predictions: the final output of seq2rel.
# Target: the (filtered) target string in CDR.

# Important things to see:
#   In the 5th output, our NER was unable to detect 3-hydroxy-2-butanone (maybe removed
#   because of the redundancy issue), but the RE model has picked up this entity and
#   predicted it as having a relation to a disease.
#   This shows that the model does not depend only on the entity hinting to get entities.


 Predictions:  levodopa @CHEMICAL@ bradykinesia @DISEASE@ @CID@ levodopa @CHEMICAL@ rigidity @DISEASE@ @CID@ levodopa @CHEMICAL@ dyskinesias @DISEASE@ @CID@
Target:  levodopa @CHEMICAL@ @CID@ 1
For abstract:  {'source_tokens': ['[CLS]', 'levodopa', '@', 'chemical', '@', '[SEP]', 'bilateral', 'subth', '##alamic', 'nucleus', 'stimulation', 'for', 'parkinson', "'", 's', 'disease', '.', 'unlabelled', ':', 'high', 'frequency', 'stimulation', 'of', 'the', 'subth', '##alamic', 'nucleus', '(', 'stn', ')', 'is', 'known', 'to', 'ameliorate', 'the', 'signs', 'and', 'symptoms', 'of', 'advanced', 'parkinson', "'", 's', 'disease', '.', 'aim', ':', 'we', 'studied', 'the', 'effect', 'of', 'high', 'frequency', 'stn', 'stimulation', 'in', '23', 'patients', '.', 'method', ':', 'twenty', '-', 'three', 'patients', 'suffering', 'from', 'severe', 'parkinson', "'", 's', 'disease', '(', 'stages', 'iii', '-', 'v', 'on', 'ho', '##eh', '##n', 'and', 'ya', '##hr', 'scale', ')', 'and', ',', 'particularly', 'brady'

In [52]:
import pandas as pd

# Load the JSON-lines predictions file; only its first record is inspected here.
jsonObj = pd.read_json(path_or_buf='test_predictions_custom.jsonl', lines=True)

# Look the three per-example lists up once instead of on every loop iteration.
predicted_strings = jsonObj['predicted_strings'].iloc[0]
target_strings = jsonObj['target_strings'].iloc[0]
metadata = jsonObj['metadata'].iloc[0]

# No empty-target filter here: that filter is only used for test_predictions_500.jsonl.
for i, prediction in enumerate(predicted_strings):
    print('\n Predictions: ', prediction)
    print('Target: ', target_strings[i], i)
    print('For abstract: ', metadata[i])

# Important things to see:
#   In the 5th output, our NER was unable to detect 3-hydroxy-2-butanone (maybe removed
#   because of the redundancy issue), but the RE model has picked up this entity and
#   predicted it as having a relation to a disease.
#   This shows that the model does not depend only on the entity hinting to get entities.


 Predictions:  glucose @CHEMICAL@ @ @ unknown @ @ ion @DISEASE@ @CID@
Target:   0
For abstract:  {'source_tokens': ['[CLS]', 'glucose', '@', 'chemical', '@', '[SEP]', 'objectives', ':', 'the', 'aim', 'of', 'this', 'work', 'was', 'to', 'study', 'the', 'changes', 'of', 'bacterial', 'cell', 'growth', ',', 'acet', '##ion', 'formation', 'and', 'glucose', 'consumption', 'with', 'fermentation', 'time', 'during', 'batch', 'cultivation', '.', 'results', ':', 'a', 'mathematical', 'model', 'of', 'cell', 'growth', ',', 'product', 'synthesis', ',', 'and', 'substrate', 'consumption', 'changes', 'with', 'time', 'during', 'the', 'batch', 'cultivation', 'of', 'acet', '##ion', 'was', 'established', '.', 'by', 'analyzing', 'the', 'fitting', 'curve', 'of', 'the', 'kinetic', 'model', ',', 'it', 'is', 'found', 'that', 'the', 'calculated', 'value', 'of', 'the', 'model', 'fits', 'well', 'with', 'the', 'experimental', 'value', ',', 'and', 'the', 'fitting', 'model', 'r2', 'is', 'greater', 'than', '0', '.', '98

In [25]:
d = {'Magnesium': [('magnesium', (47, 56)), ('magnesium', (193, 202)), ('magnesium', (247, 256)), ('magnesium', (329, 338)), ('magnesium', (795, 804)), ('magnesium', (1044, 1053))]}

d

{'Magnesium': [('magnesium', (47, 56)),
  ('magnesium', (193, 202)),
  ('magnesium', (247, 256)),
  ('magnesium', (329, 338)),
  ('magnesium', (795, 804)),
  ('magnesium', (1044, 1053))]}

In [38]:
[i[0] for i in d['Magnesium']]

['magnesium', 'magnesium', 'magnesium', 'magnesium', 'magnesium', 'magnesium']

In [34]:
nlp = spacy.load("patterns_02_17")

In [59]:
abstract = "Uridine 5'-diphosphate ( UDP ) and uridINe 5' - dipHOSphaTe - glucose dehydrogenase ( UGD ) produces UDP - glucuronic acid from UDP - glucose as a precursor of plant cell wall polysaccharides .UDP - glucuronic acid is also a sugar donor for the glycosylation of various plant specialized metabolites .Nevertheless , the roles of UGDs in plant specialized metabolism remain poorly understood .Glycyrrhiza species ( licorice ) , which are medicinal legumes , biosynthesize triterpenoid saponins , soyasaponins and glycyrrhizin , commonly glucuronosylated at the C - 3 position of the triterpenoid scaffold ."
# Put spaces around every character that is neither alphanumeric nor whitespace, then
# collapse all whitespace runs to single spaces before running the NER pipeline.
abstract_spaced = "".join(
    f" {el} " if not (el.isalnum() or el.isspace()) else el for el in abstract
)
abstract_preprocessed = " ".join(abstract_spaced.split())
doc = nlp(abstract_preprocessed)

# Maps each entity ID to its distinct mentions as (text, label, (start_char, end_char)).
entities_dict = {}
for ent in doc.ents:
    mentions = entities_dict.setdefault(str(ent.ent_id_), [])
    text = str(ent.text)
    # Keep only the first occurrence of each surface form per entity ID.
    if all(text != seen_text for seen_text, _, _ in mentions):
        mentions.append((text, str(ent.label_), (ent.start_char, ent.end_char)))

# Print the NER result.
print(entities_dict)


{"Uridine 5'-diphosphate": [("Uridine 5 ' - diphosphate", 'Metabolites', (0, 25)), ('UDP', 'Metabolites', (28, 31)), ("uridINe 5 ' - dipHOSphaTe", 'Metabolites', (38, 63))], 'D-Glucose': [('glucose', 'Metabolites', (66, 73))], 'Uridine diphosphate glucuronic acid': [('UDP - glucuronic acid', 'Metabolites', (105, 126))], 'Uridine diphosphate glucose': [('UDP - glucose', 'Metabolites', (132, 145))], 'Sucrose': [('sugar', 'Metabolites', (230, 235))]}


In [60]:
# Build one hint string per entity ID: the mention texts joined by " ; ", followed by the
# label wrapped in "@...@".
entity_string_list = []

for mentions in entities_dict.values():
    joined_texts = ' ; '.join(str(mention[0]) for mention in mentions)
    # The label is read from the last mention of the entity (for a single mention this is
    # also the first one).
    label = str(mentions[-1][1])
    entity_string_list.append(joined_texts + ' @' + label + '@')

Uridine 5'-diphosphate
D-Glucose
Uridine diphosphate glucuronic acid
Uridine diphosphate glucose
Sucrose


In [61]:
entity_string_list

["Uridine 5 ' - diphosphate ; UDP ; uridINe 5 ' - dipHOSphaTe @Metabolites@",
 'glucose @Metabolites@',
 'UDP - glucuronic acid @Metabolites@',
 'UDP - glucose @Metabolites@',
 'sugar @Metabolites@']

In [67]:
# These entities contain the extra spaces inserted around punctuation. So before doing RE, send the spaced abstracts and not the original abstracts.
# They lowercase all words when storing them in test.tsv, and these lowercased entities are passed to RE,
# but the Disease / Chemical labels are kept in upper case.
# Checked whether the abstracts are also converted to lower case - no, they are kept as they are.
# Most test texts have no entities after entity hinting - maybe this is because of the CDR data. If no entities are detected, ignore the example; there is no need to add it.

# Change @metabolism@ to @chemical@ in hinting
# In test relation target, filter out entities in target which do not have words predicted by our ner - cdr
# Two character words need to be case sensitive - 

# Switch to Bloom
# And try biomedical entities

# Disease names combine - 
# Works here but not when ran as python seq2rel-ds/cdr.py
# Do allen predict - predictions have chemical and disease entities. Need to do finetuning on this pretrained model.

# What if the relation target does not contain any of the NER terms, we need some string for evaluation.

In [4]:
l = ['naloxone @CHEMICAL@', 'clonidine @CHEMICAL@', 'hypertensive @DISEASE@', 'nalozone ; [3h]-naloxone ;\
[3h]-dihydroergocryptine @CHEMICAL@', 'hypotensive @DISEASE@', 'alpha-methyldopa @CHEMICAL@']
l

['naloxone @CHEMICAL@',
 'clonidine @CHEMICAL@',
 'hypertensive @DISEASE@',
 'nalozone ; [3h]-naloxone ;[3h]-dihydroergocryptine @CHEMICAL@',
 'hypotensive @DISEASE@',
 'alpha-methyldopa @CHEMICAL@']

In [22]:
%reload_ext autoreload
%autoreload 2

!python seq2rel_ds/cdr.py --entity_hinting 'pipeline'


[1m
[2K[38;5;2m✔ Downloaded the corpus.[0m
[38;5;4mℹ Entity hints will be inserted into the source text using the
annotations from PubTator.[0m
[38;5;4mℹ Training and validation sets will be combined into one train set.[0m
[2K[38;5;2m✔ Preprocessed the data.[0m
[38;5;2m✔ Preprocessed data saved to
/N/project/zhangclab/Pavi/KD-DocRE/RE/seq2rel-ds/custom_main_test_cdr.[0m
