In [1]:
import numpy as np
import json
from joblib import Parallel, delayed
from pathlib import Path

from tqdm import tqdm
from HGCR_util.emb_lookup.abstr_emb_lookup import MedlineNumpyEmbeddings
from HGCR_util import text_util
from HGCR_util.lazy_json_kv_loader import LazyJsonlAbstractLoader
import pandas as pd
import random

from agatha_construct.semrep_handler import SemRepHandler
import re
import torch

In [2]:
tqdm.pandas()

### Load embeddings

In [3]:
embeddings_fpath = Path(
    '/work/acslab/shared/medline_embeddings/PubMedNCL_abstr/abstr_id_to_emb_PubMedNCL'
)

In [4]:
pubmedncl_emb_obj = MedlineNumpyEmbeddings(
    emb_w_ids_fpath=embeddings_fpath,
    json_read_n_jobs=8,
)

Opening np chunks: 100%|█████████████████████| 102/102 [00:00<00:00, 731.42it/s]
Opening json index chunks: 100%|██████████████| 102/102 [00:10<00:00,  9.70it/s]
Constructing PMID lookup index: 100%|█████████| 102/102 [00:13<00:00,  7.55it/s]


In [5]:
sent_jsonl_dir = Path(
    '/work/acslab/shared/Agatha_shared/'
    'pmid_to_sent_id_w_text_kv_jsonl_lazy_chunks'
)

abstr_db = LazyJsonlAbstractLoader(
    list(sent_jsonl_dir.glob('*jsonl'))
)

Indexing jsonl files: 100%|███████████████████| 109/109 [03:51<00:00,  2.12s/it]


In [6]:
## sample 1000 pmids
all_pmids_list = []
for chunk in pubmedncl_emb_obj.pmids_chunk_list:
    for pmid in chunk:
        all_pmids_list.append(pmid)
len(all_pmids_list)

30120535

In [7]:
random.seed(42)
sampled_pmids = random.sample(all_pmids_list, 100000)
len(sampled_pmids)

100000

In [8]:
sampled_abstracts = [text_util.get_abstr_text(pmid, abstr_db) for pmid in sampled_pmids]
len(sampled_abstracts)

100000

In [9]:
sampled_embeds = [pubmedncl_emb_obj[pmid] for pmid in sampled_pmids]
len(sampled_embeds)

100000

In [12]:
sampled_embeds[0]

memmap([-7.46442974e-01,  3.35729361e-01, -2.34430373e-01,
        -2.80785382e-01, -1.73664778e-01,  3.13049763e-01,
        -3.62970799e-01,  5.90586662e-01, -2.91971248e-02,
        -5.05837619e-01,  6.55847788e-01,  1.25187624e+00,
        -4.16735560e-01,  1.64797962e-01,  1.06502764e-01,
         1.49024641e-02, -1.61293542e+00, -5.15955389e-01,
         1.44695537e-02,  2.00267136e-01,  7.18324542e-01,
         2.33138371e-02, -3.13486725e-01,  8.97246540e-01,
         3.38288367e-01,  4.17616040e-01, -3.41324598e-01,
         4.10070181e-01, -6.99992120e-01, -1.40942872e+00,
        -1.80755734e-01,  4.63272668e-02,  1.25755385e-01,
         7.47281373e-01, -4.08720136e-01, -4.35202569e-01,
         1.58904180e-01, -3.86107773e-01, -3.18681031e-01,
        -1.17502056e-01, -1.75008178e-02, -1.04922152e+00,
         3.96252185e-01, -7.63067722e-01,  1.59524992e-01,
        -5.22287965e-01, -2.87373573e-01,  1.66103557e-01,
         2.96335638e-01, -1.39071301e-01, -2.54873663e-0

In [10]:
data = {'pmid': sampled_pmids, 'pmid_text': sampled_abstracts, 'pmid_embeddings': sampled_embeds}

df = pd.DataFrame(data)

In [11]:
df.head()

Unnamed: 0,pmid,pmid_text,pmid_embeddings
0,23622459,Mitral annulus calcification and sudden death....,"[-0.746443, 0.33572936, -0.23443037, -0.280785..."
1,35184714,Sociodemographic and Diabetes-Related Risk Fac...,"[0.00021144087, -0.38737687, -0.26618272, 0.10..."
2,33583333,Multispectral and molecular modeling investiga...,"[-0.5143313, -0.6871651, 0.46939507, 0.2673885..."
3,12958355,Seasonal dynamics of previously unknown fungal...,"[0.0639993, -0.31387797, 0.05587074, -0.371296..."
4,23530622,pH-responsive assembly of gold nanoparticles a...,"[-0.18710473, -0.55765355, -0.4391491, 0.46652..."


In [12]:
df.iloc[0]['pmid_embeddings'].shape

(768,)

In [13]:
pmid_dict = {}
for idx, row in df.iterrows():
    pmid_dict.update({row['pmid']: row['pmid_text']})



### Run semrep on abstracts to retrieve pairs

In [14]:
nlm_soft_folder = '/work/acslab/users/manning/NLM_soft'
sr_temp_folder = '/work/acslab/users/manning/semrep_temp_jan_2025'
sr_replace_utf8_path = '/work/acslab/users/manning/NLM_soft/replace_utf8.jar'

In [15]:
t = SemRepHandler(
    nlm_soft_path=nlm_soft_folder,
    temp_folder=sr_temp_folder,
    #restart_mm_services=True,
    replace_utf8_path=sr_replace_utf8_path,
)

Existing services killed.
Starting wsdserverctl: 
started.
loading properties file /work/acslab/users/manning/NLM_soft/public_mm/WSD_Server/config/disambServer.cfg

SKR and WSD services started.
WSD Server initializing disambiguation methods.
WSD Server databases and disambiguation methods have been initialized.


In [16]:
t.sr_binary_path = Path(
    '/work/acslab/users/manning/NLM_soft/public_semrep/bin/semrep.v1.9_2021AB'
)

In [None]:
sr_output_dict = t.ProcessList_parallel(pmid_dict, chunkSize=50)

Run SemRep in interactive mode...
Processing input with replace_utf8.jar utility...


In [21]:
def get_sr_preds(pmid):
    preds = []
    for k,v in sr_output_dict.items():
        if str(pmid) in k:
            preds.append(v.get('relations'))
    return preds



In [22]:
df['sr_preds'] = df['pmid'].progress_apply(get_sr_preds)
df.head()

Unnamed: 0,pmid,pmid_text,pmid_embeddings,sr_preds
0,23622459,Mitral annulus calcification and sudden death....,"[-0.746443, 0.33572936, -0.23443037, -0.280785...","[[], [{'subj_id': 'C0018787', 'subj_name': 'He..."
1,35184714,Sociodemographic and Diabetes-Related Risk Fac...,"[0.00021144087, -0.38737687, -0.26618272, 0.10...","[[], [{'subj_id': 'C0241863', 'subj_name': 'Di..."
2,33583333,Multispectral and molecular modeling investiga...,"[-0.5143313, -0.6871651, 0.46939507, 0.2673885...","[[], [{'subj_id': 'C0036774', 'subj_name': 'se..."
3,12958355,Seasonal dynamics of previously unknown fungal...,"[0.0639993, -0.31387797, 0.05587074, -0.371296...","[[], [], [], [], [], []]"
4,23530622,pH-responsive assembly of gold nanoparticles a...,"[-0.18710473, -0.55765355, -0.4391491, 0.46652...","[[], [], [], [], [], [], [], [], [], [{'subj_i..."


In [23]:
def parse_sr_preds(l):
    pairs = []
    for sl in l:
        if len(sl) < 1:
            continue
        for rel in sl:
            pref_subj_name = rel.get('subj_name')
            pref_obj_name = rel.get('obj_name')
            pairs.append(tuple(sorted([pref_subj_name, pref_obj_name])))
    return pairs


Bad id: 25869982 Error code: 124
Skipped record: 7627597|Measure performance for a greater sense of accountability.

Bad id: 21822087 Error code: 124
Skipped record: 11992685|Terpenes in ethanol: haloperidol permeation and partition through human skin and stratum corneum changes. Carvacrol, linalool and alpha-terpineol (5% w/v) in 50% ethanol were used to enhance the permeation of haloperidol (HP) through human skin in vitro and their enhancement mechanism was investigated with HP-stratum corneum (SC) binding studies, fourier transform infrared spectroscopy (FT-IR) and differential scanning calorimetry (DSC). Carvacrol followed by terpineol and linalool enhanced flux and permeability coefficient but only carvacrol provided the required plasma concentration and the permeated daily doses. All terpenes increased the activity coefficient of HP in the skin. Carvacrol increased the lag time, which could be due to slow redistribution within SC. The thermogram of hydrated SC showed two lipid e

In [24]:
pd.set_option('max_colwidth', None)
df['sr_pairs'] = df['sr_preds'].apply(parse_sr_preds)
df.iloc[2]['sr_preds'], df.iloc[2]['sr_pairs']

([[],
  [{'subj_id': 'C0036774',
    'subj_name': 'serum albumin, bovine',
    'subj_text': 'bovine/human serum albumin',
    'subj_sem_type': 'aapp',
    'subj_negated': False,
    'verb': 'PART_OF',
    'verb_negated': False,
    'obj_id': 'C0086418',
    'obj_name': 'Homo sapiens',
    'obj_text': 'human',
    'obj_sem_type': 'humn',
    'obj_negated': False},
   {'subj_id': 'C0036774',
    'subj_name': 'serum albumin, bovine',
    'subj_text': 'bovine/human serum albumin',
    'subj_sem_type': 'aapp',
    'subj_negated': False,
    'verb': 'INTERACTS_WITH',
    'verb_negated': False,
    'obj_id': 'C0009028',
    'obj_name': 'clopidol',
    'obj_text': 'clopidol',
    'obj_sem_type': 'orch',
    'obj_negated': False}],
  [],
  [],
  [{'subj_id': 'C0036774',
    'subj_name': 'serum albumin, bovine',
    'subj_text': 'BSA/HSA',
    'subj_sem_type': 'aapp',
    'subj_negated': False,
    'verb': 'PART_OF',
    'verb_negated': False,
    'obj_id': 'C0086418',
    'obj_name': 'Homo sapi

Bad id: 9914587 Error code: 124
Skipped record: 29631522|Moral Judgment in Old Age. Younger (21-39 years) and older (63-90 years) adults were presented with scenarios illustrating either harmful or helpful actions. Each scenario provided information about the agent's intention, either neutral or valenced (harmful/helpful), and the outcome of his or her action, either neutral or valenced. Participants were asked to rate how morally good or bad the agent's action was. In judging harmful actions, older participants relied less on intentions and more on outcomes compared to younger participants. This age-related difference was associated with a decline in older adults' theory of mind abilities. However, we did not find evidence of any significant age-related difference in the evaluations of helpful actions. We argue that the selective association of aging with changes in the evaluation of harmful but not helpful actions may be due also to motivational factors and highlight some implication

In [25]:
df.to_pickle('dataset_df.pkl')

Bad id: 21242390 Error code: 124
Skipped record: 19138353|Effects of isovalerate on ruminal fermentation, urinary excretion  of purine derivatives and digestibility in steers. The objective of this study was to evaluate the effects of isovalerate supplementation on rumen fermentation, urinary excretion of purine derivatives and feed digestibility in the total tract of steers. Eight ruminally cannulated Simmental steers were used in a replicated 4 . 4 Latin square experiment. The treatments were: control (without isovalerate), low isovalerate (LIV), medium isovalerate (MIV) and high isovalerate (HIV) dosage of isovalerate at 100, 200 and 300 mg isovalerate per kg dry matter (DM) intake respectively. Diets consisted of corn stover and concentrate (60/40, DM basis). Dry matter intake was approximately 9 kg per day that was 90% of ad libitum intake including 5.4 kg corn stover and 3.6 kg concentrate. Ruminal pH (6.72-6.54) was linearly (p < 0.03) reduced, whereas total volatile fatty acid 

In [4]:
df = pd.read_pickle('dataset_df.pkl')

### Embed pairs with bert-base-uncased

In [5]:
# Load model directly
from transformers import AutoModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained("google-bert/bert-base-uncased")



In [6]:
test_pair = df.iloc[0]['sr_pairs'][0]
test_pair

('Degenerative disorder', 'Heart')

In [7]:
def embed(row):
    pairs = [pair for pair in row]
    embeds = []
    
    for pair in pairs:
        t = []
        for term in pair:
            encoded_dict = tokenizer.encode_plus(
                term,
                add_special_tokens=True,
                max_length=128,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            with torch.no_grad():
                outputs = model(
                    input_ids=encoded_dict['input_ids'],
                    attention_mask=encoded_dict['attention_mask']
                )
            # Extract CLS token embedding
            cls_embeddings = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
            t.append(cls_embeddings)
        embeds.append(tuple(t))

    return embeds

In [8]:
df['sr_pairs_embeddings'] = df['sr_pairs'].progress_apply(embed)
df.head(1)

100%|███████████████████████████████████████| 1000/1000 [41:43<00:00,  2.50s/it]


Unnamed: 0,pmid,pmid_text,pmid_embeddings,sr_preds,sr_pairs,sr_pairs_embeddings
0,23622459,Mitral annulus calcification and sudden death....,"[-0.746443, 0.33572936, -0.23443037, -0.280785...","[[], [{'subj_id': 'C0018787', 'subj_name': 'He...","[(Degenerative disorder, Heart), (Degenerative...","[([-0.34357956, 0.12238737, -0.69660324, -0.57..."


In [9]:
df.to_pickle('dataset_df.pkl')

In [3]:
df = pd.read_pickle('dataset_df.pkl')

### Generate negatives

In [23]:
random.seed(42)
unique_rows = df.sample(n=2, replace=False, random_state=42)
for row in unique_rows.itertuples():
    rand_pair = random.choice(row.sr_pairs)
    rand_term = random.choice(rand_pair)
    print(rand_term)

Bone Transplantation
Resuscitation procedure


In [31]:
def generate_negatives(row, df):
    negative_pairs = []
    negative_embeddings = []

    for subj, obj in row['sr_pairs']:
        unique_rows = df[df['pmid'] != row['pmid']].sample(n=2, replace=False, random_state=42)

        for random_row in unique_rows.itertuples():
            if not random_row.sr_pairs or not isinstance(random_row.sr_pairs, list):
                continue

            rand_pair = random.choice(random_row.sr_pairs)
            if not isinstance(rand_pair, tuple) or len(rand_pair) != 2:
                continue 
                
            rand_term = random.choice(rand_pair)

            negative_pair = (subj, rand_term)
            negative_pairs.append(negative_pair)

            subj_embedding = row['sr_pairs_embeddings'][row['sr_pairs'].index((subj, obj))][0]
            rand_term_index = random_row.sr_pairs.index(rand_pair)
            rand_term_position = rand_pair.index(rand_term)
            rand_term_embedding = random_row.sr_pairs_embeddings[rand_term_index][rand_term_position]
            negative_embeddings.append((subj_embedding, rand_term_embedding))
            
    return negative_pairs, negative_embeddings
    
        

In [32]:
df['negative_sr_pairs'], df['negative_sr_pairs_embeddings'] = zip(*df.apply(lambda row: generate_negatives(row, df), axis=1))


In [34]:
all_pos = 0
all_neg = 0
for idx, row in df.iterrows():
    all_pos += len(row['sr_pairs'])
    all_neg += len(row['negative_sr_pairs'])

all_pos, all_neg

(3250, 3250)

In [35]:
df.to_pickle('dataset_df.pkl')

In [37]:
df = pd.read_pickle('dataset_df.pkl')
df.head()

Unnamed: 0,pmid,pmid_text,pmid_embeddings,sr_preds,sr_pairs,sr_pairs_embeddings,negative_sr_pairs,negative_sr_pairs_embeddings
0,23622459,Mitral annulus calcification and sudden death....,"[-0.746443, 0.33572936, -0.23443037, -0.280785...","[[], [{'subj_id': 'C0018787', 'subj_name': 'He...","[(Degenerative disorder, Heart), (Degenerative...","[([-0.34357956, 0.12238737, -0.69660324, -0.57...","[(Degenerative disorder, Blood), (Degenerative...","[([-0.34357956, 0.12238737, -0.69660324, -0.57..."
1,35184714,Sociodemographic and Diabetes-Related Risk Fac...,"[0.00021144087, -0.38737687, -0.26618272, 0.10...","[[], [{'subj_id': 'C0241863', 'subj_name': 'Di...","[(Diabetic, Persons), (Diabetic, Intervention ...","[([-0.94334424, -0.33993545, -0.54449755, -0.3...","[(Diabetic, Structure of renal vein), (Diabeti...","[([-0.94334424, -0.33993545, -0.54449755, -0.3..."
2,33583333,Multispectral and molecular modeling investiga...,"[-0.5143313, -0.6871651, 0.46939507, 0.2673885...","[[], [{'subj_id': 'C0036774', 'subj_name': 'se...","[(Homo sapiens, serum albumin, bovine), (clopi...","[([-0.69680464, -0.12840356, -0.6038127, 0.246...","[(Homo sapiens, Patients), (clopidol, Patients...","[([-0.69680464, -0.12840356, -0.6038127, 0.246..."
3,12958355,Seasonal dynamics of previously unknown fungal...,"[0.0639993, -0.31387797, 0.05587074, -0.371296...","[[], [], [], [], [], []]",[],[],[],[]
4,23530622,pH-responsive assembly of gold nanoparticles a...,"[-0.18710473, -0.55765355, -0.4391491, 0.46652...","[[], [], [], [], [], [], [], [], [], [{'subj_i...","[(Phagocytes, Tumor cells, malignant)]","[([-0.24778794, 0.6601154, -0.24169184, 0.0484...","[(Phagocytes, Gene Mutation)]","[([-0.24778794, 0.6601154, -0.24169184, 0.0484..."


In [40]:
df = df[df['sr_pairs'].apply(lambda x: isinstance(x, list) and len(x) > 0)]
df.head()

Unnamed: 0,pmid,pmid_text,pmid_embeddings,sr_preds,sr_pairs,sr_pairs_embeddings,negative_sr_pairs,negative_sr_pairs_embeddings
0,23622459,Mitral annulus calcification and sudden death....,"[-0.746443, 0.33572936, -0.23443037, -0.280785...","[[], [{'subj_id': 'C0018787', 'subj_name': 'He...","[(Degenerative disorder, Heart), (Degenerative...","[([-0.34357956, 0.12238737, -0.69660324, -0.57...","[(Degenerative disorder, Blood), (Degenerative...","[([-0.34357956, 0.12238737, -0.69660324, -0.57..."
1,35184714,Sociodemographic and Diabetes-Related Risk Fac...,"[0.00021144087, -0.38737687, -0.26618272, 0.10...","[[], [{'subj_id': 'C0241863', 'subj_name': 'Di...","[(Diabetic, Persons), (Diabetic, Intervention ...","[([-0.94334424, -0.33993545, -0.54449755, -0.3...","[(Diabetic, Structure of renal vein), (Diabeti...","[([-0.94334424, -0.33993545, -0.54449755, -0.3..."
2,33583333,Multispectral and molecular modeling investiga...,"[-0.5143313, -0.6871651, 0.46939507, 0.2673885...","[[], [{'subj_id': 'C0036774', 'subj_name': 'se...","[(Homo sapiens, serum albumin, bovine), (clopi...","[([-0.69680464, -0.12840356, -0.6038127, 0.246...","[(Homo sapiens, Patients), (clopidol, Patients...","[([-0.69680464, -0.12840356, -0.6038127, 0.246..."
4,23530622,pH-responsive assembly of gold nanoparticles a...,"[-0.18710473, -0.55765355, -0.4391491, 0.46652...","[[], [], [], [], [], [], [], [], [], [{'subj_i...","[(Phagocytes, Tumor cells, malignant)]","[([-0.24778794, 0.6601154, -0.24169184, 0.0484...","[(Phagocytes, Gene Mutation)]","[([-0.24778794, 0.6601154, -0.24169184, 0.0484..."
5,27034521,Assessment of universal health coverage for ad...,"[-0.008684545, -0.1903507, 0.29093307, -0.0698...","[[], [], [{'subj_id': 'C0043237', 'subj_name':...","[(Study, World Health Organization), (Outpatie...","[([-0.0928604, 0.39379418, -0.14742967, -0.051...","[(Study, Patients), (Outpatients, Structure of...","[([-0.0928604, 0.39379418, -0.14742967, -0.051..."


In [4]:
df = pd.read_pickle('data/1000_pmid_dataset.pkl')
df.iloc[0]['sr_pairs'], df.iloc[0]['negative_sr_pairs']

([('Degenerative disorder', 'Heart'),
  ('Degenerative disorder', 'Old age'),
  ('Sudden death', 'Valvular stenosis'),
  ('Arteriosclerosis', 'Woman'),
  ('Hypertensive disease', 'Woman'),
  ('Calcinosis', 'Structure of anulus fibrosus of mitral orifice')],
 [('Degenerative disorder', 'Blood'),
  ('Degenerative disorder', 'Structure of renal vein'),
  ('Sudden death', 'Patients'),
  ('Arteriosclerosis', 'Patients'),
  ('Hypertensive disease', 'Life Expectancy Less than Ten Years'),
  ('Calcinosis', 'Blood')])