In [1]:
import numpy
import json
from pathlib import Path
from importlib import reload
import sys
import os
sys.path.append(os.path.abspath('..'))
import pandas as pd
import random
from transformers import AutoModel, AutoTokenizer
from utils.SemrepOutputLoader import SemrepOutputLoader
from utils.TermEmbedder import TermEmbedder
from HGCR_util.lazy_json_kv_loader import LazyJsonlAbstractLoader
from sklearn.model_selection import train_test_split
from HGCR_util import text_util
from HGCR_util.emb_lookup.abstr_emb_lookup import MedlineNumpyEmbeddings
from tqdm import tqdm
random.seed(42)

tqdm.pandas()

In [2]:
import importlib
import utils

importlib.reload(utils.SemrepOutputLoader)

<module 'utils.SemrepOutputLoader' from '/work/acslab/users/manning/NLM_soft/agatha_0724/BioPredClassifier/utils/SemrepOutputLoader.py'>

In [3]:
num_samples = 60000


In [4]:
embeddings_fpath = Path(
    '/work/acslab/shared/medline_embeddings/PubMedNCL_abstr/abstr_id_to_emb_PubMedNCL'
)

pubmedncl_emb_obj = MedlineNumpyEmbeddings(
    emb_w_ids_fpath=embeddings_fpath,
    json_read_n_jobs=8,
)


Opening np chunks: 100%|█████████████████████| 102/102 [00:00<00:00, 915.34it/s]
Opening json index chunks: 100%|██████████████| 102/102 [00:10<00:00,  9.29it/s]
Constructing PMID lookup index: 100%|█████████| 102/102 [00:14<00:00,  7.28it/s]


In [5]:
sent_jsonl_dir = Path(
    '/work/acslab/shared/Agatha_shared/'
    'pmid_to_sent_id_w_text_kv_jsonl_lazy_chunks'
)

abstr_db = LazyJsonlAbstractLoader(
    list(sent_jsonl_dir.glob('*jsonl'))
)

Indexing jsonl files: 100%|███████████████████| 109/109 [03:57<00:00,  2.18s/it]


In [6]:
sr_output_dict = SemrepOutputLoader('/work/acslab/shared/rel_extraction/semrep_data/data_2021_onw/agatha_medline_sentences')
sr_output_dict._load_single_pkl_file('pubmed25n1064_sentences.pkl')
sr_output_dict._load_single_pkl_file('pubmed25n1066_sentences.pkl')
sr_output_dict._load_single_pkl_file('pubmed25n1068_sentences.pkl')
sr_output_dict._load_single_pkl_file('pubmed25n1070_sentences.pkl')
sr_output_dict._load_single_pkl_file('pubmed25n1072_sentences.pkl')
sr_output_dict._load_single_pkl_file('pubmed25n1074_sentences.pkl')
sr_output_dict._load_single_pkl_file('pubmed25n1076_sentences.pkl')


In [7]:
sr_output_dict._load_single_pkl_file('pubmed25n1078_sentences.pkl')
sr_output_dict._load_single_pkl_file('pubmed25n1080_sentences.pkl')
sr_output_dict._load_single_pkl_file('pubmed25n1082_sentences.pkl')
sr_output_dict._load_single_pkl_file('pubmed25n1084_sentences.pkl')

In [12]:
sampled_pmids = list(sr_output_dict.data_dict.keys())
filtered_sampled_pmids = [i for i in sampled_pmids if len(sr_output_dict[i][0]['relations']) >= 1]
len(sampled_pmids)
# sr_output_dict.data_dict['33383987']

194877

In [14]:
total_pairs = 0
total_pmids = 0
for pmids, item in sr_output_dict.data_dict.items():
    total_pmids += 1
    terms = 0
    for term in item[0].get('terms'):
        terms += 1
    total_pairs += (terms * (terms-1)) / 2


In [15]:
total_pairs/total_pmids

35.182622885204516

In [16]:
sampled_abstracts = {pmid: text_util.get_abstr_text(pmid, abstr_db) for pmid in tqdm(sampled_pmids, desc="Retrieving Abstracts...")}
len(sampled_abstracts)

Retrieving Abstracts...: 100%|████████| 194877/194877 [00:39<00:00, 4949.83it/s]


194877

In [17]:
sr_output_dict.data_dict['33383987']

[{'terms': {'cases',
   'catalytic activity',
   'contributions',
   'cooling',
   'degrees C',
   'dependence',
   'entropic',
   'fold',
   'highest',
   'hydride',
   'increase',
   'individual',
   'kinetic barrier',
   'lead',
   'outer',
   'sphere',
   'splitting',
   'temperature',
   'tetrahydrofuran',
   'toluene',
   'unique'},
  'relations': {('individual', 'kinetic barrier'),
   ('lead', 'tetrahydrofuran')},
  'negatives': {('cases', 'catalytic activity'),
   ('cases', 'contributions'),
   ('cases', 'cooling'),
   ('cases', 'degrees C'),
   ('cases', 'dependence'),
   ('cases', 'entropic'),
   ('cases', 'fold'),
   ('cases', 'highest'),
   ('cases', 'hydride'),
   ('cases', 'increase'),
   ('cases', 'individual'),
   ('cases', 'kinetic barrier'),
   ('cases', 'lead'),
   ('cases', 'outer'),
   ('cases', 'sphere'),
   ('cases', 'splitting'),
   ('cases', 'temperature'),
   ('cases', 'tetrahydrofuran'),
   ('cases', 'toluene'),
   ('cases', 'unique'),
   ('catalytic activity

In [18]:
sampled_embeds = {}
for pmid in sampled_pmids:
    try:
        sampled_embeds[pmid] = pubmedncl_emb_obj[pmid]
    except:
        continue
len(sampled_embeds)

194874

In [21]:
sr_terms = {pmid: sr_output_dict[pmid][0].get('terms') for pmid in tqdm(sampled_pmids, desc="Retrieving Semrep terms...")}
sr_pairs = {pmid: sr_output_dict[pmid][0].get('relations') for pmid in tqdm(sampled_pmids, desc="Retrieving Semrep pairs...")}



[A[Aving Semrep terms...:   0%|                    | 0/194877 [00:00<?, ?it/s]

[A[Aving Semrep terms...:  46%|█▊  | 90038/194877 [00:00<00:00, 900335.78it/s]

Retrieving Semrep terms...: 100%|███| 194877/194877 [00:00<00:00, 820796.92it/s]


[A[Aving Semrep pairs...:   0%|                    | 0/194877 [00:00<?, ?it/s]

[A[Aving Semrep pairs...:  48%|█▉  | 94344/194877 [00:00<00:00, 943375.67it/s]

Retrieving Semrep pairs...: 100%|███| 194877/194877 [00:00<00:00, 901498.52it/s]


In [22]:
all_terms = []
for terms in sr_terms.values():
    for term in terms:
        all_terms.append(term)
for pairs in sr_pairs.values():
    for t1, t2 in pairs:
        all_terms.append(t1)
        all_terms.append(t2)
all_terms = list(set(all_terms))
len(all_terms), all_terms[0]

(202682, 'waiting list')

In [23]:
term_embedder = TermEmbedder()

In [24]:
term_embeddings = term_embedder._batch_term_embeds(all_terms)


In [15]:
import pickle
with open('term_embeddings/term_embeddings_60k_pmid.pkl', 'wb') as file:
    pickle.dump(term_embeddings, file)

FileNotFoundError: [Errno 2] No such file or directory: 'term_embeddings/term_embeddings_60k_pmid.pkl'

In [25]:
pmids_w_all_info = [pmid for pmid in sampled_pmids if pmid in sampled_abstracts and pmid in sampled_embeds]
len(pmids_w_all_info)

194874

In [26]:
df = pd.DataFrame({
    'pmid': [pmid for pmid in pmids_w_all_info],
    'abstract': [sampled_abstracts[pmid] for pmid in pmids_w_all_info],
    'abstract_embeddings': [sampled_embeds[pmid] for pmid in pmids_w_all_info],
    'terms': [list(sr_output_dict[pmid][0]['terms']) for pmid in pmids_w_all_info],
    'relations': [list(sr_output_dict[pmid][0]['relations']) for pmid in pmids_w_all_info],
    'negatives': [list(sr_output_dict[pmid][0]['negatives']) for pmid in pmids_w_all_info],

})

df.head()

Unnamed: 0,pmid,abstract,abstract_embeddings,terms,relations,negatives
0,33383987,Temperature and Solvent Effects on H2 Splittin...,"[-0.28345045, -0.54292804, 0.16380529, 0.15494...","[fold, lead, cooling, tetrahydrofuran, splitti...","[(individual, kinetic barrier), (lead, tetrahy...","[(fold, tetrahydrofuran), (cases, toluene), (e..."
1,33383988,Mechanism for Rapid Conversion of Amines to Am...,"[-0.2850943, -0.80024165, -0.11266672, 0.14250...","[Particle, Conversion, Amines, Ammonium Salts,...","[(Amines, Ammonium Salts)]","[(Particle, Rapid), (Conversion, Rapid), (Conv..."
2,33383989,Bisecting GlcNAc Protein N-Glycosylation Is Ch...,"[-0.040046073, -1.0489765, -0.37665504, -0.493...","[Protein N-Glycosylation, Adipogenesis, Human,...","[(Adipogenesis, Human)]","[(Adipogenesis, Protein N-Glycosylation), (Cha..."
3,33383992,Association of Exposure to Cattle with Self-Re...,"[-0.310526, -0.5039563, 0.6601208, -0.08117764...","[relationship, human, study, bovine tuberculos...","[(health, human)]","[(health, relationship), (bovine tuberculosis,..."
4,33383995,The Effect of Preoperative Video Based Pain Tr...,"[0.091471456, -0.07006314, -0.11821665, -0.306...","[Analgesic, Control Group, Effect, Video, Pain...","[(Postoperative Pain, Total Knee Arthroplasty)...","[(Analgesic, Postoperative Pain), (Effect, Vid..."


In [28]:
def map_terms(row):
    terms = row['terms']
    embeds = []
    for term in terms:
        embeds.append(term_embeddings[term])
    return embeds

def map_pairs(row):
    pairs = row['relations']
    pair_embeds = []
    for t1, t2 in pairs:
        t1_embed = term_embeddings[t1]
        t2_embed = term_embeddings[t2]
        pair_embeds.append((t1_embed, t2_embed))
    return pair_embeds

def map_negatives(row):
    negatives = row['negatives']
    negative_embeds = []
    for t1, t2 in negatives:
        t1_embed = term_embeddings[t1]
        t2_embed = term_embeddings[t2]
        negative_embeds.append((t1_embed, t2_embed))
    return negative_embeds
        

In [31]:
df['term_embeddings'] = df.progress_apply(map_terms, axis=1)
df['pair_embeddings'] = df.progress_apply(map_pairs, axis=1)
df['negatives_embeddings'] = df.progress_apply(map_negatives, axis=1)
df.head()

100%|███████████████████████████████| 194874/194874 [00:01<00:00, 104433.05it/s]
100%|███████████████████████████████| 194874/194874 [00:01<00:00, 108145.89it/s]
100%|████████████████████████████████| 194874/194874 [00:03<00:00, 50372.67it/s]


Unnamed: 0,pmid,abstract,abstract_embeddings,terms,relations,negatives,term_embeddings,pair_embeddings,negatives_embeddings
0,33383987,Temperature and Solvent Effects on H2 Splittin...,"[-0.28345045, -0.54292804, 0.16380529, 0.15494...","[fold, lead, cooling, tetrahydrofuran, splitti...","[(individual, kinetic barrier), (lead, tetrahy...","[(fold, tetrahydrofuran), (cases, toluene), (e...","[[-0.28223613, -0.0036575377, 0.08042424, -0.1...","[([-0.7495256, 0.36571825, -0.84759635, -0.030...","[([-0.28223613, -0.0036575377, 0.08042424, -0...."
1,33383988,Mechanism for Rapid Conversion of Amines to Am...,"[-0.2850943, -0.80024165, -0.11266672, 0.14250...","[Particle, Conversion, Amines, Ammonium Salts,...","[(Amines, Ammonium Salts)]","[(Particle, Rapid), (Conversion, Rapid), (Conv...","[[-0.5166602, 0.11687658, -0.09206407, -0.0107...","[([-0.34281886, 0.30733573, -0.13176091, 0.137...","[([-0.5166602, 0.11687658, -0.09206407, -0.010..."
2,33383989,Bisecting GlcNAc Protein N-Glycosylation Is Ch...,"[-0.040046073, -1.0489765, -0.37665504, -0.493...","[Protein N-Glycosylation, Adipogenesis, Human,...","[(Adipogenesis, Human)]","[(Adipogenesis, Protein N-Glycosylation), (Cha...","[[-0.7968025, -0.20326202, -0.566039, -0.13663...","[([-0.21563068, 0.16044843, -0.39353308, -0.15...","[([-0.21563068, 0.16044843, -0.39353308, -0.15..."
3,33383992,Association of Exposure to Cattle with Self-Re...,"[-0.310526, -0.5039563, 0.6601208, -0.08117764...","[relationship, human, study, bovine tuberculos...","[(health, human)]","[(health, relationship), (bovine tuberculosis,...","[[-0.25283906, 0.11320739, -0.30629858, -0.038...","[([0.006518982, 0.25178033, -0.0104505485, -0....","[([0.006518982, 0.25178033, -0.0104505485, -0...."
4,33383995,The Effect of Preoperative Video Based Pain Tr...,"[0.091471456, -0.07006314, -0.11821665, -0.306...","[Analgesic, Control Group, Effect, Video, Pain...","[(Postoperative Pain, Total Knee Arthroplasty)...","[(Analgesic, Postoperative Pain), (Effect, Vid...","[[-0.51707596, 0.17515965, -0.48447058, -0.293...","[([-0.38955075, 0.314926, -0.6473092, -0.47112...","[([-0.51707596, 0.17515965, -0.48447058, -0.29..."


In [20]:
from itertools import combinations

def generate_negative_samples(row):
    terms = set(row['terms'])
    pairs = {tuple(sorted(pair)) for pair in row['relations']}

    all_possible_negatives = {tuple(sorted(pair)) for pair in combinations(terms, 2)}

    negative_pairs = list(all_possible_negatives - pairs)

    negative_pair_embeddings = [(term_embeddings[pair[0]], term_embeddings[pair[1]]) for pair in negative_pairs]

    return pd.Series([negative_pairs, negative_pair_embeddings])

df[['negative_pairs', 'negative_pair_embeddings']] = df.progress_apply(generate_negative_samples, axis=1)

100%|██████████████████████████████████| 194880/194880 [03:20<00:00, 972.42it/s]


In [21]:
pos = 0
neg = 0
for idx, row in df.iterrows():
    num_pos = len(row['relations'])
    num_neg = len(row['negative_pairs'])
    pos += num_pos
    neg += num_neg

neg/pos

84.99117967344438

In [22]:
df.head()

Unnamed: 0,pmid,abstract,abstract_embeddings,terms,relations,term_embeddings,pair_embeddings,negative_pairs,negative_pair_embeddings
0,33383987,Temperature and Solvent Effects on H2 Splittin...,"[-0.28345045, -0.54292804, 0.16380529, 0.15494...","[unique, dependence, increase, kinetic barrier...","[(kinetic barrier, individual), (tetrahydrofur...","[[-0.17409305, 0.14344533, -0.08849688, 0.0577...","[([-0.3733754, 0.3167213, -0.109789714, 0.1208...","[(catalytic activity, degrees C), (splitting, ...","[([-0.5648823, 0.017226646, -0.004290537, -0.3..."
1,33383988,Mechanism for Rapid Conversion of Amines to Am...,"[-0.2850943, -0.80024165, -0.11266672, 0.14250...","[SA, amines, heterogeneous, more, Born, scales...","[(Amines, Ammonium Salts), (methylamine, sulfu...","[[-0.2324654, 0.15819652, 0.13121484, -0.16349...","[([-0.34281886, 0.30733573, -0.13176091, 0.137...","[(Rapid, methylamine), (effect, sulfuric acid)...","[([-0.5999069, 0.08327565, -0.2641165, -0.0731..."
2,33383989,Bisecting GlcNAc Protein N-Glycosylation Is Ch...,"[-0.040046073, -1.0489765, -0.37665504, -0.493...","[structural, Protein N-Glycosylation, total, m...","[(adipose-derived stem cells, cell types), (gl...","[[-0.36078623, 0.14125407, -0.13912392, -0.101...","[([-0.54336065, -0.51505405, -0.42755878, 0.51...","[(PGC-LC, glycan), (protein N, three), (adipoc...","[([-0.79083776, -0.288387, -0.16368967, -0.216..."
3,33383992,Association of Exposure to Cattle with Self-Re...,"[-0.310526, -0.5039563, 0.6601208, -0.08117764...","[One Health, bovine tuberculosis, study, human...","[(health, human)]","[[-0.25943512, -0.07382132, -0.3403496, -0.090...","[([0.006518982, 0.25178033, -0.0104505485, -0....","[(health, study), (One Health, health), (bovin...","[([0.006518982, 0.25178033, -0.0104505485, -0...."
4,33383995,The Effect of Preoperative Video Based Pain Tr...,"[0.091471456, -0.07006314, -0.11821665, -0.306...","[preoperative, video, pain levels, operation, ...","[(Total Knee Arthroplasty, Patients), (total k...","[[-0.34140453, -0.05907702, -0.2574163, -0.370...","[([-0.8438509, 0.14522868, -0.4953759, -0.1401...",[(non-randomized control group intervention st...,"[([-0.49223354, -0.864952, -0.7692929, -0.1749..."


In [32]:
chunk_size = 1000
output_dir = '60k_pmid_dataset_chunks_sentencewise_negatives'
os.makedirs(output_dir, exist_ok=True)

for i, start in enumerate(range(0, len(df), chunk_size)):
    df_chunk = df.iloc[start:start+chunk_size]
    chunk_path = os.path.join(output_dir, f"chunk_{i}.pkl")
    df_chunk.to_pickle(chunk_path)
    print(f"Saved {chunk_path}")

Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_0.pkl
Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_1.pkl
Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_2.pkl
Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_3.pkl
Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_4.pkl
Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_5.pkl
Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_6.pkl
Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_7.pkl
Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_8.pkl
Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_9.pkl
Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_10.pkl
Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_11.pkl
Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_12.pkl
Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_13.pkl
Saved 60k_pmid_dataset_chunks_sentencewise_negatives/chunk_14.pkl
Saved 60k_pmid_datas

In [61]:
len(df.iloc[0]['pair_embeddings'])

2

In [69]:
df.iloc[1000]['pair_embeddings'][0][0].shape

(768,)

In [70]:
df.head(1)

Unnamed: 0,pmid,abstract,abstract_embeddings,terms,relations,term_embeddings,pair_embeddings,negative_relations,negative_relation_embeddings
0,33439050,Radiohistologic Comparison Study of Temporal B...,"[0.1367508, -0.20444079, -0.7152996, -0.035280...","[specimen, data, inner, Cochlear Implant, coch...","[(spiral lamina, osseous), (bone structures, o...","[[-0.55541235, -0.023848163, -0.04390437, -0.0...","[([-0.3612353, -0.29702, 0.16018414, -0.321839...","[(Cone-Beam CT, implantation), (data, images),...","[([-0.7141436, -0.29975635, 0.07302878, -0.255..."
