In [1]:
def assemble_ner_output_flat(ner_outputs):
    """Merge token-level NER predictions into flat entity spans.

    Tokens labelled "O" are skipped. The "B-" / "I-" prefixes are removed
    from the remaining labels and the "Ġ" marker is removed from each word,
    so only the entity type and the character offsets decide whether a token
    extends the span currently being built. It does when the type matches
    and the token starts exactly at the span's end (text appended directly)
    or one character after it (text appended after a single space). Any
    other token closes the open span and starts a new one.

    Args:
        ner_outputs: list of dicts with the keys "label", "word", "start",
            "end" and "score".

    Returns:
        list of dicts with the keys "label", "text", "start", "end" and
        "score", where "score" is the mean of the merged tokens' scores.
    """
    # Work on a shallow copy so the caller's list object is not touched.
    tokens = ner_outputs.copy()
    spans = []
    open_span = None

    for tok in tokens:
        if tok["label"] == "O":
            continue
        kind = tok["label"].replace("B-", "").replace("I-", "")
        piece = tok["word"].replace("Ġ", "")
        confidence = tok["score"]

        # Decide how (if at all) this token attaches to the open span:
        # "" -> directly adjacent, " " -> one character gap, None -> no merge.
        separator = None
        if open_span is not None and open_span["label"] == kind:
            if tok["start"] == open_span["end"]:
                separator = ""
            elif tok["start"] == (open_span["end"] + 1):
                separator = " "

        if separator is not None:
            open_span["text"] += separator + piece
            open_span["end"] = tok["end"]
            open_span["score"].append(confidence)
            continue

        # No merge possible: flush whatever is open and begin a new span.
        if open_span is not None:
            spans.append(open_span)
        open_span = {
            "label": kind,
            "text": piece.strip(),
            "start": tok["start"],
            "end": tok["end"],
            "score": [confidence],
        }

    if open_span:
        spans.append(open_span)

    # Collapse each span's list of token scores into their average.
    for span in spans:
        token_scores = span["score"]
        span["score"] = sum(token_scores) / len(token_scores)

    return spans

def process_ner_results(ner_results):
    """Convert raw pipeline predictions into labelled token records.

    Each input item carries an "entity" string whose last "_"-separated
    field is an integer class id; that id is mapped to its IOB tag name.
    Items whose "entity" is literally 'O' are dropped.

    Args:
        ner_results: iterable of dicts with the keys "entity", "start",
            "end", "score" and "word".

    Returns:
        list of dicts with the keys "start", "end", "label", "score", "word".
    """
    # Class id -> tag name; the position in the tuple is the class id.
    id_to_tag = dict(enumerate((
        "O",
        "B-DISEASE",
        "I-DISEASE",
        "B-MEDICATION",
        "I-MEDICATION",
        "B-PROCEDURE",
        "I-PROCEDURE",
        "B-SYMPTOM",
        "I-SYMPTOM",
    )))

    return [
        {
            'start': item['start'],
            'end': item['end'],
            'label': id_to_tag[int(item['entity'].split('_')[-1])],
            'score': item['score'],
            'word': item['word'],
        }
        for item in ner_results
        if item['entity'] != 'O'
    ]

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Checkpoint directory shared by the tokenizer and the token-classification
# model; kept in one place so both loads cannot drift apart.
NER_MODEL_DIR = "/gpfs/projects/bsc14/code/CardioNER/output/en/cpt_biomed_roberta_token/E120BS32LR1e4/hf"

tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_DIR)
model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_DIR)

# NOTE: the name `pipe` is re-bound to a text-generation pipeline in a later
# cell, while the NER calls further down still use `pipe`; re-run this cell
# before them (or give the two pipelines distinct names).
pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    device=1,  # hard-coded CUDA device index 1; there is no CPU fallback here
)

Device set to use cuda:1


In [3]:
# Clinical-case narrative used as the first test document.
# Raw string: the text contains literal `\%` sequences, which are not valid
# escapes in an ordinary string literal (recent Python versions warn about
# them). The resulting value is exactly the same as before.
example1 = r"""
A 74-year-old man underwent primary percutaneous coronary intervention (pPCI) for a completely occluded proximal left anterior descending artery. During the procedure, microvascular flow distal to the block could not be established (no-reflow phenomenon) and was managed with diltiazem and nicorandil. A follow-up echocardiogram performed on the next day revealed an intraparietal echolucent apicolateral neocavitation corresponding to an intramyocardial haematoma (IMH) along with a reduced ejection fraction (EF) of 30\% compared with 45\% at the time of intervention. At 1 month follow-up, he developed congestive heart failure with a further decline in EF to 25\%. Although the IMH was resolving, the left ventricular apex had ballooned out. At 3 months, he was rehospitalised with a storm of ventricular tachycardia, further deterioration in EF with an apical aneurysm, and eventually passed away.
"""

# Markdown-formatted discharge summary describing the same case as example1.
# It is run through the same NER and entity-linking calls below so the
# entities found in both documents can be compared. The text is kept verbatim
# because the reported entity offsets depend on every character.
example2 = """
**Discharge Summary**

**Patient Information:**
- **Name:** [Patient Name]
- **Age:** 74 years
- **Gender:** Male

**Admission Date:** [Admission Date]  
**Discharge Date:** [Discharge Date]  
**Attending Physician:** [Physician Name]  

**Diagnosis:**
- Complete occlusion of proximal left anterior descending artery
- Intramyocardial hematoma (IMH)
- Congestive heart failure
- Ventricular tachycardia

**Procedure:**
The patient underwent primary percutaneous coronary intervention (pPCI) for a completely occluded proximal left anterior descending artery. During the procedure, the no-reflow phenomenon was encountered, and management included administration of diltiazem and nicorandil.

**Post-Procedure Findings:**
A follow-up echocardiogram conducted the day after the procedure indicated the presence of an intraparietal echolucent apicolateral neocavitation, consistent with an intramyocardial hematoma (IMH). The patient's ejection fraction (EF) was noted to be 30%, a decline from 45% recorded during the intervention.

**Follow-Up:**
At the one-month follow-up, the patient developed congestive heart failure, with further deterioration in EF to 25%.Although the IMH showed signs of resolution, there was notable ballooning of the left ventricular apex.

**Rehospitalization:**
At three months post-procedure, the patient was rehospitalized due to a storm of ventricular tachycardia and additional decline in EF, accompanied by the development of an apical aneurysm.

**Outcome:**
Despite medical interventions, the patient ultimately passed away.

**Recommendations:**
- Continued monitoring and management of cardiac function.
- Consideration for advanced heart failure therapies.

**Prepared by:** [Your Name]  
**Date:** [Date]
"""

In [4]:
# Load a generative (chat) model as a text-generation pipeline.
# NOTE(review): this re-binds the name `pipe`, which an earlier cell bound to
# the NER pipeline and which a later cell still calls on the example texts as
# if it were the NER pipeline. Distinct names would make the notebook safe to
# run top to bottom.
import torch
from transformers import pipeline

# Local snapshot directory of the model to load.
model_id = "/gpfs/projects/bsc14/abecerr1/hub/models--meta-llama--Llama-3.2-3B-Instruct/snapshots/0cb88a4f764b7a12671c53f0838cd831a0843b95"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    # device="cuda",
    device_map="auto",
)

# Token ids that stop generation: the tokenizer's EOS id plus the id of the
# "<|eot_id|>" token. Read as a global by llm_abv_extractor.
terminators = [
    pipe.tokenizer.eos_token_id,
    pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [5]:

def llm_abv_extractor(pipe, text):
    """Ask a chat LLM for the abbreviations in `text` and return its raw reply.

    Args:
        pipe: chat-capable text-generation pipeline. It must be callable with
            a list of messages and expose `.tokenizer` with `eos_token_id`
            and `convert_tokens_to_ids`.
        text: document to scan; it is sent unchanged as the user message.

    Returns:
        The "content" of the last message of the first generated
        conversation. The prompt asks for a dictionary literal of the form
        {'<abbreviation>': '<full form>'}, but the reply is returned as-is
        and is not parsed here.
    """
    # The prompt is kept byte-for-byte (including sentences that run together
    # without a separating space) so replies stay comparable with earlier runs.
    instruction = (
        "Create a dictionary with the abbreviations and their corresponding full form if found in the text."
        " The dictionary should be in the format: {'<abbreviation>': '<full form>'}."
        "Don't add anything else or any comments. Apply it only to abbreviations. Make sure to include all of them and that they are really abbreviations."
        "If no abbreviations are found, return an empty dictionary."
        "Negative examples: 'The patient was diagnosed with diabetes mellitus.' should return an empty dictionary."
    )

    messages = [
        {
            "role": "system",
            "content": instruction
        },
        {
            "role": "user",
            "content": text
        }
    ]

    # Derive the stop tokens from the pipeline that was passed in instead of
    # reading a notebook-level `terminators` global: the function then works
    # with whichever pipeline it receives and no longer depends on another
    # cell having been executed first.
    stop_token_ids = [
        pipe.tokenizer.eos_token_id,
        pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    # NOTE(review): sampling is not switched on (do_sample is left at its
    # default), so `temperature` may have no effect — confirm against the
    # generation documentation.
    outputs = pipe(
        messages,
        max_new_tokens=512,
        temperature=0.01,
        repetition_penalty=1.2,
        # do_sample=True,
        eos_token_id=stop_token_ids,
        pad_token_id=pipe.tokenizer.eos_token_id,
    )

    return outputs[0]["generated_text"][-1]["content"]


In [6]:

# Smoke test on the clinical-case example. Despite its name, `abbr_dict`
# holds the model's raw reply (a string), not a parsed dictionary.
abbr_dict = llm_abv_extractor(pipe, example1)
print(abbr_dict)


{'pPCI': 'primary percutaneous coronary intervention', 
'IMH': 'intrap myocardial hematoma'}


In [11]:
import pandas as pd
import os
from src.data import files_to_df

SOURCE_PATH_ORIG = "data/1_original/txt"
SOURCE_PATH_GEN = "data/2_generated/2step_transformation_dt4h_GPT4omini/en"

def create_pairs(path_orig, path_gen):
    """Pair every original document with its generated counterpart.

    Both folders are read with files_to_df. The "_transformed_step1" suffix
    is removed from the generated "filenameid" values, then the two frames
    are merged on "filenameid".

    Args:
        path_orig: folder holding the original texts.
        path_gen: folder holding the generated texts.

    Returns:
        The merged frame, with "text_orig" renamed to "clinical_case" and
        "text_gen" renamed to "discharge_summary".
    """
    generated = files_to_df(path_gen)
    generated["filenameid"] = generated["filenameid"].str.replace("_transformed_step1", "")

    originals = files_to_df(path_orig)
    paired = originals.merge(generated, on="filenameid", suffixes=("_orig", "_gen"))
    # assert len(df_pairs) == N_EXPECTED_SAMPLES, f"Expected {N_EXPECTED_SAMPLES} samples, got {len(df_pairs)}"

    new_names = {"text_orig": "clinical_case", "text_gen": "discharge_summary"}
    return paired.rename(columns=new_names)

df_pairs = create_pairs(SOURCE_PATH_ORIG, SOURCE_PATH_GEN)

In [22]:
from tqdm import tqdm
tqdm.pandas(desc="Processing")
import json

# df_pairs["abbr_dict"] = df_pairs["clinical_case"].progress_apply(lambda x: llm_abv_extractor(pipe, x))

#### Generate abbreviation dictionary
# df_pairs["abbr_dict"] = df_pairs["clinical_case"].progress_apply(lambda x: llm_abv_extractor(pipe, x))

# import ast
# import json
# ls_dicts_out = []

# for i, row in tqdm(df_pairs.iterrows()):
#     try:
#         abbr_dict = ast.literal_eval(row["abbr_dict"])
#     except:
#         try:
#             abbr_dict = eval(row["abbr_dict"])
#         except:
#             try:
#                 abbr_dict = json.loads(row["abbr_dict"])
#             except:
#                 print(f"Error processing row {i}: {row['abbr_dict']}")
#                 continue
            
#     ls_dicts_out.append(abbr_dict)        
    
# abbv_dict = {}

# for i in ls_dicts_out:
#     abbv_dict.update(i)
    
# d_out = {}
# for k, v in abbv_dict.items():
#     if k is not None and v is not None:
#         if len(list(set(k.lower().split()) & set(v.lower().split()))) == 0:
#             if len(k.split()) <= 2:
#                 d_out[k] = v
        
# json.dump(d_out, open("data/5_abbreviations/cardio_abbv.json", "w"), indent=4)

# Load the abbreviation -> expansion mapping from disk (the file the
# commented-out block above writes). The context manager closes the file
# handle as soon as the JSON has been parsed.
with open("data/5_abbreviations/cardio_abbv.json", "r") as fh:
    d_abbrev = json.load(fh)
d_abbrev

{'ECG': 'electrocardiogram',
 'IgG': 'Immunoglobulin G',
 'HHT': 'Hereditary Hemorrhagic Telangiectasia',
 'NYHA': 'New York Heart Association',
 'NKF': 'National Kidney Foundation',
 'STEMI': 'ST-Segment Elevation Myocardial Infarction',
 'PCI': 'Percutaneous Coronary Intervention',
 'LAD': 'Left Anterior Descending',
 'RCA': 'right coronary artery',
 'CX': 'Circumflex',
 'IVABRADEINE': 'Anti-Arrhythm Drug Ivabradine',
 'CRX': 'chest radiograph',
 'HRCT': 'High-Resolution Computed Tomography',
 'PP': 'pleural plaques',
 'LVEF': 'Left Ventricular Ejection Fraction',
 'CO': 'Carbon Monoxide',
 'Fr': 'French unit',
 'iVAC2L': 'Intra-Aortic Counterpulsation Device',
 'HF': 'Heart Failure',
 'LVD': 'Left Ventricular Assist Device',
 'LVAD': 'Left Ventricular Assist Device',
 'INR': 'International Normalized Ratio',
 'CMV': 'Cytomegalovirus',
 'MTA': 'Thrombotic Microangiopathy',
 'GSA': '',
 'HIV': 'Human Immunodeficiency Virus',
 'PA': 'Pulmonary Artery',
 'PGML': 'picogolulin-like protei

In [11]:
# Run NER on both example documents, map class ids to tag names, then merge
# the tokens into entity spans.
# NOTE(review): in top-to-bottom order `pipe` has by now been re-bound to the
# text-generation pipeline (the cell that loads the generative model), while
# process_ner_results expects token-classification items with the keys
# 'entity', 'start', 'end', 'score' and 'word'. The recorded outputs below
# presumably came from re-running the NER-pipeline cell first — confirm.
ner_results_1 = pipe(example1)
ner_results_2 = pipe(example2)

proc_results_1 = process_ner_results(ner_results_1)
proc_results_2 = process_ner_results(ner_results_2)

results_1 = assemble_ner_output_flat(proc_results_1)
results_2 = assemble_ner_output_flat(proc_results_2)


In [12]:
results_2

[{'label': 'DISEASE',
  'text': 'Complete occlusion of proximal',
  'start': 256,
  'end': 286,
  'score': 0.7898994258471898},
 {'label': 'DISEASE',
  'text': 'anterior',
  'start': 292,
  'end': 300,
  'score': 0.5233062505722046},
 {'label': 'DISEASE',
  'text': 'Intramyocardial hematoma',
  'start': 321,
  'end': 345,
  'score': 0.9866578802466393},
 {'label': 'DISEASE',
  'text': 'IMH',
  'start': 347,
  'end': 350,
  'score': 0.4332329034805298},
 {'label': 'DISEASE',
  'text': 'Congestive heart failure',
  'start': 354,
  'end': 378,
  'score': 0.9912551283836365},
 {'label': 'DISEASE',
  'text': 'Ventricular tachycardia',
  'start': 381,
  'end': 404,
  'score': 0.9940839409828186},
 {'label': 'PROCEDURE',
  'text': 'primary percutaneous coronary intervention',
  'start': 443,
  'end': 485,
  'score': 0.5631036758422852},
 {'label': 'PROCEDURE',
  'text': 'pPCI',
  'start': 487,
  'end': 491,
  'score': 0.5993131399154663},
 {'label': 'DISEASE',
  'text': 'completely occluded p

In [13]:
import pandas as pd
# Unique (label, text) pairs found in the clinical case, flagged with in_cc=1.
df_res_1 = (
    pd.DataFrame(results_1)
    .loc[:, ["label", "text"]]
    .drop_duplicates()
    .assign(in_cc=1)
)
df_res_1

Unnamed: 0,label,text,in_cc
0,PROCEDURE,primary percutaneous coronary intervention,1
1,PROCEDURE,pPCI,1
2,DISEASE,occluded proximal left anterior descending artery,1
3,MEDICATION,diltiazem,1
4,MEDICATION,nicorandil,1
5,PROCEDURE,echocardiogram,1
6,DISEASE,intramyocardial haematoma,1
7,DISEASE,IMH,1
8,DISEASE,congestive heart failure,1
9,DISEASE,H,1


In [14]:
# Unique (label, text) pairs found in the discharge summary, flagged with in_ds=1.
df_res_2 = (
    pd.DataFrame(results_2)
    .loc[:, ["label", "text"]]
    .drop_duplicates()
    .assign(in_ds=1)
)
df_res_2

Unnamed: 0,label,text,in_ds
0,DISEASE,Complete occlusion of proximal,1
1,DISEASE,anterior,1
2,DISEASE,Intramyocardial hematoma,1
3,DISEASE,IMH,1
4,DISEASE,Congestive heart failure,1
5,DISEASE,Ventricular tachycardia,1
6,PROCEDURE,primary percutaneous coronary intervention,1
7,PROCEDURE,pPCI,1
8,DISEASE,completely occluded proximal left anterior des...,1
9,MEDICATION,diltiazem,1


In [15]:
# Outer-join the two entity tables on the exact (label, text) pair; an entity
# missing from one document gets 0 in that document's flag column.
# The match is an exact string comparison, so differently cased spellings
# (e.g. "Congestive heart failure" vs "congestive heart failure") stay on
# separate rows, as the output below shows.
df_tot = df_res_2.merge(df_res_1, on=["label", "text"], how="outer").fillna(0)
df_tot

Unnamed: 0,label,text,in_ds,in_cc
0,DISEASE,Complete occlusion of proximal,1.0,0.0
1,DISEASE,Congestive heart failure,1.0,0.0
2,DISEASE,H,0.0,1.0
3,DISEASE,IMH,1.0,1.0
4,DISEASE,Intramyocardial hematoma,1.0,0.0
5,DISEASE,Ventricular tachycardia,1.0,0.0
6,DISEASE,anterior,1.0,0.0
7,DISEASE,apical aneurysm,1.0,1.0
8,DISEASE,completely occluded proximal left anterior des...,1.0,0.0
9,DISEASE,congestive heart failure,1.0,1.0


In [16]:
# Keep the rows whose two flags sum to at most 1, i.e. entities that were not
# found in both documents.
df_tot[df_tot["in_cc"] + df_tot["in_ds"] <= 1]

Unnamed: 0,label,text,in_ds,in_cc
0,DISEASE,Complete occlusion of proximal,1.0,0.0
1,DISEASE,Congestive heart failure,1.0,0.0
2,DISEASE,H,0.0,1.0
4,DISEASE,Intramyocardial hematoma,1.0,0.0
5,DISEASE,Ventricular tachycardia,1.0,0.0
6,DISEASE,anterior,1.0,0.0
8,DISEASE,completely occluded proximal left anterior des...,1.0,0.0
10,DISEASE,heart failure,1.0,0.0
11,DISEASE,intramyocardial haematoma,0.0,1.0
12,DISEASE,intramyocardial hematoma,1.0,0.0


In [17]:
import re
# Procedures gazetteer, filtered to terms that start with 1-5 upper-case
# letters followed by a parenthesised expansion and more text, e.g.
# "ABR (auditory brainstem response) with click".
# The pattern is a raw string so that `\s` and `\(` reach the regex engine
# instead of being read as (invalid) string escapes.
df_proc_gaz = pd.read_csv("data/4_gazetteers/en/english_procedures_gazetteer.tsv", sep="\t")
df_proc_gaz[df_proc_gaz["term"].apply(lambda x: re.match(r"^[A-Z]{1,5}\s\(.{3,}\).+$", x) is not None)]

Unnamed: 0,code,language,term,semantic_tag,parents,mainterm
2789,1335957003,en,AAV (Adeno-associated virus) antibody,substance,[77767001],0
2815,698451006,en,ABPI (Ankle brachial pressure index) test decl...,situation,[1296859006],0
2817,1230374008,en,ABR (auditory brainstem response) with click,procedure,[252616000],0
2818,1230375009,en,ABR (auditory brainstem response) with tone,procedure,[252616000],0
2829,233169004,en,ACD (automatic cardiac defibrillator) procedure,procedure,[425934009],0
...,...,...,...,...,...,...
225409,15218003,en,WISC (Wechsler Intelligence Scale for Children...,procedure,[72657008],0
225410,237371007,en,WLE (wide local excision) of breast lesion,procedure,"[392023007, 787439005, 773170003]",0
225411,1231286004,en,WLE (wide local excision) of lesion of breast ...,procedure,"[237371007, 710955000]",0
227095,833324006,en,YAG (yttrium aluminum garnet) laser vitreolysis,procedure,"[17348004, 122459003, 608849003, 118903006]",0


In [18]:
# Diseases gazetteer, filtered to terms that start with 1-5 upper-case
# letters followed by a parenthesised expansion and more text, e.g.
# "ACF (asymmetric crying facies) syndrome".
# The pattern is a raw string so that `\s` and `\(` reach the regex engine
# instead of being read as (invalid) string escapes.
df_dis_gaz = pd.read_csv("data/4_gazetteers/en/english_diseases_gazetteer.tsv", sep="\t")
df_dis_gaz[df_dis_gaz["term"].apply(lambda x: re.match(r"^[A-Z]{1,5}\s\(.{3,}\).+$", x) is not None)]


Unnamed: 0,code,language,term,semantic_tag,parents,mainterm
570,246991003,en,AC (anterior chamber) deep,finding,[301933003],0
582,720979002,en,"ACD (alopecia, contracture, dwarfism) mental r...",disorder,"[8654005, 23359005, 85995004, 363185004, 11035...",0
583,720979002,en,"ACD (alopecia, contracture, dwarfism) syndrome",disorder,"[8654005, 23359005, 85995004, 363185004, 11035...",0
595,1345055005,en,ACF (asymmetric crying facies) syndrome,disorder,"[11164009, 205530002, 363070008, 363212003, 65...",0
597,1306301007,en,ACG (angle closure glaucoma) of left eye,disorder,"[12239461000119106, 392291006]",0
...,...,...,...,...,...,...
186711,726572009,en,WBC (white blood cell) cast,morphologic abnormality,"[725685009, 734910007]",0
186727,1186725001,en,"WILD (warts, immunodeficiency, lymphedema, ano...",disorder,"[254199006, 58606001, 699346009, 128937004, 29...",0
186728,1186725001,en,"WILD (warts, immunodeficiency, lymphoedema, an...",disorder,"[254199006, 58606001, 699346009, 128937004, 29...",0
186731,733909005,en,WNT (Wingless) activated medulloblastoma,morphologic abnormality,[1156923005],0


In [19]:
# Symptoms gazetteer, filtered to terms that start with 1-5 upper-case
# letters followed by a parenthesised expansion and more text, e.g.
# "WBC (white blood cell) cast".
# The pattern is a raw string so that `\s` and `\(` reach the regex engine
# instead of being read as (invalid) string escapes.
df_sym_gaz = pd.read_csv("data/4_gazetteers/en/english_symptoms_gazetteer.tsv", sep="\t")
df_sym_gaz[df_sym_gaz["term"].apply(lambda x: re.match(r"^[A-Z]{1,5}\s\(.{3,}\).+$", x) is not None)]

Unnamed: 0,code,language,term,semantic_tag,parents,mainterm
717,246991003,en,AC (anterior chamber) deep,finding,[301933003],0
728,720979002,en,"ACD (alopecia, contracture, dwarfism) mental r...",disorder,"[8654005, 23359005, 85995004, 363185004, 11035...",0
729,720979002,en,"ACD (alopecia, contracture, dwarfism) syndrome",disorder,"[8654005, 23359005, 85995004, 363185004, 11035...",0
742,1345055005,en,ACF (asymmetric crying facies) syndrome,disorder,"[11164009, 205530002, 363070008, 363212003, 65...",0
744,1306301007,en,ACG (angle closure glaucoma) of left eye,disorder,"[12239461000119106, 392291006]",0
...,...,...,...,...,...,...
224756,726572009,en,WBC (white blood cell) cast,morphologic abnormality,"[725685009, 734910007]",0
224793,1186725001,en,"WILD (warts, immunodeficiency, lymphedema, ano...",disorder,"[254199006, 58606001, 699346009, 128937004, 29...",0
224794,1186725001,en,"WILD (warts, immunodeficiency, lymphoedema, an...",disorder,"[254199006, 58606001, 699346009, 128937004, 29...",0
224797,733909005,en,WNT (Wingless) activated medulloblastoma,morphologic abnormality,[1156923005],0


In [47]:
# Count the rows of the symptoms gazetteer whose "term" also occurs in the
# diseases gazetteer (exact string match).
dis_terms = df_dis_gaz["term"].unique()
df_sym_gaz["term"].isin(dis_terms).sum()

182741

In [2]:
from sentence_transformers import SentenceTransformer
model_st = SentenceTransformer("/gpfs/projects/bsc14/abecerr1/hub/models--cambridgeltl--SapBERT-from-PubMedBERT-fulltext-mean-token/snapshots/9f95c2e962719c70f25bf7a1f33bd8d9e9448750", device="cuda:1")
# faiss_index = FaissIndex(model=model_st, gazetteer_path=path_to_gaz, random_seed=SEED, training_data_path=TRAINING_DATA)


No sentence-transformers model found with name /gpfs/projects/bsc14/abecerr1/hub/models--cambridgeltl--SapBERT-from-PubMedBERT-fulltext-mean-token/snapshots/9f95c2e962719c70f25bf7a1f33bd8d9e9448750. Creating a new one with mean pooling.


In [3]:
from src.search_index import FaissIndex
# Search index built from the procedures gazetteer only.
# NOTE(review): the queries further down are the text of every extracted
# mention, including DISEASE and MEDICATION ones, so their nearest neighbours
# can only be procedure-gazetteer terms — confirm this is intended.
faiss_index = FaissIndex(model=model_st, gazetteer_path="data/4_gazetteers/en/english_procedures_gazetteer.tsv", random_seed=0)


In [4]:
faiss_index.generate_search_index()

Batches:   0%|          | 0/7141 [00:00<?, ?it/s]

In [23]:
query_embs_1 = model_st.encode(df_res_1["text"].tolist())
query_embs_2 = model_st.encode(df_res_2["text"].tolist())


In [38]:
_, I1 = faiss_index.search(query_embs_1, k=10)
_, I2 = faiss_index.search(query_embs_2, k=10)

In [39]:
# Translate every row of hit indices returned by the search (I1 for the
# clinical case, I2 for the discharge summary) into gazetteer codes and terms.
# Each mention ends up with one list of candidates, stored in the new
# "faiss_code" / "faiss_term" columns. This adds columns to df_res_1 and
# df_res_2 in place.
ls_codes_1 = [[faiss_index.get_code_by_index(i) for i in row] for row in I1]
ls_terms_1 = [[faiss_index.get_term_by_index(i) for i in row] for row in I1]

df_res_1["faiss_code"] = ls_codes_1
df_res_1["faiss_term"] = ls_terms_1

ls_codes_2 = [[faiss_index.get_code_by_index(i) for i in row] for row in I2]
ls_terms_2 = [[faiss_index.get_term_by_index(i) for i in row] for row in I2]

df_res_2["faiss_code"] = ls_codes_2
df_res_2["faiss_term"] = ls_terms_2

In [46]:
df_res_1.iloc[7]["faiss_term"]

['History of idiopathic intracranial hypertension',
 'Hypothalamic inhibiting factor',
 'ICSH',
 'IM - Intramuscular induction',
 'Intermittent haemofiltration',
 'IM - Intramuscular injection',
 'IMI - Intramuscular injection',
 'Intermittent hemofiltration',
 'Imidazole',
 'Implantable infusion pump']

In [43]:
# Outer-join both entity tables, now including their candidate lists; columns
# other than the join keys get the "_cc" / "_ds" suffix when they clash.
# fillna(0) also replaces the missing list-valued faiss_* cells with the
# integer 0, so those columns hold a mix of lists and 0 (see output below).
df_tot = df_res_1.merge(df_res_2, on=["label", "text"], how="outer",suffixes=("_cc", "_ds")).fillna(0)
df_tot

Unnamed: 0,label,text,in_cc,faiss_code_cc,faiss_term_cc,in_ds,faiss_code_ds,faiss_term_ds
0,DISEASE,Complete occlusion of proximal,0.0,0,0,1.0,"[243208002, 410488006, 278683009, 243209005, 4...",[Orthoptic full time occlusion treatment - tot...
1,DISEASE,Congestive heart failure,0.0,0,0,1.0,"[433305001, 161505003, 395105005, 134378009, 3...","[Family history of congestive heart failure, H..."
2,DISEASE,H,1.0,"[2141009, 2141009, 115744003, 91137000, 700500...","[Hydrogen, H - Hydrogen, H antigen, Heavy hydr...",0.0,0,0
3,DISEASE,IMH,1.0,"[16235071000119108, 44681007, 64182005, 241690...",[History of idiopathic intracranial hypertensi...,1.0,"[16235071000119108, 44681007, 64182005, 241690...",[History of idiopathic intracranial hypertensi...
4,DISEASE,Intramyocardial hematoma,0.0,0,0,1.0,"[36843004, 149213005, 281753007, 36843004, 830...","[Injection of heart, Internal cardiac massage,..."
5,DISEASE,Ventricular tachycardia,0.0,0,0,1.0,"[870252007, 429046004, 1098861000119103, 24315...","[Ventricular tachycardia ablation, History of ..."
6,DISEASE,anterior,0.0,0,0,1.0,"[257481004, 39845007, 264781008, 224748006, 17...","[Anterior lead, Anterior repair, AB, Front doo..."
7,DISEASE,apical aneurysm,1.0,"[233087003, 80377005, 80377005, 10615008, 4902...","[Left ventricular aneurysm operation, Aneurysm...",1.0,"[233087003, 80377005, 80377005, 10615008, 4902...","[Left ventricular aneurysm operation, Aneurysm..."
8,DISEASE,completely occluded proximal left anterior des...,0.0,0,0,1.0,"[1217158001, 86642005, 175212004, 1217158001, ...",[Anastomosis of LIMA (left internal mammary ar...
9,DISEASE,congestive heart failure,1.0,"[433305001, 161505003, 395105005, 134378009, 3...","[Family history of congestive heart failure, H...",1.0,"[433305001, 161505003, 395105005, 134378009, 3...","[Family history of congestive heart failure, H..."


In [33]:
# `ls_codes` / `ls_terms` are not defined anywhere in the visible notebook;
# the candidate lists computed for the clinical case are `ls_codes_1` and
# `ls_terms_1`, so use those to keep the cell runnable from a fresh kernel.
df_res_1["faiss_code"] = ls_codes_1
df_res_1["faiss_term"] = ls_terms_1

In [34]:
df_res_1

Unnamed: 0,label,text,in_cc,faiss_code,faiss_term
0,PROCEDURE,primary percutaneous coronary intervention,1,"[415070008, 415070008, 405741001, 414089002, 8...","[Percutaneous coronary intervention, PCI - Per..."
1,PROCEDURE,pPCI,1,"[415070008, 415070008, 405741001, 103810006, 1...","[Percutaneous coronary intervention, PCI - Per..."
2,DISEASE,occluded proximal left anterior descending artery,1,"[86642005, 1217158001, 1217158001, 1217158001,...",[Anastomosis of descending aorta to left pulmo...
3,MEDICATION,diltiazem,1,"[372793000, 59941008, 59941008, 105163002, 122...","[Diltiazem, Product containing diltiazem, Dilt..."
4,MEDICATION,nicorandil,1,"[395809002, 319304004, 319304004, 387457003, 3...","[Nicorandil, Product containing nicorandil, Ni..."
5,PROCEDURE,echocardiogram,1,"[40701008, 40701008, 40701008, 40701008, 43323...","[Echocardiogram, Echocardiography, Echocardiog..."
6,DISEASE,intramyocardial haematoma,1,"[36843004, 149213005, 36843004, 281753007, 226...","[Injection of heart, Internal cardiac massage,..."
7,DISEASE,IMH,1,"[16235071000119108, 44681007, 64182005, 241690...",[History of idiopathic intracranial hypertensi...
8,DISEASE,congestive heart failure,1,"[433305001, 161505003, 395105005, 134378009, 3...","[Family history of congestive heart failure, H..."
9,DISEASE,H,1,"[2141009, 2141009, 115744003, 91137000, 700500...","[Hydrogen, H - Hydrogen, H antigen, Heavy hydr..."


In [19]:
df_res_1.merge(df_proc_gaz, left_on=["text"], right_on=["term"], how="left")

Unnamed: 0,label,text,in_cc,code,language,term,semantic_tag,parents,mainterm
0,PROCEDURE,primary percutaneous coronary intervention,1,,,,,,
1,PROCEDURE,pPCI,1,,,,,,
2,DISEASE,occluded proximal left anterior descending artery,1,,,,,,
3,MEDICATION,diltiazem,1,,,,,,
4,MEDICATION,nicorandil,1,,,,,,
5,PROCEDURE,echocardiogram,1,,,,,,
6,DISEASE,intramyocardial haematoma,1,,,,,,
7,DISEASE,IMH,1,,,,,,
8,DISEASE,congestive heart failure,1,,,,,,
9,DISEASE,H,1,,,,,,


In [76]:
import numpy as np
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel  


# replace with your own list of entity names
all_names = ["covid-19", "Coronavirus infection", "high fever", "Tumor of posterior wall of oropharynx"] 

_SAPBERT_NAME = "cambridgeltl/SapBERT-from-PubMedBERT-fulltext"
# (tokenizer, model) pairs keyed by device, filled on first use.
_SAPBERT_CACHE = {}


def _load_sapbert(device):
    """Return the (tokenizer, model) pair for `device`, loading it only once."""
    if device not in _SAPBERT_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(_SAPBERT_NAME)
        model = AutoModel.from_pretrained(_SAPBERT_NAME).to(device)
        _SAPBERT_CACHE[device] = (tokenizer, model)
    return _SAPBERT_CACHE[device]


def encode_names(names, device="cuda:0"):
    """Embed a list of names with SapBERT, one vector per name.

    The tokenizer and model are loaded on the first call for a given device
    and reused afterwards, so calling this once per batch (e.g. from a
    batched `map`) no longer reloads the weights every time.

    Args:
        names: list of strings to embed.
        device: device the model and the inputs are moved to.

    Returns:
        NumPy array with one row per input name, taken from position 0 of
        each sequence in the model's first output (the CLS representation).
    """
    tokenizer, model = _load_sapbert(device)

    # Every name is padded / truncated to exactly 25 tokens.
    encodings = tokenizer.batch_encode_plus(names,
                                            padding="max_length",
                                            max_length=25,
                                            truncation=True,
                                            return_tensors="pt")

    # Inference only: skip autograd bookkeeping so no graph is kept alive.
    with torch.no_grad():
        matrix = model(encodings["input_ids"].to(device),
                       attention_mask=encodings["attention_mask"].to(device))

    return matrix[0][:, 0, :].cpu().detach().numpy()


def encode_batch(batch, device="cuda:0"):
    """Embed a batch of names; same behaviour as encode_names.

    Kept as a separate name because existing cells call it.
    """
    return encode_names(batch, device=device)

In [74]:
from datasets import Dataset
ds = Dataset.from_pandas(df_proc_gaz)

In [78]:
ds = ds.map(lambda x: {"embeddings": encode_batch(x["term"])}, batched=True, batch_size=1024)

Map:   0%|          | 0/228500 [00:00<?, ? examples/s]

In [82]:
# Collect the stored embeddings into a single tensor.
# NOTE(review): the recorded output of this cell is empty (": "), which
# suggests it did not finish — possibly the kernel died. Presumably
# `ds["embeddings"]` is materialised as nested Python lists before the
# tensor is built; TODO confirm and consider a NumPy/tensor-formatted read.
emb = torch.tensor(ds["embeddings"])

: 

In [14]:
import pandas as pd
import os
import en_ner_bc5cdr_md
from spacy.language import Language
from scispacy.linking import EntityLinker
from tqdm import tqdm
tqdm.pandas()

import sys
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
# sys.path.append("../src")
# print(os.getcwd())

from src.data import files_to_df
from src.scispacy_ie import entity_linker


# print("There are {} original and {} generated samples".format(len(df_orig), len(df_gen)))

# Load the spaCy pipeline provided by the `en_ner_bc5cdr_md` package.
nlp = en_ner_bc5cdr_md.load()

# Register a component factory named "umls_linker" and add it to `nlp`.
# A ValueError from anywhere inside the try block is reported as
# "Entity linker already exists".
# NOTE(review): a ValueError raised for any other reason inside this block
# would be hidden behind the same message — confirm that is acceptable.
try:    # Register the EntityLinker component
    @Language.factory("umls_linker")
    def create_umls_linker(nlp, name):
        # The `nlp` and `name` arguments are not used; the linker is always
        # built with the same fixed settings.
        return EntityLinker(k=10, max_entities_per_mention=5, name="umls")
    nlp.add_pipe("umls_linker")
    
except ValueError:
    print("Entity linker already exists")
    
# df_ents_orig = df_orig.set_index("filenameid")["text"].swifter.apply(lambda x: entity_linker(nlp, x)).explode().apply(pd.Series)
# df_ents_orig.columns = ["span", "mention_class", "code", "term"]
# df_ents_orig.reset_index(inplace=True)
# df_ents_orig.to_csv(os.path.join(output_path_orig, "ents_orig_scispacy.csv"), index=False)


# df_ents_gen = df_gen.set_index("filenameid")["text"].swifter.apply(lambda x: entity_linker(nlp, x)).explode().apply(pd.Series)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [30]:
def entity_linker(nlp, text, linker_name="umls_linker"):
    """Run `nlp` on `text` and list every entity with its linked concepts.

    This notebook-local definition replaces the `entity_linker` name that an
    earlier cell imports from src.scispacy_ie.

    Args:
        nlp: callable pipeline; `nlp(text)` must expose `.ents` and
            `nlp.get_pipe(linker_name)` must return the linker component.
        text: document to process.
        linker_name: name of the linker component inside `nlp`.

    Returns:
        list of tuples (entity text, entity label, candidate ids, name):
        the ids are the first element of every `kb_ents` candidate, and the
        name is the canonical name of the first candidate. Both are None
        when the entity has no candidates.
    """
    doc = nlp(text)
    linker = nlp.get_pipe(linker_name)

    rows = []
    for ent in doc.ents:
        candidates = ent._.kb_ents
        if candidates != []:
            concept_ids = [candidate[0] for candidate in candidates]
            top_id = candidates[0][0]
            top_name = linker.kb.cui_to_entity[top_id].canonical_name
        else:
            concept_ids = None
            top_name = None
        rows.append((ent.text, ent.label_, concept_ids, top_name))
    return rows
    

In [31]:
pd.DataFrame(entity_linker(nlp, example1))

Unnamed: 0,0,1,2,3
0,no-reflow,DISEASE,[C0232347],No-Reflow Phenomenon
1,diltiazem,CHEMICAL,"[C0012373, C0282138]",diltiazem
2,nicorandil,CHEMICAL,[C0068700],nicorandil
3,intramyocardial haematoma,DISEASE,,
4,IMH,DISEASE,"[C3544398, C0027466]",stress-induced mitochondrial fusion
5,congestive heart failure,DISEASE,[C0018802],Congestive heart failure
6,IMH,DISEASE,"[C3544398, C0027466]",stress-induced mitochondrial fusion
7,ventricular tachycardia,DISEASE,"[C0042514, C0344428]","Tachycardia, Ventricular"
8,aneurysm,DISEASE,"[C0002940, C0751003]",Aneurysm


In [18]:
pd.DataFrame(entity_linker(nlp, example2))

Unnamed: 0,0,1,2,3
0,hematoma,DISEASE,C0018944,Hematoma
1,IMH,DISEASE,C3544398,stress-induced mitochondrial fusion
2,Ventricular tachycardia,DISEASE,C0042514,"Tachycardia, Ventricular"
3,diltiazem,CHEMICAL,C0012373,diltiazem
4,nicorandil,CHEMICAL,C0068700,nicorandil
5,intramyocardial hematoma,DISEASE,,
6,IMH,DISEASE,C3544398,stress-induced mitochondrial fusion
7,congestive heart failure,DISEASE,C0018802,Congestive heart failure
8,IMH,DISEASE,C3544398,stress-induced mitochondrial fusion
9,ventricular tachycardia,DISEASE,C0042514,"Tachycardia, Ventricular"


In [23]:
df_res_2

Unnamed: 0,label,text,in_ds
0,DISEASE,Complete occlusion of proximal,1
1,DISEASE,anterior,1
2,DISEASE,Intramyocardial hematoma,1
3,DISEASE,IMH,1
4,DISEASE,Congestive heart failure,1
5,DISEASE,Ventricular tachycardia,1
6,PROCEDURE,primary percutaneous coronary intervention,1
7,PROCEDURE,pPCI,1
8,DISEASE,completely occluded proximal left anterior des...,1
9,MEDICATION,diltiazem,1


In [78]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# classification_report takes (y_true, y_pred): here the discharge-summary
# flags act as the reference and the clinical-case flags as the prediction.
# Both columns come from the outer join stored in df_tot.
print(classification_report(df_tot["in_ds"], df_tot["in_cc"]))



              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         3
         1.0       0.75      0.50      0.60        18

    accuracy                           0.43        21
   macro avg       0.38      0.25      0.30        21
weighted avg       0.64      0.43      0.51        21



In [79]:
import numpy as np
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel  

# NOTE: this re-binds the notebook-level names `tokenizer` and `model`, which
# an earlier cell bound to the NER tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext")
model = AutoModel.from_pretrained("cambridgeltl/SapBERT-from-PubMedBERT-fulltext").cuda()

# replace with your own list of entity names
all_names = ["covid-19", "Coronavirus infection", "high fever", "Tumor of posterior wall of oropharynx"]

bs = 128 # batch size during inference
all_embs = []
# Inference only: without no_grad every forward pass would record an autograd
# graph that is never used.
with torch.no_grad():
    for i in tqdm(range(0, len(all_names), bs)):
        toks = tokenizer.batch_encode_plus(all_names[i:i+bs],
                                           padding="max_length",
                                           max_length=25,
                                           truncation=True,
                                           return_tensors="pt")
        # Move every tokenizer output to the GPU the model lives on.
        toks_cuda = {k: v.cuda() for k, v in toks.items()}
        cls_rep = model(**toks_cuda)[0][:,0,:] # use CLS representation as the embedding
        all_embs.append(cls_rep.cpu().detach().numpy())

# Stack the per-batch arrays into one (number of names x hidden size) matrix.
all_embs = np.concatenate(all_embs, axis=0)
all_embs

tokenizer_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

array([[-0.64542013, -0.38714704, -0.21302737, ...,  0.23982963,
         0.80409414,  0.423348  ],
       [-1.188549  , -0.27455914,  0.28520218, ..., -0.12636967,
         0.8062242 ,  0.06786634],
       [-0.13091153,  0.4398695 , -0.14277072, ..., -0.317846  ,
         0.37847674,  0.15484798],
       [-0.86763877,  0.00546662, -0.38052037, ...,  0.18259029,
         0.8963129 , -0.2953902 ]], dtype=float32)

In [None]:
# def assemble_ner_output_iob(ner_outputs):
#     entities = []
#     current_entity = None

#     for token in ner_outputs:
#         label = token["label"]
#         word = token["word"].replace("Ġ", " ")  # Handle byte-level tokens (e.g., RoBERTa)
#         score = token["score"]

#         if label.startswith("B-"):
#             if current_entity:
#                 # Save previous entity
#                 entities.append(current_entity)
#             # Start new entity
#             current_entity = {
#                 "label": label[2:],
#                 "text": word.strip(),
#                 "start": token["start"],
#                 "end": token["end"],
#                 "score": [score],
#             }
#         elif label.startswith("I-") and current_entity and label[2:] == current_entity["label"]:
#             # Continue current entity
#             current_entity["text"] += word
#             current_entity["end"] = token["end"]
#             current_entity["score"].append(score)
#         else:
#             if current_entity:
#                 entities.append(current_entity)
#                 current_entity = None
#             # You could also handle 'O' labels here if needed

#     if current_entity:
#         entities.append(current_entity)

#     # Average the scores
#     for ent in entities:
#         ent["score"] = sum(ent["score"]) / len(ent["score"])

#     return entities




# NOTE(review): `proc_results` is not defined in the visible cells (only
# `proc_results_1` / `proc_results_2` are) — presumably left over from an
# earlier session; confirm which input this call should use.
assemble_ner_output_flat(proc_results)

[{'label': 'SYMPTOM',
  'text': 'chest pain',
  'start': 23,
  'end': 33,
  'score': 0.9915495216846466},
 {'label': 'DISEASE',
  'text': 'revascularised ischaemic heart disease',
  'start': 68,
  'end': 106,
  'score': 0.9271806627511978},
 {'label': 'DISEASE',
  'text': 'drug allergies',
  'start': 161,
  'end': 175,
  'score': 0.9905887842178345},
 {'label': 'DISEASE',
  'text': 'toxic habits',
  'start': 180,
  'end': 192,
  'score': 0.988537460565567},
 {'label': 'DISEASE',
  'text': 'Diabetes mellitus type 2',
  'start': 194,
  'end': 218,
  'score': 0.9867602944374084},
 {'label': 'DISEASE',
  'text': 'Hypercholesterolemia type dyslipidaemia',
  'start': 248,
  'end': 287,
  'score': 0.96442209482193},
 {'label': 'PROCEDURE',
  'text': 'hygienic-dietary measures',
  'start': 308,
  'end': 333,
  'score': 0.9714225729306539},
 {'label': 'SYMPTOM',
  'text': 'HIV positive',
  'start': 335,
  'end': 347,
  'score': 0.6786492764949799},
 {'label': 'DISEASE',
  'text': 'Pneumocystis 