In [1]:
def assemble_ner_output_flat(ner_outputs):
    """Merge token-level NER predictions into flat (non-nested) entity spans.

    Tokens are visited in order. A token extends the entity currently being
    built when it carries the same label and starts either exactly where the
    entity ends (text is concatenated) or one character later (text is joined
    with a single space). Any other token closes the current entity and opens
    a new one. Tokens labelled ``"O"`` are ignored.

    Labels may be plain (``"DISEASE"``) or IOB-prefixed (``"B-DISEASE"`` /
    ``"I-DISEASE"``). Only a *leading* ``B-``/``I-`` is stripped, and a ``B-``
    token always opens a new entity, even directly after an entity of the
    same type.

    Args:
        ner_outputs: iterable of dicts with the keys ``"label"``, ``"word"``,
            ``"start"``, ``"end"`` and ``"score"``. The input is only read,
            never modified.

    Returns:
        list of dicts with the keys ``"label"``, ``"text"``, ``"start"``,
        ``"end"`` and ``"score"`` (mean score of the merged tokens), in the
        order the entities were encountered.
    """
    entities = []
    current_entity = None

    for token in ner_outputs:
        raw_label = token["label"]
        if raw_label == "O":
            continue

        # A "B-" tag marks the first token of an entity; strip the IOB prefix
        # only when it really is a prefix.
        starts_new = raw_label.startswith("B-")
        label = raw_label[2:] if raw_label.startswith(("B-", "I-")) else raw_label

        # Drop the "Ġ" marker from the token text (presumably the tokenizer's
        # leading-space marker — spacing is rebuilt from the offsets instead).
        word = token["word"].replace("Ġ", "")
        score = token["score"]

        extends_current = (
            current_entity is not None
            and not starts_new
            and current_entity["label"] == label
        )
        # Characters between the running entity and this token.
        gap = token["start"] - current_entity["end"] if extends_current else None

        if gap == 0 or gap == 1:
            # gap 0: touching tokens, glue them; gap 1: one separator
            # character in between, rendered as a single space.
            current_entity["text"] += word if gap == 0 else " " + word
            current_entity["end"] = token["end"]
            current_entity["score"].append(score)
        else:
            # Different label, explicit "B-" tag, or a larger gap: close the
            # running entity (if any) and open a new one.
            if current_entity is not None:
                entities.append(current_entity)
            current_entity = {
                "label": label,
                "text": word.strip(),
                "start": token["start"],
                "end": token["end"],
                "score": [score],
            }

    if current_entity is not None:
        entities.append(current_entity)

    # Collapse the per-token scores of every entity into their mean.
    for ent in entities:
        ent["score"] = sum(ent["score"]) / len(ent["score"])

    return entities

def process_ner_results(ner_results):
    """Convert raw token-classification predictions into labelled token dicts.

    The class id is read from the integer suffix after the last ``_`` of each
    item's ``"entity"`` (e.g. ``"LABEL_3"``) and mapped to a tag name of the
    non-IOB label set below. Predictions for the outside class are dropped,
    whether they arrive as the literal ``"O"`` or as class id 0.

    Args:
        ner_results: iterable of dicts with the keys ``"entity"``, ``"start"``,
            ``"end"``, ``"score"`` and ``"word"``.

    Returns:
        list of dicts with the keys ``"start"``, ``"end"``, ``"label"``,
        ``"score"`` and ``"word"``, in input order, containing entity tokens
        only.

    Raises:
        ValueError: if an entity name does not end in an integer id.
        KeyError: if the id is not part of the label set.
    """
    # Class id -> tag name (one class per entity type, no B-/I- split).
    label2tag = {
        0: "O",
        1: "DISEASE",
        2: "MEDICATION",
        3: "PROCEDURE",
        4: "SYMPTOM",
    }

    proc_results = []
    for result in ner_results:
        entity = result["entity"]
        if entity == "O":
            continue

        label = label2tag[int(entity.split("_")[-1])]
        if label == "O":
            # An id-style name such as "LABEL_0" is never equal to "O", so the
            # outside class has to be filtered after the id is mapped.
            continue

        proc_results.append({
            "start": result["start"],
            "end": result["end"],
            "label": label,
            "score": result["score"],
            "word": result["word"],
        })

    return proc_results

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import re

# Directory of the fine-tuned CardioNER token-classification checkpoint; the
# tokenizer and the model weights are both loaded from it.
NER_MODEL_DIR = "/gpfs/projects/bsc14/code/CardioNER/output/en/NOIOB_cpt_biomed_roberta_token/E120BS32LR2e5/hf"

tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_DIR)
model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_DIR)

# Token-level NER pipeline. `device=1` hard-codes GPU index 1 (the run log
# reports "cuda:1"); nothing here checks whether that device exists.
pipe = pipeline("ner", model=model, tokenizer=tokenizer, device=1)

Device set to use cuda:1


In [3]:
# Example 1: narrative clinical case text, fed to the NER pipeline further down
# as a quick manual check.
# NOTE(review): this is a regular (non-raw) string, so each `\%` is an
# unrecognised escape sequence: Python keeps the backslash in the text and
# recent versions warn about it. Confirm the backslashes are intended input.
example1 = """
A 74-year-old man underwent primary percutaneous coronary intervention (pPCI) for a completely occluded proximal left anterior descending artery. During the procedure, microvascular flow distal to the block could not be established (no-reflow phenomenon) and was managed with diltiazem and nicorandil. A follow-up echocardiogram performed on the next day revealed an intraparietal echolucent apicolateral neocavitation corresponding to an intramyocardial haematoma (IMH) along with a reduced ejection fraction (EF) of 30\% compared with 45\% at the time of intervention. At 1 month follow-up, he developed congestive heart failure with a further decline in EF to 25\%. Although the IMH was resolving, the left ventricular apex had ballooned out. At 3 months, he was rehospitalised with a storm of ventricular tachycardia, further deterioration in EF with an apical aneurysm, and eventually passed away.
"""

# Example 2: the same case as example1 written as a markdown-formatted
# discharge summary with placeholder fields ([Patient Name], [Date], ...).
# Some lines end in two trailing spaces; they are part of the input text.
example2 = """
**Discharge Summary**

**Patient Information:**
- **Name:** [Patient Name]
- **Age:** 74 years
- **Gender:** Male

**Admission Date:** [Admission Date]  
**Discharge Date:** [Discharge Date]  
**Attending Physician:** [Physician Name]  

**Diagnosis:**
- Complete occlusion of proximal left anterior descending artery
- Intramyocardial hematoma (IMH)
- Congestive heart failure
- Ventricular tachycardia

**Procedure:**
The patient underwent primary percutaneous coronary intervention (pPCI) for a completely occluded proximal left anterior descending artery. During the procedure, the no-reflow phenomenon was encountered, and management included administration of diltiazem and nicorandil.

**Post-Procedure Findings:**
A follow-up echocardiogram conducted the day after the procedure indicated the presence of an intraparietal echolucent apicolateral neocavitation, consistent with an intramyocardial hematoma (IMH). The patient's ejection fraction (EF) was noted to be 30%, a decline from 45% recorded during the intervention.

**Follow-Up:**
At the one-month follow-up, the patient developed congestive heart failure, with further deterioration in EF to 25%.Although the IMH showed signs of resolution, there was notable ballooning of the left ventricular apex.

**Rehospitalization:**
At three months post-procedure, the patient was rehospitalized due to a storm of ventricular tachycardia and additional decline in EF, accompanied by the development of an apical aneurysm.

**Outcome:**
Despite medical interventions, the patient ultimately passed away.

**Recommendations:**
- Continued monitoring and management of cardiac function.
- Consideration for advanced heart failure therapies.

**Prepared by:** [Your Name]  
**Date:** [Date]
"""

In [4]:
import pandas as pd
import os
from src.data import files_to_df

# Input directories, relative to the working directory the notebook runs from.
# Original clinical-case text files:
SOURCE_PATH_ORIG = "data/1_original/txt"
# Generated counterparts of those files (presumably discharge summaries from a
# two-step GPT-4o-mini transformation, going by the path name — TODO confirm):
SOURCE_PATH_GEN = "data/2_generated/2step_transformation_dt4h_GPT4omini/en"

def create_pairs(path_orig, path_gen, n_expected=None):
    """Pair each original text with its generated counterpart by file id.

    Both directories are loaded with ``files_to_df``, which is expected to
    return a frame with ``"filenameid"`` and ``"text"`` columns. The
    ``"_transformed_step1"`` marker is removed from the generated file ids so
    they match the original ids, then the two frames are merged on
    ``"filenameid"``. The merge uses pandas' default inner join, so files
    without a counterpart are dropped.

    Args:
        path_orig: directory with the original texts.
        path_gen: directory with the generated texts.
        n_expected: optional number of pairs the merge must produce. ``None``
            (default) disables the check.

    Returns:
        DataFrame with one row per matched pair; the text columns are named
        ``"clinical_case"`` (original) and ``"discharge_summary"`` (generated).

    Raises:
        ValueError: if ``n_expected`` is given and the number of pairs differs.
    """
    df_gen = files_to_df(path_gen)
    # regex=False: the marker is a literal substring, independent of the
    # pandas version's default for `regex`.
    df_gen["filenameid"] = df_gen["filenameid"].str.replace("_transformed_step1", "", regex=False)

    df_orig = files_to_df(path_orig)
    df_pairs = df_orig.merge(df_gen, on="filenameid", suffixes=("_orig", "_gen"))

    # The inner join discards unmatched files without notice; let the caller
    # opt in to a hard check on the resulting size.
    if n_expected is not None and len(df_pairs) != n_expected:
        raise ValueError(f"Expected {n_expected} samples, got {len(df_pairs)}")

    return df_pairs.rename(columns={"text_orig": "clinical_case", "text_gen": "discharge_summary"})

# One row per file id present in both directories, with the original text in
# "clinical_case" and the generated text in "discharge_summary".
df_pairs = create_pairs(SOURCE_PATH_ORIG, SOURCE_PATH_GEN)

In [5]:
# Push both example texts through the three stages, one stage at a time:
#   1. `pipe`                      -> raw per-token predictions
#   2. `process_ner_results`       -> tokens with readable labels
#   3. `assemble_ner_output_flat`  -> merged entity spans
ner_results_1, ner_results_2 = (pipe(sample) for sample in (example1, example2))

proc_results_1, proc_results_2 = (
    process_ner_results(raw) for raw in (ner_results_1, ner_results_2)
)

results_1, results_2 = (
    assemble_ner_output_flat(tokens) for tokens in (proc_results_1, proc_results_2)
)

In [6]:
# Merged entities found in example1 (the saved output is cut off part-way).
results_1

[{'label': 'PROCEDURE',
  'text': 'primary percutaneous coronary intervention',
  'start': 29,
  'end': 71,
  'score': 0.988441675901413},
 {'label': 'PROCEDURE',
  'text': 'pPCI',
  'start': 73,
  'end': 77,
  'score': 0.9450855453809103},
 {'label': 'PROCEDURE',
  'text': 'for',
  'start': 79,
  'end': 82,
  'score': 0.5009878873825073},
 {'label': 'DISEASE',
  'text': 'a completely occluded proximal left anterior descending artery',
  'start': 83,
  'end': 145,
  'score': 0.8819515976038846},
 {'label': 'MEDICATION',
  'text': 'diltiazem',
  'start': 277,
  'end': 286,
  'score': 0.998613715171814},
 {'label': 'MEDICATION',
  'text': 'nicorandil',
  'start': 291,
  'end': 301,
  'score': 0.9987502992153168},
 {'label': 'PROCEDURE',
  'text': 'echocardiogram',
  'start': 315,
  'end': 329,
  'score': 0.9991068482398987},
 {'label': 'DISEASE',
  'text': 'intramyocardial haematoma',
  'start': 440,
  'end': 465,
  'score': 0.9863511323928833},
 {'label': 'DISEASE',
  'text': 'IMH',
  '

In [7]:
# Merged entities found in example2 (the saved output is cut off part-way).
results_2

[{'label': 'DISEASE',
  'text': 'Complete occlusion of proximal left anterior descending artery',
  'start': 256,
  'end': 318,
  'score': 0.93415536663749},
 {'label': 'DISEASE',
  'text': 'Intramyocardial hematoma',
  'start': 321,
  'end': 345,
  'score': 0.996253527700901},
 {'label': 'DISEASE',
  'text': 'IMH',
  'start': 347,
  'end': 350,
  'score': 0.85341015458107},
 {'label': 'DISEASE',
  'text': 'Congestive heart failure',
  'start': 354,
  'end': 378,
  'score': 0.997419810295105},
 {'label': 'DISEASE',
  'text': 'Ventricular tachycardia',
  'start': 381,
  'end': 404,
  'score': 0.9972559114297231},
 {'label': 'PROCEDURE',
  'text': 'primary percutaneous coronary intervention',
  'start': 443,
  'end': 485,
  'score': 0.9841527839501699},
 {'label': 'PROCEDURE',
  'text': 'pPCI',
  'start': 487,
  'end': 491,
  'score': 0.9745802283287048},
 {'label': 'PROCEDURE',
  'text': 'for',
  'start': 493,
  'end': 496,
  'score': 0.7383769154548645},
 {'label': 'DISEASE',
  'text':

In [8]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

def ner_outputs(text):
    """Return the distinct (label, text) entity pairs found in ``text``.

    Runs the module-level ``pipe`` on the text, converts the predictions with
    ``process_ner_results`` and merges them with ``assemble_ner_output_flat``.

    Args:
        text: the document to annotate.

    Returns:
        DataFrame with the columns ``"label"`` and ``"text"``, duplicates
        removed. It is empty, but still has both columns, when no entity is
        found.
    """
    raw_tokens = pipe(text)
    labelled_tokens = process_ner_results(raw_tokens)
    entities = assemble_ner_output_flat(labelled_tokens)

    # `columns=` selects the two fields and also defines them for an empty
    # result; indexing an empty frame with [["label", "text"]] raises KeyError.
    return pd.DataFrame(entities, columns=["label", "text"]).drop_duplicates()

def ner_output_dataset(df, col_text_name, fileid_col="filenameid"):
    """Extract the distinct entities of every document in ``df``.

    Args:
        df: DataFrame with one document per row.
        col_text_name: name of the column holding the document text.
        fileid_col: name of the column holding the document id.

    Returns:
        DataFrame with the columns ``"label"``, ``"text"`` and
        ``"filenameid"``, one row per distinct entity per document. The id
        column is always called ``"filenameid"``, whatever ``fileid_col`` is.
        An empty ``df`` yields an empty frame with those columns.
    """
    ls_doc_ents = []
    # The progress total comes from the frame actually being iterated, not
    # from a notebook-level variable.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        text = row[col_text_name]
        filenameid = row[fileid_col]
        df_doc = ner_outputs(text)
        df_doc["filenameid"] = filenameid
        ls_doc_ents.append(df_doc)

    if not ls_doc_ents:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame(columns=["label", "text", "filenameid"])

    return pd.concat(ls_doc_ents, ignore_index=True)
        

# Distinct entities per document, once for the original clinical cases and
# once for the paired discharge summaries (same rows of df_pairs, different
# text column).
df_cc_ents = ner_output_dataset(df_pairs, "clinical_case")
df_ds_ents = ner_output_dataset(df_pairs, "discharge_summary")

  1%|█                                                                                                                                                      | 7/1000 [00:00<00:15, 63.69it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:15<00:00, 65.55it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:15<00:00, 64.47it/s]


In [9]:
# Entity count per label in the clinical cases, followed by the grand total.
ents_dist_cc = df_cc_ents["label"].value_counts()
total_cc = ents_dist_cc.sum()
print(ents_dist_cc)
print("TOTAL:", total_cc)

label
PROCEDURE     11338
DISEASE       10090
SYMPTOM        9889
MEDICATION     2135
Name: count, dtype: int64
TOTAL: 33452


In [10]:
# Entity count per label in the discharge summaries, followed by the grand total.
ents_dist_ds = df_ds_ents["label"].value_counts()
total_ds = ents_dist_ds.sum()
print(ents_dist_ds)
print("TOTAL:", total_ds)

label
PROCEDURE     10790
DISEASE       10253
SYMPTOM        9255
MEDICATION     1982
Name: count, dtype: int64
TOTAL: 32280


In [11]:
# How often each DISEASE mention string occurs across the clinical cases,
# most frequent first.
disease_rows = df_cc_ents["label"] == "DISEASE"
disease_mentions = df_cc_ents.loc[disease_rows, "text"]
disease_mentions.value_counts().sort_values(ascending=False)

text
heart failure                        227
hypertension                         169
cardiomegaly                         110
atrial fibrillation                   85
cardiogenic shock                     65
                                    ... 
-se                                    1
coronary steal syndrome from           1
nephritic syndrome                     1
generalised tonic-clonic seizures      1
Severe lice infestation                1
Name: count, Length: 5123, dtype: int64

In [12]:
from sentence_transformers import SentenceTransformer
from src.search_index import FaissIndex

# Local snapshot directory of the SapBERT checkpoint used to embed entity
# mentions and gazetteer terms.
SAPBERT_DIR = "/gpfs/projects/bsc14/abecerr1/hub/models--cambridgeltl--SapBERT-from-PubMedBERT-fulltext-mean-token/snapshots/9f95c2e962719c70f25bf7a1f33bd8d9e9448750"

# Loaded on the hard-coded device "cuda:1".
model_st = SentenceTransformer(SAPBERT_DIR, device="cuda:1")

No sentence-transformers model found with name /gpfs/projects/bsc14/abecerr1/hub/models--cambridgeltl--SapBERT-from-PubMedBERT-fulltext-mean-token/snapshots/9f95c2e962719c70f25bf7a1f33bd8d9e9448750. Creating a new one with mean pooling.


In [13]:
# Entity label -> path of the gazetteer TSV used to link mentions of that label.
d_gazetteers = dict(
    PROCEDURE="data/4_gazetteers/en/english_procedures_gazetteer.tsv",
    DISEASE="data/4_gazetteers/en/english_diseases_gazetteer.tsv",
    MEDICATION="data/4_gazetteers/en/english_medications_gazetteer.tsv",
    SYMPTOM="data/4_gazetteers/en/english_symptoms_gazetteer.tsv",
)

def link_ents_to_gazetteers(df, ls_label, model, k=10):
    """Attach the nearest gazetteer entries to every entity mention.

    For each label, a ``FaissIndex`` is built over that label's gazetteer
    (path taken from ``d_gazetteers``), the mentions are embedded with
    ``model`` and the ``k`` nearest gazetteer entries are looked up.

    Args:
        df: DataFrame with at least the columns ``"label"`` and ``"text"``.
        ls_label: labels to process; each must be a key of ``d_gazetteers``.
        model: embedding model, used both to build the index and to encode
            the mentions (must provide ``encode``).
        k: number of nearest gazetteer entries to retrieve per mention.

    Returns:
        DataFrame with the rows of ``df`` for the requested labels (grouped in
        ``ls_label`` order, index reset) plus two list-valued columns:
        ``"codes"`` and ``"terms"`` of the retrieved gazetteer entries.

    Raises:
        KeyError: if a label has no entry in ``d_gazetteers``.
    """
    ls_out = []
    for label in ls_label:
        print("LABEL:", label)
        gazetteer_path = d_gazetteers[label]
        df_label = df[df.label == label].copy()

        if df_label.empty:
            # No mentions to embed or search; keep the output schema
            # consistent and skip building the index.
            df_label["codes"] = []
            df_label["terms"] = []
            ls_out.append(df_label)
            continue

        # Use the model passed in by the caller for both the index and the
        # queries, rather than a notebook-level global.
        faiss_index = FaissIndex(model=model, gazetteer_path=gazetteer_path, random_seed=0)
        faiss_index.generate_search_index()
        query_embs = model.encode(df_label["text"].tolist())
        _, neighbour_ids = faiss_index.search(query_embs, k=k)

        # One list of k codes / k terms per mention.
        df_label["codes"] = [
            [faiss_index.get_code_by_index(i) for i in row] for row in neighbour_ids
        ]
        df_label["terms"] = [
            [faiss_index.get_term_by_index(i) for i in row] for row in neighbour_ids
        ]

        ls_out.append(df_label)

    return pd.concat(ls_out, ignore_index=True)

# Link the clinical-case entities of all four labels to their gazetteers.
df_cc_ents_gazetteers = link_ents_to_gazetteers(df_cc_ents, ["DISEASE", "MEDICATION", "PROCEDURE", "SYMPTOM"], model_st)

        

LABEL: DISEASE


Batches:   0%|          | 0/5892 [00:00<?, ?it/s]

LABEL: MEDICATION


Batches:   0%|          | 0/3232 [00:00<?, ?it/s]

LABEL: PROCEDURE


Batches:   0%|          | 0/7141 [00:00<?, ?it/s]

LABEL: SYMPTOM


Batches:   0%|          | 0/7099 [00:00<?, ?it/s]

In [14]:
# Preview of the linked clinical-case entities (the display is truncated).
df_cc_ents_gazetteers

Unnamed: 0,label,text,filenameid,codes,terms
0,DISEASE,Amyloid light-chain amyloidosis,33175723_1,"[23132008, 23132008, 23132008, 426598005, 2749...","[Amyloid light-chain amyloidosis, Primary amyl..."
1,DISEASE,cardiac amyloidosis,33175723_1,"[16573007, 16573007, 17602002, 1187540008, 118...","[Cardiac amyloidosis, Senile cardiac amyloidos..."
2,DISEASE,cardiomyopathy,33175723_1,"[85898001, 35728003, 111285003, 89461002, 3990...","[Cardiomyopathy, Familial cardiomyopathy, Meta..."
3,DISEASE,heart failure,33175723_1,"[84114007, 84114007, 84114007, 42343007, 42343...","[Heart failure, Cardiac failure, HF - Heart fa..."
4,DISEASE,primary AL amyloidosis,33175723_1,"[23132008, 128817004, 190923000, 274945004, 56...","[AL amyloidosis, Primary amyloidosis, Sporadic..."
...,...,...,...,...,...
33447,SYMPTOM,Pul,37861254,"[233604007, 19829001, 205237003, 91434003, 846...","[Pneumonia, Pulmonary disease, Pneumonitis, Pu..."
33448,SYMPTOM,test and,37861254,"[252014003, 226219004, 68193004, 251644002, 25...","[Urethral test observation, Test diet, Thomas ..."
33449,SYMPTOM,results were normal,37861254,"[168500000, 408573005, 165324008, 312969002, 1...","[Radiology result normal, Imaging result norma..."
33450,SYMPTOM,enlarged main pulmonary arteries,37861254,"[93059006, 251047005, 93059006, 194892009, 194...","[Pulmonary artery dilatation, Dilatation of pu..."


In [15]:
# Same linking for the discharge-summary entities. The call builds a fresh
# search index per label, so the gazetteers are embedded a second time here.
df_ds_ents_gazetteers = link_ents_to_gazetteers(df_ds_ents, ["DISEASE", "MEDICATION", "PROCEDURE", "SYMPTOM"], model_st)


LABEL: DISEASE


Batches:   0%|          | 0/5892 [00:00<?, ?it/s]

LABEL: MEDICATION


Batches:   0%|          | 0/3232 [00:00<?, ?it/s]

LABEL: PROCEDURE


Batches:   0%|          | 0/7141 [00:00<?, ?it/s]

LABEL: SYMPTOM


Batches:   0%|          | 0/7099 [00:00<?, ?it/s]

In [16]:
# Preview of the linked discharge-summary entities (the display is truncated).
df_ds_ents_gazetteers

Unnamed: 0,label,text,filenameid,codes,terms
0,DISEASE,Amyloid light-chain amyloidosis,33175723_1,"[23132008, 23132008, 23132008, 426598005, 2749...","[Amyloid light-chain amyloidosis, Primary amyl..."
1,DISEASE,Cardiac amyloidosis,33175723_1,"[16573007, 16573007, 17602002, 1187540008, 118...","[Cardiac amyloidosis, Senile cardiac amyloidos..."
2,DISEASE,Cardiomyopathy,33175723_1,"[85898001, 35728003, 111285003, 89461002, 3990...","[Cardiomyopathy, Familial cardiomyopathy, Meta..."
3,DISEASE,Heart failure,33175723_1,"[84114007, 84114007, 84114007, 42343007, 42343...","[Heart failure, Cardiac failure, HF - Heart fa..."
4,DISEASE,Primary AL amyloidosis,33175723_1,"[23132008, 128817004, 190923000, 274945004, 56...","[AL amyloidosis, Primary amyloidosis, Sporadic..."
...,...,...,...,...,...
32275,SYMPTOM,and right ventricle,37861254,"[301097002, 109425008, 448600002, 448103004, 2...","[Finding of right ventricle, Single right vent..."
32276,SYMPTOM,increased right ventricular systolic pressure,37861254,"[416158002, 461321006, 18050000, 56218007, 180...","[Right ventricular systolic dysfunction, Right..."
32277,SYMPTOM,: Normal,37861254,"[386549008, 225544001, 81323004, 125112009, 53...","[Normal appearance, Skin normal, Normal patien..."
32278,SYMPTOM,Enlarged main pulmonary arteries,37861254,"[93059006, 251047005, 93059006, 194892009, 194...","[Pulmonary artery dilatation, Dilatation of pu..."


In [17]:
# Write both linked-entity tables as tab-separated files without the index.
# NOTE(review): "codes" and "terms" hold Python lists, so they presumably end
# up in the TSV as their string representation — verify how readers parse them.
for ents_table, tsv_path in (
    (df_cc_ents_gazetteers, "nbs/evaluation/automatic/cardioner_entities/NOIOB_EN_clinical_case_ents.tsv"),
    (df_ds_ents_gazetteers, "nbs/evaluation/automatic/cardioner_entities/NOIOB_EN_discharge_summary_ents.tsv"),
):
    ents_table.to_csv(tsv_path, sep="\t", index=False)