## analyze the output of the results

In [1]:
import os
os.environ['CUDA_DEVICE_ORDER'] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "3"

In [2]:
import numpy as np
import pandas as pd
import torch
from utils import *

config = load_config()
PROJECT_PATH = config.project_path
DATA_PATH = PROJECT_PATH.joinpath('data/processed')

In [3]:
# set model path

MISTRAL7B = config.model_path('mistral7b')
BIOMISTRAL7B = config.model_path('biomistral7b')
MISTRAL7B_FINETUNED = config.model_path('mistral7b_finetuned')
BIOMISTRAL7B_FINETUNED = config.model_path('biomistral7b_avigon_finetuned')
MISTRAL7B_MIMIC_FINETUNED = config.model_path('mistral7b_mimic_finetuned')
BIOMISTRAL7B_MIMIC_FINETUNED = config.model_path('biomistral7b_mimic_finetuned')


In [4]:
# get example 

cv5, top10_dataset, filtered_notes = pd.read_pickle(DATA_PATH.joinpath("cv_processed_ranking_datasets.pkl"))

In [7]:
# need to filter out only test dataset that were not in the train set
# 1. let's filter out the test dataset

wholedataset = set(top10_dataset.fileid.unique().tolist())
trainset = set(cv5[0])
testset = list(wholedataset - trainset)

In [15]:
import random
random.seed(1)

sample_files = random.sample(testset, 3)
print(sample_files)

['diabetes.report336006.txt', 'heart_failure.report51414.txt', 'liver_failure.report103978.txt']


In [21]:
# load prompts and format data
from datasets import Dataset

def process_texts(samples, template) :

    texts = samples['text']
    formated_texts = []
    for text in texts :
        new_text = template.format(
            context = text
        )
        formated_texts.append(new_text)
    
    return {"questions" : formated_texts}

In [22]:
# process dataset with questions

dataset = Dataset.from_pandas(filtered_notes)
template = config.template("top5", "zeroshot")
dataset = dataset.map(process_texts, batched=True, fn_kwargs ={"template" : template})
dataset = dataset.to_pandas()

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

In [None]:
# return text for input for model
def return_text(sample_file) :
    text = dataset[dataset.noteid == sample_file]['questions'].values.tolist()[0]
    return text

# process ranking
def process_ranking(sample_file) :
    ranking = top10_dataset[top10_dataset.fileid == sample_file]['ranking']
    return ranking

# process notes
def process_notes(sample_file) :
    notes = filtered_notes[filtered_notes.noteid == sample_file]['text']
    notes = notes.values.tolist()[0]
    return notes

In [24]:
# process gold label data
def process_gold_label(sample_file) :
    temp = top10_dataset[top10_dataset.fileid == sample_file][['phrase', 'ranking']].sort_values(by="ranking")
    gold_answer = ""
    for _, row in temp.iterrows() :
        gold_answer += str(row['ranking']) + ' ' + row['phrase'] + '\n'
    return gold_answer

In [11]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from peft.peft_model import PeftModel

DEFAULT_PAD_TOKEN = "<pad>"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

def load_model(model_path, finetune_flag=False):

    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
    if finetune_flag :
        model = PeftModel.from_pretrained(model, model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    tokenizer.add_special_tokens({
                "eos_token": DEFAULT_EOS_TOKEN,
                "bos_token": DEFAULT_BOS_TOKEN,
                "unk_token": DEFAULT_UNK_TOKEN,
            })
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


def infer_answers(text, model, tokenizer) :
    input_ids = tokenizer(text, return_tensors="pt").input_ids.cuda()
    print("length of input ids are : ", len(input_ids))
    output_ids = model.generate(input_ids=input_ids, 
                                max_new_tokens = 200)
    
    arr_output = output_ids.detach().cpu().numpy()
    start_of_generate_index = input_ids.shape[1]
    pred_output = tokenizer.batch_decode(arr_output[:, start_of_generate_index:], skip_special_tokens=True)[0]

    return pred_output


In [12]:
# test inference

model, tokenizer = load_model(BIOMISTRAL7B, False)
answers = infer_answers(text=text, model=model, tokenizer=tokenizer)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


length of input ids are :  1


In [13]:
print(answers)

1. Nonischemic cardiomyopathy
1.1 Persantine thallium stress test
2. GERD
2.1 EGD
3. Fatty liver
3.1 Hepatitis B and C serologies
3.2 Ferritin
3.3 Iron saturation
3.4 Hepatitis B and C serologies
3.5 Ceruloplasmin
4. Hives
4.1 EpiPen
5. Sinus polyposis
5.1 Polypectomy
6. Asthma
6.1 Dr. name
7. Aspirin allergy
7.1 Aspirin suppression
8. Knee pain
8.1 ER visit
8.2 X-rays
9. Left bicipital tendon tear
9.1 Dent or bulge
10. Colorectal cancer screening
11.


In [14]:
# test inference

model, tokenizer = load_model(BIOMISTRAL7B_FINETUNED, False)
answers = infer_answers(text=text, model=model, tokenizer=tokenizer)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


length of input ids are :  1


In [15]:
print(answers)

1.0 nonischemic cardiomyopathy
1.1 beta-blockers
1.2 ace inhibitors
2.0 hypertensive
2.1 lipidation
3.0 hepatitis
3.1 ferritin
3.2 iron saturation
4.0 hepocromopathy
5.0 hepatitis b and c serologies



In [12]:
# test inference

model, tokenizer = load_model(BIOMISTRAL7B_MIMIC_FINETUNED, False)
answers = infer_answers(text=text, model=model, tokenizer=tokenizer)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


length of input ids are :  1


In [13]:
print(answers)

1. Nonischemic cardiomyopathy
1.1. Persantine thallium test
1.2. Echocardiogram
2. GERD
2.1. Proton pump inhibitor (PPI) therapy
2.2. Endoscopy
3. Fatty liver
3.1. Iron saturation
3.2. Ferritin levels
4. Hives
4.1. Antihistamines
5. Sinus polyposis
5.1. Nasal steroids
6. Asthma
6.1. Inhaled corticosteroids (ICS)
7. Aspirin allergy
7.1. Aspirin suppression
8. Knee pain
8.1. X-rays
8.2. MRI of the knee
9. Bicipital tendon tear
9.1. Physical therapy (PT)

