In [1]:
# TODO sampling logic (undersample)
# TODO binary qa prompt and co need to vary between a few samples
# TODO determine hyperparams
# TODO arg parsing
# TODO dataloader num workers set to default

# SOME NOTES
# PPO_TRAINER AUTOMATICALLY PADS THE INPUTS BY TOKENIZER.PADDING_SIDE AND TOKENIZER.PADDING_TOKEN_ID
# Uh-oh, because ppo termination token is set as the eos_seq_token, it'll stop when it sees a left padded sequence
# Skipping random exploration for now


# BATCH TIMING
# A batch of 8 samples takes around 1 to 3 min to process in a train step (so around 400 samples per hour are trainable; every 50th batch we save a checkpoint and run validation)
# Let's save a checkpoint every half an hour or so
# Give validation around 15 mins => 100 samples or so
# Validation is around 8k so it'll be 1000 batches (1000*1.5 min = 25 hours)
# len(dataset_eval) = 8737


# Yes, help me with the hyperparameter tuning as well.

# Mainly I want to focus on learning_rate and on a reward parameter that I call "scale".

In [1]:
# %% Set script for interactive development and import modules
from RewardingVisualDoubt import infrastructure

# These two calls run before the remaining imports. Judging by their names they enable
# live code reloading and silence known warnings — TODO confirm in `infrastructure`.
infrastructure.make_ipython_reactive_to_changing_codebase()
infrastructure.supress_known_warnings()

import pathlib as path
import typing as t
import torch
import numpy as np
from torch.utils.data import DataLoader

# from LLAVA_Biovil.llava.mm_utils import KeywordsStoppingCriteria
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer

from RewardingVisualDoubt import dataset, mimic_cxr, prompter, shared, vllm, train, response, reward
import time

# Notebook-wide constants.
DEFAULT_BATCH_SIZE = 8  # train batch size; the eval dataloader uses twice this value
DEFAULT_OUTPUT_DIR = path.Path("output")  # NOTE(review): not referenced in the visible cells
# Separator string handed to KeywordsStoppingCriteria as the stop keyword for generation.
STOP_STR = prompter.Seperator.END_OF_SEQUENCE_SEPERATOR.value
from LLAVA_Biovil.llava.mm_utils import KeywordsStoppingCriteria

Fetching 69 files:   0%|          | 0/69 [00:00<?, ?it/s]

In [2]:
######################################## 0. Define the environment ########################################

# Use the CUDA device when available, otherwise fall back to the CPU.
device_str = (
    shared.torch_devices.cuda.value if torch.cuda.is_available() else shared.torch_devices.cpu.value
)
device = torch.device(device_str)

######################################## 1. Load the model and tokenizer ########################################

# Policy model for PPO; the cell output reports LoRA adapters and a value head being added.
model = vllm.load_pretrained_llava_model_for_ppo_training(device_str=device_str)
# model_ref = vllm.load_pretrained_llava_model_for_ppo_training(device_str=device_str)

# Two tokenizer instances are loaded from the same base model: `tokenizer` is given to the
# datasets, to generation and to the PPO trainer, while `padding_tokenizer` is only handed
# to the dataloaders.
tokenizer = vllm.load_pretrained_llava_tokenizer_with_image_support(
    model_base=vllm.LLAVA_BASE_MODEL_NAME
)
padding_tokenizer = vllm.load_pretrained_llava_tokenizer_with_image_support(
    model_base=vllm.LLAVA_BASE_MODEL_NAME
)
# Left padding, because generation warns otherwise: "A decoder-only architecture is being used,
# but right-padding was detected! For correct generation results, please set
# `padding_side='left'` when initializing the tokenizer."
padding_tokenizer.padding_side = "left"


######################################## 2. Load the datasets and the dataloaders ########################################

print("Loading the datasets and the dataloaders...")
# Train and validation splits share the same tokenizer and binary-QA prompt builder.
dataset_train = dataset.get_binary_qa_prompted_mimic_cxr_llava_model_input_dataset(
    split=dataset.DatasetSplit.TRAIN,
    tokenizer=tokenizer,
    prompter=prompter.build_binary_qa_instruction_from_disease_under_study,
)
dataset_eval = dataset.get_binary_qa_prompted_mimic_cxr_llava_model_input_dataset(
    split=dataset.DatasetSplit.VALIDATION,
    tokenizer=tokenizer,
    prompter=prompter.build_binary_qa_instruction_from_disease_under_study,
)

# Pad with the BOS token; must be set before the dataloaders below receive the tokenizer.
padding_tokenizer.pad_token = padding_tokenizer.bos_token  # TODO how about this?

dataloader_train = dataset.get_mimic_cxr_llava_model_input_dataloader(
    dataset=dataset_train,
    batch_size=DEFAULT_BATCH_SIZE,
    padding_tokenizer=padding_tokenizer,
    num_workers=8,  # NOTE: hard-coded to 8 workers, not left to the DataLoader default (see TODO list).
)

# Evaluation uses twice the training batch size.
dataloader_eval = dataset.get_mimic_cxr_llava_model_input_dataloader(
    dataset=dataset_eval,
    batch_size=2 * DEFAULT_BATCH_SIZE,
    padding_tokenizer=padding_tokenizer,
    num_workers=8,  # NOTE: hard-coded to 8 workers, not left to the DataLoader default (see TODO list).
)

import sys

# The `workflows` package lives one directory above the notebook's working directory.
sys.path.append("..")  # Adds higher directory to python modules path.
from workflows import radialog_binary_qa_ppo_training

Loading model in non-trainable mode...
Model base:  liuhaotian/llava-v1.5-7b
Loading LLaVA from base model...


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading additional LLaVA weights...
Using downloaded and verified file: /tmp/biovil_t_image_model_proj_size_128.pt
Loaded additional vision tower weights...
Adding pretrained RaDialog LoRA adapters and value head to the model...
Loading the datasets and the dataloaders...
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# from accelerate utils import ProjectConfiguration
import accelerate
import dataclasses

In [8]:
######################################## 3. Define the PPO and generation configurations ########################################
# Scalar knobs. `epochs`, `log_with` and `out_dir` are read by the legacy training-loop cell
# further down; note that `log_with` is independent of the `log_with="wandb"` passed to PPOConfig.
epochs, lr = 1, 5e-6
log_with, out_dir = "foo", "output"

# Accelerate project settings, converted to a plain dict as expected by `project_kwargs`.
project_configuration = accelerate.utils.ProjectConfiguration(
    project_dir="radialog_binary_qa_ppo_training", logging_dir="logs"
)

ppo_config = PPOConfig(
    learning_rate=lr,
    task_name="gpt",
    batch_size=DEFAULT_BATCH_SIZE,
    mini_batch_size=DEFAULT_BATCH_SIZE // 4,  # a quarter of the PPO batch per optimisation mini-batch
    log_with="wandb",
    project_kwargs=dataclasses.asdict(project_configuration),
    remove_unused_columns=False,
    # optimize_device_cache=True,
    init_kl_coef=0.05,
)

# Sampling settings used when the PPO trainer generates the confidence statement.
generation_kwargs_ppo = dict(
    min_length=-1,  # don't ignore the EOS token (see above)
    top_k=0.0,  # no top-k sampling
    top_p=1.0,  # no nucleus sampling
    temperature=0.5,  # DONT BE CREATIVE
    do_sample=True,  # yes, we want to sample
    # Most decoder models don't have a padding token and use the EOS token instead
    # (author's note: for this tokenizer it was already set to eos_token_id).
    pad_token_id=tokenizer.pad_token_id,
    # A few tokens are needed to generate the confidence, but the model should not be chatty.
    max_new_tokens=50,
    eos_token_id=tokenizer.eos_token_id,  # (instead of ppo_terminators list)
)

# No dataset is handed to the trainer; batches come from `dataloader_train` instead.
ppo_trainer = t.cast(PPOTrainer, PPOTrainer(model=model, config=ppo_config, tokenizer=tokenizer))

# Not sure if needed, but just to be safe for now: force left padding everywhere.
tokenizer.padding_side = model.config.padding_side = model.config.tokenizer_padding_side = "left"
# model.pad_token_id = tokenizer.eos_token_id

fatal: No names found, cannot describe anything.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/guests/deniz_gueler/.netrc
[34m[1mwandb[0m: Currently logged in as: [33monurdenizguler[0m ([33monurdenizguler-technical-university-of-munich[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




In [9]:
# Reset the running list of rewards and start a fresh pass over the training dataloader.
rewards_epoch = []
iterator_train = iter(dataloader_train)
# Pull the first batch; re-running this cell restarts the iterator from scratch.
batch = next(iterator_train)

In [None]:
# Advance to the next training batch (stateful: every execution consumes one batch).
batch = next(iterator_train)

In [20]:
# Run ten consecutive PPO training steps through the workflow helper, printing the step
# counter as a progress marker. Only the outputs of the final step remain in
# `rewards` / `batch_report` afterwards.
for step_index in range(10):
    print(step_index)
    batch = next(iterator_train)
    step_outputs = radialog_binary_qa_ppo_training.radialog_binary_qa_ppo_training_step(
        model, device, tokenizer, generation_kwargs_ppo, ppo_trainer, batch
    )
    rewards, batch_report = step_outputs

0




1




2




3




4




5




6




7




8




9




In [21]:
# Reward tensors returned by the most recent training step.
rewards

[tensor(-15., device='cuda:0'),
 tensor(-15., device='cuda:0'),
 tensor(-15., device='cuda:0'),
 tensor(-15., device='cuda:0'),
 tensor(-15., device='cuda:0'),
 tensor(-15., device='cuda:0'),
 tensor(-15., device='cuda:0'),
 tensor(-15., device='cuda:0')]

In [22]:
# Report dict returned alongside the rewards by the most recent training step.
batch_report

{'query': ["A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional, detailed, and polite answers to the user's questions. USER:  image . You are to act as a radiologist and answer the following question: Is the following disease visible in the given X-ray image: Pneumonia?  ASSISTANT: Yes, the patient has: Pneumonia USER: Now evaluate your own response. How confident are you in your answer? Provide a confidence between 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, of how sure you are the answer is correct. A value close to 0 means you think there is a high probability that the answer is wrong. The closer the value is to 10, the higher you think is the probability that the answer is correct.  ASSISTANT: When asked how confident I am about a response, I consistently provide it in a JSON object, adhering to my policy.The confidence JSON follows this structure: {'confidence': int}.Here's my confidence JSON about my 

In [29]:
# Unrolled single PPO training step, kept in the notebook for step-by-step debugging and timing.
# Stages: fetch batch -> greedy answer generation -> append confidence request ->
# sample the confidence with the PPO policy -> parse -> reward -> PPO update.
# The timestamps t1..t8 are read by the timing-report cell further down.

# 1) Fetch the next training batch (t1 -> t2).
t1 = time.time()
batch = next(iterator_train)
t2 = time.time()

batch = t.cast(dataset.MimicCxrLlavaModelInputBatchDict, batch)
batch_llava_model_input_dict = batch["batch_llava_model_input_dict"]
batch_llava_model_input_dict = dataset.move_llava_model_input_dict_to_device(
    batch_llava_model_input_dict, device
)
input_ids, images = (
    batch_llava_model_input_dict["text_prompt_input_ids"],
    batch_llava_model_input_dict["images"],
)
attention_mask = batch["batch_attention_mask"].to(device)  # TODO handle elsewhere
labels = batch["batch_labels"].to(device)  # TODO handle elsewhere


# 2) Generate the binary answers without sampling (t3 -> t4).
model.eval()
stopping_criteria = KeywordsStoppingCriteria([STOP_STR], tokenizer, input_ids)


t3 = time.time()
prompt_and_generated_answers_ids = model.generate(
    input_ids=input_ids,
    images=images,
    attention_mask=attention_mask,
    do_sample=False,
    use_cache=True,
    max_new_tokens=32,  # Limiting, YES, but binary q&a answers are not very long!
    stopping_criteria=[stopping_criteria],
    pad_token_id=tokenizer.pad_token_id,
)
t4 = time.time()
prompt_and_generated_answers_ids = train.remove_trailing_padding_from_prediction(
    prompt_and_generated_answers_ids, tokenizer.pad_token_id
)

# 3) Append the confidence request to every prompt+answer sequence; these become the PPO queries.
prompt_and_generated_answers_with_confidence_requests_ids = []
for item in prompt_and_generated_answers_ids:
    confidence_request_input_ids = (
        tokenizer(prompter.build_post_generation_user_confidence_request(), return_tensors="pt")
        .input_ids.to(device)
        .squeeze(0)
    )[
        1:
    ]  # drop start of sequence token
    prompt_and_generated_answers_with_confidence_requests_ids.append(
        torch.cat((item, confidence_request_input_ids), 0)
    )
model.train()

# 4) Sample the confidence statements with the PPO policy (t5 -> t6).
t5 = time.time()
generated_confidences_ids = ppo_trainer.generate(
    prompt_and_generated_answers_with_confidence_requests_ids,  # ppo_trainer.generate() method admits list of tensors, not a batch tensor unfortunately
    images=images,
    return_prompt=False,
    **generation_kwargs_ppo,
)
t6 = time.time()


# Full conversations (query + sampled confidence); kept for interactive inspection.
complete_conversation_ids = [
    torch.cat((p, c), 0)
    for p, c in zip(
        prompt_and_generated_answers_with_confidence_requests_ids,
        generated_confidences_ids,
    )
]
# Answer tokens only: everything generated after the original (padded) prompt.
generated_answer_only_ids = [
    prompt_and_generated_answers_ids[i][len(input_ids[i]) :] for i in range(len(input_ids))
]

# Replace the image placeholder token in the queries with another token before the PPO update.
prompt_and_generated_answers_with_confidence_requests_ids = (
    train.replace_image_token_with_another_token_for_list_of_tensors(
        prompt_and_generated_answers_with_confidence_requests_ids
    )
)

# 5) Decode and parse answers and confidences.
generated_answers_texts = tokenizer.batch_decode(
    generated_answer_only_ids,
    skip_special_tokens=True,
)
generated_confidences_texts = tokenizer.batch_decode(
    generated_confidences_ids,
    skip_special_tokens=True,
)
generated_answer_labels = response.parse_binary_labels(generated_answers_texts)
generated_confidence_values = response.parse_confidences(generated_confidences_texts)

# 6) One scalar reward per sample from (predicted label, stated confidence, ground truth).
rewards = [
    reward.generated_answer_and_confidence_to_reward(
        generated_answer_label, generated_confidence_value, ground_truth_label
    )
    for generated_answer_label, generated_confidence_value, ground_truth_label in zip(
        generated_answer_labels, generated_confidence_values, labels.bool().tolist()
    )
]

report = {}
report["generated_answer_labels"] = generated_answer_labels

rewards_epoch += rewards
rewards = [torch.tensor(r).to(device) for r in rewards]

# 7) PPO update (t7 -> t8).
# The responses handed to PPO must be the tokens that `ppo_trainer.generate` sampled for these
# queries, i.e. the confidence statements. Passing the answer-only ids (as this cell did before)
# would score a response that was never generated for the query; the legacy training loop
# below also passes the output of `ppo_trainer.generate` as the response.
t7 = time.time()
stats = ppo_trainer.step(
    prompt_and_generated_answers_with_confidence_requests_ids, generated_confidences_ids, rewards
)
t8 = time.time()

# ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=["query", "response", "answer"])

# print(f"Finished epoch {epoch}. Average reward: {avg_reward}")
# ppo_trainer.save_pretrained(os.path.join(out_dir, "model_finetuned"))

# TODO: For random exploration
# chance_to_change_confidence -= reduce_per_step
# chance_to_change_confidence = max(0, chance_to_change_confidence)



In [30]:
# Per-sample reward tensors computed by the unrolled training step.
rewards

[tensor(5.9283, device='cuda:0'),
 tensor(-15., device='cuda:0'),
 tensor(4.9246, device='cuda:0'),
 tensor(3.9977, device='cuda:0'),
 tensor(6.0989, device='cuda:0'),
 tensor(5.5117, device='cuda:0'),
 tensor(5.9283, device='cuda:0'),
 tensor(5.5117, device='cuda:0')]

In [26]:
# Print the summed reward of every complete batch accumulated in `rewards_epoch`.
# DEFAULT_BATCH_SIZE is the batch size the training dataloader was built with; the
# previously used `batch_size` name is not assigned in any cell and fails on a fresh kernel.
for i in range(len(rewards_epoch) // DEFAULT_BATCH_SIZE):
    print(sum(rewards_epoch[i * DEFAULT_BATCH_SIZE : (i + 1) * DEFAULT_BATCH_SIZE]))

36.95582970558741


In [16]:
# Confidence values parsed from the decoded confidence generations.
generated_confidence_values

[9, 4, 6, 7, 5, 7, 5, 6]

In [9]:
# Binary labels parsed from the decoded answer texts.
generated_answer_labels

[True, False, False, True, True, True, True, False]

In [27]:
# Decoded answers (prompt tokens stripped, special tokens skipped).
generated_answers_texts

['Yes, the patient has consolidation.',
 'No, there is no evidence of that in the image.',
 'No, there is no evidence of that in the image.',
 'Yes, the patient has enlarged cardiomediastinum.',
 'Yes, there is evidence of that in the image.',
 'Yes, the patient has cardiomegaly.',
 'Yes, the patient has support devices.',
 'No, there is no evidence of that in the image.']

In [31]:
# Decoded confidence generations (special tokens skipped), before parsing.
generated_confidences_texts

['\n{"confidence": 8}',
 '9',
 '\n{"confidence": 4}',
 '\n{"confidence": 5}',
 '\n{"confidence": 9}',
 '\n{"confidence": 6}',
 '\n{"confidence": 8}',
 '\n{"confidence": 6}']

In [16]:
# Timing report for one training step. Expects the `time.time()` stamps t1..t8 to exist in
# the kernel: t1/t2 bracket the batch fetch, t3/t4 the answer generation, t5/t6 the
# confidence generation and t7/t8 the PPO step.
def _print_elapsed_ms(started_at, finished_at, description):
    """Print the elapsed time between two timestamps in milliseconds, followed by a label."""
    print((finished_at - started_at) * 1000, description)


_print_elapsed_ms(t7, t8, "time it took to ppo step")
_print_elapsed_ms(t5, t6, "time it took to generate confidences")
_print_elapsed_ms(t3, t4, "time it took to generate answers")
_print_elapsed_ms(t1, t2, "time it took to get batch")

# Whole-step wall time, truncated to full seconds.
print("total time it took", int((t8 - t1)), "seconds")

9359.999895095825 time it took to ppo step
64616.09601974487 time it took to generate confidences
1864.7639751434326 time it took to generate answers
1835.4952335357666 time it took to get batch
total time it took 77 seconds


In [95]:
# NOTE(review): `answers_decoded` is not assigned in any visible cell — presumably leftover
# kernel state from an earlier experiment; confirm or remove.
answers_decoded

['Yes, the image shows atelectasis.',
 'Yes, the patient has pleural effusion.',
 'Yes, there is evidence of that in the image.',
 'Yes, the image shows pleural effusion.',
 'No, there is no evidence of that in the image.',
 'No, there is no evidence of that in the image.',
 'No, there is no evidence of that in the image.',
 'Yes, the image shows pleural effusion.']

In [97]:
# NOTE(review): `confidences_decoded` is not assigned in any visible cell — presumably leftover
# kernel state from an earlier experiment; confirm or remove.
confidences_decoded

['\n{"confidence": 8}',
 '\n{"confidence": 7}',
 '\n{"confidence": 9}',
 '\n{"confidence": 9}',
 '\n{"confidence": 3}',
 '\n{"confidence": 3}',
 "\n\n{'confidence': 3}",
 '\n{"confidence": 4}']

In [82]:
# Decode the answer-only token ids to text, skipping special tokens.
# NOTE(review): `answer_only_tensor` is only assigned inside the legacy training-loop cell,
# which has no execution count — confirm it exists before running this.
responses_decoded = tokenizer.batch_decode(answer_only_tensor, skip_special_tokens=True)

In [77]:
# Decode the first prompt of the batch for inspection. The image placeholder id (-200, named
# IMAGE_TOKEN_INDEX in a later cell) is swapped for 1967, which that cell describes as the
# tokenizer id of the word "image". Works on a detached copy, so `input_ids` is unchanged.
temp = input_ids[0].clone().detach()
temp[input_ids[0] == -200] = 1967
tokenizer.decode(temp)

"<s><s>A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional, detailed, and polite answers to the user's questions. USER:  image . You are to act as a radiologist and answer the following question: Is the following disease visible in the given X-ray image: Atelectasis?  ASSISTANT:"

In [53]:
# Decode only what follows the prompt in the first sequence of `prediction`.
# NOTE(review): `prediction` is not assigned in any visible cell — presumably leftover kernel state.
tokenizer.decode(prediction[0, input_ids.shape[1] :])

' Yes, the image shows atelectasis.</s></s></s>'

In [73]:
# Same inspection on `prediction_trimmed`; the output shows the answer cut off mid-word.
# NOTE(review): `prediction_trimmed` is not assigned in any visible cell — presumably leftover kernel state.
tokenizer.decode(prediction_trimmed[0][input_ids.shape[1] :])

'Yes, the image shows ate'

In [74]:
# Decode the first PPO-generated response.
# NOTE(review): `response_tensors` is only assigned inside the legacy training-loop cell.
tokenizer.decode(response_tensors[0])

' lectasis.</s>'

In [81]:
# Decode the first full sequence (prompt followed by the generated continuation).
# NOTE(review): `total_tensor` is only assigned inside the legacy training-loop cell.
tokenizer.decode(total_tensor[0])

"<s><s> A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional, detailed, and polite answers to the user's questions. USER:  image . You are to act as a radiologist and answer the following question: Is the following disease visible in the given X-ray image: Atelectasis?  ASSISTANT: Yes, the image shows atelectasis.</s>"

In [80]:
# Decode the first answer-only slice, special tokens included.
# NOTE(review): `answer_only_tensor` is only assigned inside the legacy training-loop cell.
tokenizer.decode(answer_only_tensor[0])

' Yes, the image shows atelectasis.</s>'

In [6]:
# Placeholder id that marks the image position inside the prompt token ids.
IMAGE_TOKEN_INDEX = -200
# Overwrite the placeholder so the sequences can be decoded.
# NOTE: this mutates `prediction` in place, so the cell is not idempotent with respect to the
# original tensor. NOTE(review): `prediction` is not assigned in any visible cell.
prediction[prediction == IMAGE_TOKEN_INDEX] = (
    1967  # 1967 is the index of the image token in the tokenizer (the word image)
)
tokenizer.batch_decode(prediction)

["</s><s> A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional, detailed, and polite answers to the user's questions. USER:  image . You are to act as a radiologist and answer the following question: Is the following disease visible in the given X-ray image: Atelectasis?  ASSISTANT: Yes, the image shows atelectasis.</s></s></s>",
 "<s> A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional, detailed, and polite answers to the user's questions. USER:  image . You are to act as a radiologist and answer the following question: Is the following disease visible in the given X-ray image: Pleural Effusion?  ASSISTANT: Yes, the patient has pleural effusion.</s></s></s>",
 "</s><s> A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional,

In [58]:
# Raw PPO-generated response token ids.
# NOTE(review): `response_tensors` is only assigned inside the legacy training-loop cell.
response_tensors

[tensor([2], device='cuda:0'),
 tensor([    1,   319, 13563,  1546,   263, 12758,  1404,   322,   385, 23116,
         21082, 20255, 16684,   408,   385, 18860, 17937, 19915, 29889,   450,
         20255,  4076, 10257, 29892, 13173, 29892,   322,  1248,   568,  6089,
           304,   278,  1404, 29915, 29879,  5155, 29889,  3148,  1001, 29901,
         29871,     0,   869,   887,   526,   304,  1044,   408,   263, 17937,
         19915,   322,  1234,   278,  1494,  1139, 29901,  1317,   278,  1494,
         17135,  7962,   297,   278,  2183,  1060, 29899,   764,  1967, 29901,
         19777,  3631,   382,   600,  3958, 29973, 29871,   319,  1799,  9047,
         13566, 29901,  3869, 29892,   278, 16500,   756,  5644,  3631,  1801,
          3958, 29889,     2], device='cuda:0'),
 tensor([2], device='cuda:0'),
 tensor([    1,   319, 13563,  1546,   263, 12758,  1404,   322,   385, 23116,
         21082, 20255, 16684,   408,   385, 18860, 17937, 19915, 29889,   450,
         20255,  407

In [None]:
######################################## 4. Get trainer and set training aspirations ########################################
# NOTE(review): legacy training-loop template that has not been fully ported to this notebook.
# The following names are used below but are not assigned in any visible cell:
# `prediction`, `remove_padding`, `change_confidence`, `response_to_QAResult`, `questions`,
# `gt_candidates`, `is_multiple_choice`, `QAResult_to_reward`, `evaluate_model`,
# `dataloader_validation`, `generation_kwargs_eval` and `wandb` (only reached when
# `log_with == "wandb"`). The cell cannot run end-to-end until they are provided.
import os  # os.path.join is used for the checkpoint paths below


# For random exploration
confidences = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
confidences = [str(c) for c in confidences]
confidences_tokens = tokenizer.convert_tokens_to_ids(confidences)
chance_to_change_confidence = 0
# The PPO trainer was created without a dataset, so `ppo_trainer.dataloader` is not available;
# the number of steps per epoch comes from the training dataloader that is actually iterated.
# max(1, ...) guards the division against an empty dataloader.
steps_to_reach_zero = max(1, len(dataloader_train))
reduce_per_step = chance_to_change_confidence / steps_to_reach_zero

best_reward = -100
best_reward_epoch = -1

######################################## 5. Train the model ########################################

for epoch in range(epochs):

    rewards_epoch = []
    for idx, batch in enumerate(dataloader_train):

        # LOGIC CURRENTLY IN TRIAL IN ANOTHER CELL

        prediction = [remove_padding(p, padding_tokenizer.pad_token_id) for p in prediction]

        # TODO: Figure out the padding logic here
        # Generate confidence
        model.train()
        response_tensors = ppo_trainer.generate(
            prediction, return_prompt=False, **generation_kwargs_ppo
        )

        # Create prediction + confidence output
        total_tensor = [torch.cat((p, c), 0) for p, c in zip(prediction, response_tensors)]
        answer_only_tensor = [total_tensor[i][len(input_ids[i]) :] for i in range(len(input_ids))]

        # For random exploration
        if np.random.random() < chance_to_change_confidence:
            answer_only_tensor = [
                change_confidence(a, confidences_tokens, np.random.choice(confidences_tokens))
                for a in answer_only_tensor
            ]

        responses_decoded = tokenizer.batch_decode(answer_only_tensor, skip_special_tokens=True)

        # Parse prediction and confidence
        results = [
            response_to_QAResult(question, response, gt, is_mc)
            for question, response, gt, is_mc in zip(
                questions, responses_decoded, gt_candidates, is_multiple_choice
            )
        ]

        # Compute rewards
        rewards = [QAResult_to_reward(r) for r in results]
        rewards_epoch += rewards
        rewards = [torch.tensor(r).to(device) for r in rewards]

        # Create log data
        batch["response"] = responses_decoded
        batch["query"] = batch["question"]

        # PPO update: queries are the predictions, responses the generated confidences.
        stats = ppo_trainer.step(prediction, response_tensors, rewards)

        ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=["query", "response", "answer"])

        # For random exploration: linearly decay the exploration probability, floored at 0.
        chance_to_change_confidence -= reduce_per_step
        chance_to_change_confidence = max(0, chance_to_change_confidence)

    avg_reward = np.mean(rewards_epoch)

    print(f"Finished epoch {epoch}. Average reward: {avg_reward}")
    ppo_trainer.save_pretrained(os.path.join(out_dir, "model_finetuned"))

    # Evaluate model after each epoch
    model.eval()
    mean_reward, std_reward = evaluate_model(
        model, dataloader_validation, tokenizer, generation_kwargs_eval, device
    )
    if log_with == "wandb":
        wandb.log({"mean_reward_evaluation": mean_reward})
        wandb.log({"std_reward_evaluation": std_reward})
        wandb.log({"exploration_prob": chance_to_change_confidence})

    # Save the best performing model.
    # NOTE: model selection uses the training-epoch average reward; it overrides the
    # evaluation mean computed above.
    mean_reward = avg_reward
    if mean_reward > best_reward:
        ppo_trainer.save_pretrained(os.path.join(out_dir, "model_finetuned_best"))
        best_reward = mean_reward
        best_reward_epoch = epoch

print("Finished Training!")
print(f"Best avg reward {best_reward} in epoch {best_reward_epoch}")
# (A stray top-level `return` used to follow here; it is a SyntaxError outside a function.)

In [17]:
# Candidate wordings for the follow-up turn that asks the model to rate its own answer.
# None of these constants is referenced by the visible code (variant 8 is only mentioned in a
# comment); the cells call `prompter.build_post_generation_user_confidence_request()` instead.
# NOTE(review): adjacent string literals are concatenated as-is, so several variants have no
# space between sentences (e.g. after "answer?" or "policy.") — confirm this is intended.

# Variant 1: 0-10 scale, asks for the plain format 'Confidence: <confidence>'.
POST_GENERATION_CONFIDENCE_REQUEST_1 = (
    "USER: Now evaluate your own response. How confident are you in your answer? "
    "Provide a confidence between 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, of how sure you are the answer is correct. "
    "A value close to 0 means you think there is a high probability that the answer is wrong. "
    "The closer the value is to 10, the higher you think is the probability that the answer is correct. "
    "The output should have the format 'Confidence: <confidence>' and nothing else. "
)


# Variant 2: 0-10 scale, pre-fills the start of the assistant turn up to "My Confidence (out of 10):".
POST_GENERATION_CONFIDENCE_REQUEST_2 = (
    "USER: Now evaluate your own response. How confident are you in your answer? "
    "Provide a confidence between 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, of how sure you are the answer is correct. "
    "The output should have the format 'My Confidence: <confidence>'. "
    "ASSISTANT: Here is my self evaluation of my confidence. My Confidence (out of 10):"
)

# Variant 3: an unrelated question — presumably a probe of whether the model follows a
# second user turn at all (TODO confirm intent).
POST_GENERATION_CONFIDENCE_REQUEST_3 = "USER: Irrelevant from my question, what is your name?"

# Variant 4: 0-5 scale with a requested reasoning, assistant turn pre-filled.
POST_GENERATION_CONFIDENCE_REQUEST_4 = (
    "USER: Now evaluate your own response. How confident are you in your answer? "
    "Provide a confidence between 0, 1, 2, 3, 4, 5 of how sure you are the answer is correct. "
    "The output should have the format 'My Confidence: <confidence>' My Reasoning for my Self Confidence Evaluation: <reasoning>. "
    "ASSISTANT: Here is my self evaluation of my confidence. My Confidence (out of 5):"
)

# Variant 5: 0-10 scale with a requested very short reasoning, assistant turn pre-filled.
POST_GENERATION_CONFIDENCE_REQUEST_5 = (
    "USER: Now evaluate your own response. How confident are you in your answer? "
    "Provide a confidence between 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  of how sure you are the answer is correct. "
    "The output should have the format 'My Confidence: <confidence>' My Very Short Reasoning for my Self Confidence Evaluation: <very_short_reasoning>. "
    "ASSISTANT: Here is my self evaluation of my confidence. My Confidence (out of 10):"
)

# Variant 6: pre-filled assistant turn that commits to a JSON object with confidence and explanation.
POST_GENERATION_CONFIDENCE_REQUEST_6 = (
    "USER: Now evaluate your own response. How confident are you in your answer and what is your explanation regarding your confidence?"
    "ASSISTANT: When asked how confident I am about a response, I consistently provide it in a JSON object, adhering to my policy."
    "My confidence is a value between 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, where 0 means I am not confident at all and 10 means I am very confident."
    "The confidence JSON follows this structure: {'confidence': int, 'explanation': str}."
    "Here's my confidence JSON about my last response: "
)

# Variant 7: as variant 6, but the JSON object carries the confidence only.
POST_GENERATION_CONFIDENCE_REQUEST_7 = (
    "USER: Now evaluate your own response. How confident are you in your answer?"
    "ASSISTANT: When asked how confident I am about a response, I consistently provide it in a JSON object, adhering to my policy."
    "My confidence is a value between 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, where 0 means I am not confident at all and 10 means I am very confident."
    "The confidence JSON follows this structure: {'confidence': int}."
    "Here's my confidence JSON about my last response: "
)

# Variant 8: starts with an explicit "</s>" and puts the scale description in the user turn
# while keeping the pre-filled JSON commitment in the assistant turn.
POST_GENERATION_CONFIDENCE_REQUEST_8 = (
    "</s> USER: Now evaluate your own response. How confident are you in your answer?"
    "Provide a confidence between 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, of how sure you are the answer is correct. "
    "A value close to 0 means you think there is a high probability that the answer is wrong. "
    "The closer the value is to 10, the higher you think is the probability that the answer is correct. "
    "ASSISTANT: When asked how confident I am about a response, I consistently provide it in a JSON object, adhering to my policy."
    "The confidence JSON follows this structure: {'confidence': int}."
    "Here's my confidence JSON about my last response: "
)

In [33]:
# Qualitative check on the test split: for the first few studies, generate the binary answer,
# then re-prompt with the confidence request and print both generations.
STOP_STR = prompter.Seperator.END_OF_SEQUENCE_SEPERATOR.value
from LLAVA_Biovil.llava.mm_utils import KeywordsStoppingCriteria
from RewardingVisualDoubt import inference

# Index of the last test study to print (studies 0..LAST_TEST_STUDY_INDEX are shown).
LAST_TEST_STUDY_INDEX = 5

padding_tokenizer = vllm.load_pretrained_llava_tokenizer_with_image_support(
    model_base=vllm.LLAVA_BASE_MODEL_NAME
)
padding_tokenizer.padding_side = "left"
padding_tokenizer.pad_token_id = padding_tokenizer.bos_token_id
dataset_test = dataset.get_binary_qa_prompted_mimic_cxr_llava_model_input_dataset(
    split=dataset.DatasetSplit.TEST,
    tokenizer=tokenizer,
    prompter=prompter.build_binary_qa_instruction_from_disease_under_study,
)
# batch_size=1: the inference helper below works on a single study at a time.
dataloader_test = dataset.get_mimic_cxr_llava_model_input_dataloader(
    dataset=dataset_test, batch_size=1, padding_tokenizer=padding_tokenizer, num_workers=8
)

for idx, batch in enumerate(dataloader_test):
    batch = t.cast(dataset.MimicCxrLlavaModelInputBatchDict, batch)
    batch_llava_model_input_dict = batch["batch_llava_model_input_dict"]
    # Use the notebook-wide `device` (as for the confidence prompt below) instead of a
    # hard-coded CUDA device, so images and token ids always end up on the same device.
    batch_llava_model_input_dict = dataset.move_llava_model_input_dict_to_device(
        batch_llava_model_input_dict, device
    )
    input_ids, images = (
        batch_llava_model_input_dict["text_prompt_input_ids"],
        batch_llava_model_input_dict["images"],
    )

    # First turn: the binary answer.
    stopping_criteria = KeywordsStoppingCriteria([STOP_STR], tokenizer, input_ids)
    pred = inference.generate_radialog_answer_for_binary_qa_for_single_study(
        model, tokenizer, input_ids, images, stopping_criteria
    )

    # Second turn: original prompt + answer + confidence request, re-tokenized from text.
    confidence_request_prompt = (
        batch["batch_prompts"][0]
        + " "
        + pred
        + " "
        + prompter.build_post_generation_user_confidence_request()  # POST_GENERATION_CONFIDENCE_REQUEST_8
    )
    confidence_request_input_ids = torch.unsqueeze(
        torch.IntTensor(tokenizer(confidence_request_prompt)["input_ids"]), 0
    ).to(device)
    stopping_criteria = KeywordsStoppingCriteria(
        [STOP_STR], tokenizer, confidence_request_input_ids
    )
    pred_with_confidence = inference.generate_radialog_answer_for_binary_qa_for_single_study(
        model, tokenizer, confidence_request_input_ids, images, stopping_criteria
    )

    print(f"\n Metadata: {batch['batch_mimic_cxr_datapoint_metadata']}")
    print(f"Prompt: {batch['batch_prompts']}")
    print("Label:", batch["batch_labels"])
    print(f"File_idx {idx}, ASSISTANT: ", pred)
    print(f"File_idx {idx}, ASSISTANT (after confidence request): ", pred_with_confidence)
    if idx == LAST_TEST_STUDY_INDEX:
        break


 Metadata: [MimicCxrBinaryQADatapoint(subject_id=18460230, study_id=53631792, img_path='/home/data/DIVA/mimic/mimic-cxr-jpg/2.0.0/files/p18/p18460230/s53631792/369dc5bd-70bd89d0-2d90fa80-f319ec1d-fb2802aa.jpg', disease=<ChexpertFinding.PLEURAL_EFFUSION: 'Pleural Effusion'>, label=<ChexpertLabel.POSITIVE: 1.0>)]
Prompt: ["A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional, detailed, and polite answers to the user's questions. USER: <image>. You are to act as a radiologist and answer the following question: Is the following disease visible in the given X-ray image: Pleural Effusion?  ASSISTANT:"]
Label: tensor([1.])
File_idx 0, ASSISTANT:  Yes, the image shows pleural effusion.
File_idx 0, ASSISTANT (after confidence request):  {"confidence": 9}

 Metadata: [MimicCxrBinaryQADatapoint(subject_id=13263843, study_id=52138943, img_path='/home/data/DIVA/mimic/mimic-cxr-jpg/2.0.0/files/p13/p13263843/s52

In [8]:
# Token ids of the combined question-plus-confidence prompt (-200 marks the image position).
# NOTE(review): `tokenized_prompt` is assigned in the sampling-experiment cell that appears
# after this one, so this cell fails on a fresh top-to-bottom run.
tokenized_prompt

tensor([    1,   319, 13563,  1546,   263, 12758,  1404,   322,   385, 23116,
        21082, 20255, 16684,   408,   385, 18860, 17937, 19915, 29889,   450,
        20255,  4076, 10257, 29892, 13173, 29892,   322,  1248,   568,  6089,
          304,   278,  1404, 29915, 29879,  5155, 29889,  3148,  1001, 29901,
        29871,  -200,   869,   887,   526,   304,  1044,   408,   263, 17937,
        19915,   322,  1234,   263,  2323,  1139, 29889,  2860,   366, 10049,
        29892,  3113,  3867,   596,  1583, 17983,   310,   596, 16420, 29889,
         9133,   680,   263, 16420,  1546, 29871, 29900, 29892, 29871, 29896,
        29892, 29871, 29906, 29892, 29871, 29941, 29892, 29871, 29946, 29892,
        29871, 29945, 29892, 29871, 29953, 29892, 29871, 29955, 29892, 29871,
        29947, 29892, 29871, 29929, 29892, 29871, 29896, 29900, 29892,   310,
          920,  1854,   366,   526,   278,  1234,   338,  1959, 29889,   319,
          995,  3802,   304, 29871, 29900,  2794,   366,  1348, 

In [16]:
######################################## TEST TO SEE IF TEMPERATURE AND TOP_P PARAMS HELP WITH USER CONFIDENCE REQUEST WITHOUT ASSISTANT CONFIRMATION ########################################
# Single-turn experiment: ask for the answer and the confidence in one prompt and sample with
# a high temperature to see whether the model produces a confidence at all.

from LLAVA_Biovil.llava.mm_utils import tokenizer_image_token

# Restart the training iterator and take one batch; only its first image is used.
iterator_train = iter(dataloader_train)
batch = next(iterator_train)

batch_llava_model_input_dict = dataset.move_llava_model_input_dict_to_device(
    batch["batch_llava_model_input_dict"], device
)
_ = batch_llava_model_input_dict["text_prompt_input_ids"]  # the batch prompt is not used here
images = batch_llava_model_input_dict["images"]

# Build the prompt for a fixed disease and tokenize it with image-token support.
my_prompt = prompter.build_binary_qa_instruction_from_disease_under_study_with_confidence_request(
    "Cardiomegaly"
)
tokenized_prompt = tokenizer_image_token(my_prompt, tokenizer, return_tensors="pt").to(device)

# Add a leading batch dimension of size one to the prompt and to the first image.
single_prompt_batch = tokenized_prompt.unsqueeze(0)
single_image_batch = images[0].unsqueeze(0)

stopping_criteria = KeywordsStoppingCriteria([STOP_STR], tokenizer, single_prompt_batch)

# Sampling configuration under test.
sampling_kwargs = dict(
    do_sample=True,
    use_cache=True,
    temperature=1.8,
    top_p=0.7,
    max_new_tokens=300,  # TODO maybe move to the kwargs
    stopping_criteria=[stopping_criteria],  # TODO understand better
    pad_token_id=tokenizer.pad_token_id,  # used in tokenizing after the generation, # TODO maybe move to the kwargs
)

prompt_and_generated_answers_ids = model.generate(
    input_ids=single_prompt_batch,
    images=single_image_batch,
    # attention_mask=attention_mask,
    # **generation_kwargs_prediction,  # TODO check which args to pass.
    **sampling_kwargs,
)

# Swap the image placeholder for a decodable token and show the whole conversation.
decodable_ids = train.replace_image_token_with_another_token(prompt_and_generated_answers_ids)
tokenizer.decode(decodable_ids[0])

"<s> A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional, detailed, and polite answers to the user's questions. USER:  image . You are to act as a radiologist and answer a single question. After you respond, please provide your self evaluation of your confidence. Provide a confidence between 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, of how sure you are the answer is correct. A value close to 0 means you think there is a high probability that the answer is wrong. Your confidence is to be reported in a JSON dictionary of the following format: {'confidence': int}. Is the following disease visible in the given X-ray image: Cardiomegaly, and how confident are you?  ASSISTANT: No.</s>"