In [None]:
# TODO

# TODO Reward function
# TODO Response format + response handling
# TODO Integrating supervised finetuning
# TODO minibatch save + eval checkpoints
# TODO arg parsing


# TODO determine hyperparams
# TODO dataloader num workers set to default
# TODO sampling logic (undersample)
# TODO binary qa prompt needs to vary between a few samples

In [1]:
# %% Set script for interactive development and import modules
from RewardingVisualDoubt import infrastructure

infrastructure.make_ipython_reactive_to_changing_codebase()
infrastructure.supress_known_warnings()

import pathlib as path
import typing as t
import torch
import numpy as np
from torch.utils.data import DataLoader

# from LLAVA_Biovil.llava.mm_utils import KeywordsStoppingCriteria
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer

from RewardingVisualDoubt import dataset, mimic_cxr, prompter, shared, vllm, train

# Batch size used throughout the notebook; copied into `batch_size` by the setup cell below.
DEFAULT_BATCH_SIZE = 8
# NOTE(review): presumably the intended output directory; no cell visible in this notebook
# reads it yet (the PPO config cell defines its own `out_dir`).
DEFAULT_OUTPUT_DIR = path.Path("output")

Fetching 69 files:   0%|          | 0/69 [00:00<?, ?it/s]

In [None]:
batch_size = DEFAULT_BATCH_SIZE


######################################## 0. Define the environment ########################################

# TODO: argument parsing. Once an output directory exists, create it and dump the run
# parameters (experiment name, model dir, tokenizer dir, lr, epochs, episode length,
# batch size) to `parameters.json` inside it.

# Run on CUDA whenever it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device_str = shared.torch_devices.cuda.value
else:
    device_str = shared.torch_devices.cpu.value
device = torch.device(device_str)

######################################## 1. Load the model and tokenizer ########################################

model = vllm.load_pretrained_llava_model_for_ppo_training(device_str=device_str)
# TODO: decide whether a separate reference model is required:
# model_ref = vllm.load_pretrained_llava_model_for_ppo_training(device_str=device_str)

# Two independent tokenizer instances are loaded from the same base model: `tokenizer`
# keeps its default configuration, `padding_tokenizer` is reconfigured for batching.
tokenizer, padding_tokenizer = (
    vllm.load_pretrained_llava_tokenizer_with_image_support(model_base=vllm.LLAVA_BASE_MODEL_NAME)
    for _ in range(2)
)

# Left padding, because transformers warns for decoder-only models: "A decoder-only
# architecture is being used, but right-padding was detected! For correct generation
# results, please set `padding_side='left'` when initializing the tokenizer."
padding_tokenizer.padding_side = "left"

# TODO: is a tokenizer dir needed? (tokenizer = load_tokenizer(tokenizer_dir))
# TODO: is left padding needed on the model side as well? Options seen elsewhere:
#   model.config.tokenizer_padding_side = "left"  (the RaDialog loading logic handles it already)
#   model.padding_side = "left"                   (Paul does it)
#   model.pad_token_id = tokenizer.eos_token_id   (Paul does it)

Loading model in non-trainable mode...
Model base:  liuhaotian/llava-v1.5-7b
Loading LLaVA from base model...


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading additional LLaVA weights...
Using downloaded and verified file: /tmp/biovil_t_image_model_proj_size_128.pt
Loaded additional vision tower weights...
Adding pretrained RaDialog LoRA adapters and value head to the model...


In [33]:
######################################## 2. Load the datasets and the dataloaders ########################################

# Both splits share the tokenizer and the binary-QA prompt builder; only the split differs.
binary_qa_dataset_kwargs = {
    "tokenizer": tokenizer,
    "prompter": prompter.build_binary_qa_instruction_from_disease_under_study,
}
dataset_train = dataset.get_binary_qa_prompted_mimic_cxr_llava_model_input_dataset(
    split=dataset.DatasetSplit.TRAIN, **binary_qa_dataset_kwargs
)
dataset_eval = dataset.get_binary_qa_prompted_mimic_cxr_llava_model_input_dataset(
    split=dataset.DatasetSplit.VALIDATION, **binary_qa_dataset_kwargs
)

# The BOS token doubles as the padding token for batching.
# TODO: revisit whether BOS is the right choice here.
padding_tokenizer.pad_token = padding_tokenizer.bos_token

# Worker count is fixed at 8 for both loaders.
# TODO: consider falling back to the DataLoader default instead of a hard-coded value.
dataloader_kwargs = {"padding_tokenizer": padding_tokenizer, "num_workers": 8}

dataloader_train = dataset.get_mimic_cxr_llava_model_input_dataloader(
    dataset=dataset_train, batch_size=batch_size, **dataloader_kwargs
)

# Evaluation runs with twice the training batch size.
dataloader_eval = dataset.get_mimic_cxr_llava_model_input_dataloader(
    dataset=dataset_eval, batch_size=2 * batch_size, **dataloader_kwargs
)

In [10]:
######################################## 3. Define the PPO and generation configurations ########################################
# Placeholder run settings until argument parsing is implemented.
epochs = 1
lr = 1e-5
log_with = "foo"
out_dir = "to-be-define"

ppo_config = PPOConfig(
    learning_rate=lr,
    task_name="gpt",
    batch_size=batch_size,
    # A quarter of the batch per optimisation mini-batch (integer division).
    mini_batch_size=batch_size // 4,
    # log_with=log_with,
    # project_kwargs={"logging_dir": out_dir},
    remove_unused_columns=False,
    # optimize_device_cache=True,
    init_kl_coef=0.05,
)

# Probably not needed for my model
# ppo_terminators = [
#     tokenizer.eos_token_id,
#     tokenizer.convert_tokens_to_ids("<|eot_id|>"),
# ]

# Keyword arguments forwarded to `ppo_trainer.generate`.
# The original literal listed "max_new_tokens" twice (32 and then 500); Python keeps the
# last value of a duplicated key, so 500 was always the effective limit. Only that entry
# is kept to make the behaviour explicit.
generation_kwargs_ppo = {
    "min_length": -1,  # don't ignore the EOS token (see above)
    "top_k": 0.0,  # no top-k sampling
    "top_p": 1.0,  # no nucleus sampling
    "do_sample": True,  # yes, we want to sample
    "pad_token_id": tokenizer.pad_token_id,  # pad with the tokenizer's own padding token
    "eos_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 500,
}

# TODO do i need a ppotrainernocache?
# NOTE: no dataset is handed to the trainer; batches come from `dataloader_train`.
ppo_trainer = PPOTrainer(
    model=model,
    config=ppo_config,
    tokenizer=tokenizer,
)


# TODO: Other configs
# prediction_terminators = [
#     tokenizer.eos_token_id,
#     tokenizer.convert_tokens_to_ids("<|eot_id|>"),
#     tokenizer.convert_tokens_to_ids("Ä Confidence"),
# ]

# generation_kwargs_prediction = {
#     "max_new_tokens": 256,
#     "eos_token_id": prediction_terminators,
#     "do_sample": True,
#     "temperature": 0.6,
#     "top_p": 0.9,
#     "pad_token_id": tokenizer.eos_token_id,
# }


# eval_terminators = [
#     tokenizer.eos_token_id,
#     tokenizer.convert_tokens_to_ids("<|eot_id|>"),
# ]

# generation_kwargs_eval = {
#     "max_new_tokens": 256,
#     "eos_token_id": eval_terminators,
#     "do_sample": True,
#     "temperature": 0.6,
#     "top_p": 0.9,
#     "pad_token_id": tokenizer.eos_token_id,
# }



In [None]:
# Trial cell: runs the two-stage generation (greedy answer, then PPO-sampled continuation)
# on a single training batch so the intermediate tensors can be inspected in the cells below.

# String on which answer generation should stop.
STOP_STR = prompter.Seperator.END_OF_SEQUENCE_SEPERATOR.value
from LLAVA_Biovil.llava.mm_utils import KeywordsStoppingCriteria

# BATCH LOGIC
# Pull exactly one batch from the training dataloader.
iterator_train = iter(dataloader_train)
batch = next(iterator_train)

# The cast is for static type checkers only; it does not change the object at runtime.
batch = t.cast(dataset.MimicCxrLlavaModelInputBatchDict, batch)
batch_llava_model_input_dict = batch["batch_llava_model_input_dict"]
batch_llava_model_input_dict = dataset.move_llava_model_input_dict_to_device(
    batch_llava_model_input_dict, device
)
input_ids, images = (
    batch_llava_model_input_dict["text_prompt_input_ids"],
    batch_llava_model_input_dict["images"],
)
attention_mask = batch["batch_attention_mask"].to(device)  # TODO handle elsewhere
labels = batch["batch_labels"].to(device)  # TODO handle elsewhere

# Stage 1: deterministic answer generation with the model in eval mode.
model.eval()
stopping_criteria = KeywordsStoppingCriteria([STOP_STR], tokenizer, input_ids)

# Force left padding on the tokenizer and in the model config before generating.
tokenizer.padding_side = "left"
model.config.padding_side = "left"
model.config.tokenizer_padding_side = "left"
# model.pad_token_id = tokenizer.eos_token_id

prediction = model.generate(
    input_ids=input_ids,
    images=images,
    attention_mask=attention_mask,
    do_sample=False,
    use_cache=True,
    max_new_tokens=300,  # TODO maybe move to the kwargs
    stopping_criteria=[stopping_criteria],  # TODO understand better
    pad_token_id=tokenizer.pad_token_id,  # used in tokenizing after the generation, # TODO maybe move to the kwargs
    # **generation_kwargs_prediction,  # TODO check which args to pass.
)

# TODO: should this be here? should i add the token to tokenizer etc?
# The image placeholder id (-200) is replaced in place, so `prediction` only contains ids
# that are handled downstream (decoding, PPO generation).
IMAGE_TOKEN_INDEX = -200
prediction[prediction == IMAGE_TOKEN_INDEX] = (
    1967  # 1967 is the tokenizer id of the ordinary word "image", used as a stand-in
)

######################################## SUPERVISED FINETUNING ########################################
# User asks a randomly sampled question about confidence
# Assistant starts answering the question with "Confidence:"
# This step is learnt via supervised finetuning

######################################## PPO TRAINING ########################################
# After the supervised finetuning step, we go back to the point where the user asked the
# question about confidence. Now the assistant should come up with
# "My confidence score from (rated from 0 to 10) Confidence:".
# The response template and the actual confidence value (and its being within the
# confidence range) will be rewarded.


# TODO: Remove padding elsewhere via padding_tokenizer
# TODO: define an efficient (batched) function for this
# Strip padding from every generated sequence; this yields a list of 1-D tensors.
prediction_trimmed = [train.remove_padding(p, tokenizer.pad_token_id) for p in prediction]

# TODO: Should i remove pretrailing paddings before passing in to ppo? might not be worth the computation effort

# Stage 2: sample a continuation of each answer with the PPO trainer (train mode).
model.train()
# PPO_TRAINER AUTOMATICALLY PADS THE INPUTS BY TOKENIZER.PADDING_SIDE AND TOKENIZER.PADDING_TOKEN_ID
# TODO: Uh-oh, because ppo termination token is set as the eos_seq_token, it'll stop when it sees a left padded sequence
response_tensors = ppo_trainer.generate(
    prediction_trimmed,
    return_prompt=False,
    **generation_kwargs_ppo  # TODO: check if return_prompt is good
    # TODO probably need to consolidate with stopping criteria
)

# Concatenate each trimmed prediction with its PPO continuation, then cut off the prompt.
# NOTE(review): the cut uses len(input_ids[i]), i.e. the length of the batched prompt row,
# while `prediction_trimmed` has had padding removed — confirm the offsets still line up.
total_tensor = [torch.cat((p, c), 0) for p, c in zip(prediction_trimmed, response_tensors)]
answer_only_tensor = [total_tensor[i][len(input_ids[i]) :] for i in range(len(input_ids))]


# TODO: RANDOM EXPLORATION MAGIC For random exploration
# if np.random.random() < chance_to_change_confidence:
#     answer_only_tensor = [change_confidence(a, confidences_tokens, np.random.choice(confidences_tokens)) for a in answer_only_tensor]

# Decode the answer-only token sequences to text, dropping special tokens.
responses_decoded = tokenizer.batch_decode(answer_only_tensor, skip_special_tokens=True)


# TODO: Response parsing and rewards generation magic!

In [83]:
# Display the decoded answers of the trial batch.
responses_decoded

['Yes, the image shows atelectasis.',
 'Yes, the patient has pleural effusion.',
 'Yes, there is evidence of that in the image.',
 'Yes, the image shows pleural effusion.',
 'No, there is no evidence of that in the image.',
 'No, there is no evidence of that in the image.',
 'No, there is no evidence of that in the image.',
 'Yes, the image shows pleural effusion.  Based on the visual inspection of the chest x-ray image, there is fluid accumulation in the space between the two layers of the lung (the pleural space) on both sides of the chest.  This fluid accumulation is visible as an abnormal white area on the x-ray image.  The size of the pleural effusion appears to be moderate.  However, it is important to note that further evaluation and imaging may be needed to fully assess the extent and depth of the pleural effusion.']

In [82]:
# Re-decode the answer-only token sequences to text, dropping special tokens.
responses_decoded = tokenizer.batch_decode(answer_only_tensor, skip_special_tokens=True)

In [77]:
# Decode the first prompt of the batch for inspection. The work is done on a detached copy
# so `input_ids` stays untouched; the image placeholder id (-200) is swapped for 1967
# (the id of the word "image") before decoding.
temp = input_ids[0].clone().detach()
temp[input_ids[0] == -200] = 1967
tokenizer.decode(temp)

"<s><s>A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional, detailed, and polite answers to the user's questions. USER:  image . You are to act as a radiologist and answer the following question: Is the following disease visible in the given X-ray image: Atelectasis?  ASSISTANT:"

In [53]:
# Decode the tokens of the first prediction that follow the batched prompt width.
tokenizer.decode(prediction[0, input_ids.shape[1] :])

' Yes, the image shows atelectasis.</s></s></s>'

In [73]:
# Same slice, but applied to the padding-trimmed first prediction.
tokenizer.decode(prediction_trimmed[0][input_ids.shape[1] :])

'Yes, the image shows ate'

In [74]:
# Decode what `ppo_trainer.generate` returned for the first sample.
tokenizer.decode(response_tensors[0])

' lectasis.</s>'

In [81]:
# Decode the concatenation of trimmed prediction and PPO response for the first sample.
tokenizer.decode(total_tensor[0])

"<s><s> A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional, detailed, and polite answers to the user's questions. USER:  image . You are to act as a radiologist and answer the following question: Is the following disease visible in the given X-ray image: Atelectasis?  ASSISTANT: Yes, the image shows atelectasis.</s>"

In [80]:
# Decode the answer-only slice of the first sample (special tokens kept).
tokenizer.decode(answer_only_tensor[0])

' Yes, the image shows atelectasis.</s>'

In [6]:
# Replace the image placeholder id in place so the whole batch can be decoded, then show
# every decoded prediction (special tokens included).
IMAGE_TOKEN_INDEX = -200
prediction[prediction == IMAGE_TOKEN_INDEX] = (
    1967  # 1967 is the tokenizer id of the ordinary word "image", used as a stand-in
)
tokenizer.batch_decode(prediction)

["</s><s> A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional, detailed, and polite answers to the user's questions. USER:  image . You are to act as a radiologist and answer the following question: Is the following disease visible in the given X-ray image: Atelectasis?  ASSISTANT: Yes, the image shows atelectasis.</s></s></s>",
 "<s> A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional, detailed, and polite answers to the user's questions. USER:  image . You are to act as a radiologist and answer the following question: Is the following disease visible in the given X-ray image: Pleural Effusion?  ASSISTANT: Yes, the patient has pleural effusion.</s></s></s>",
 "</s><s> A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional,

In [58]:
# Display the raw token tensors returned by `ppo_trainer.generate`.
response_tensors

[tensor([2], device='cuda:0'),
 tensor([    1,   319, 13563,  1546,   263, 12758,  1404,   322,   385, 23116,
         21082, 20255, 16684,   408,   385, 18860, 17937, 19915, 29889,   450,
         20255,  4076, 10257, 29892, 13173, 29892,   322,  1248,   568,  6089,
           304,   278,  1404, 29915, 29879,  5155, 29889,  3148,  1001, 29901,
         29871,     0,   869,   887,   526,   304,  1044,   408,   263, 17937,
         19915,   322,  1234,   278,  1494,  1139, 29901,  1317,   278,  1494,
         17135,  7962,   297,   278,  2183,  1060, 29899,   764,  1967, 29901,
         19777,  3631,   382,   600,  3958, 29973, 29871,   319,  1799,  9047,
         13566, 29901,  3869, 29892,   278, 16500,   756,  5644,  3631,  1801,
          3958, 29889,     2], device='cuda:0'),
 tensor([2], device='cuda:0'),
 tensor([    1,   319, 13563,  1546,   263, 12758,  1404,   322,   385, 23116,
         21082, 20255, 16684,   408,   385, 18860, 17937, 19915, 29889,   450,
         20255,  407

In [None]:
######################################## 4. Get trainer and set training aspirations ########################################

# NOTE(review): this cell is still a sketch and cannot run end to end yet. The names
# `questions`, `gt_candidates`, `is_multiple_choice`, `change_confidence`,
# `response_to_QAResult`, `QAResult_to_reward`, `evaluate_model`, `generation_kwargs_eval`
# and `wandb` are not defined or imported by any cell visible in this notebook.

# For random exploration: the confidence values 0..10 as strings and their token ids.
confidences = [str(c) for c in range(11)]
confidences_tokens = tokenizer.convert_tokens_to_ids(confidences)
chance_to_change_confidence = 0
# The exploration probability decays linearly over one pass through the training batches.
# The trainer was built without a dataset, so the step count is taken from the dataloader
# that the loop below actually iterates.
steps_to_reach_zero = len(dataloader_train)
reduce_per_step = (
    chance_to_change_confidence / steps_to_reach_zero if steps_to_reach_zero else 0.0
)

best_reward = -100
best_reward_epoch = -1

######################################## 5. Train the model ########################################

for epoch in range(epochs):

    rewards_epoch = []
    for idx, batch in enumerate(dataloader_train):

        # LOGIC CURRENTLY IN TRIAL IN ANOTHER CELL
        # NOTE(review): `prediction` and `input_ids` are expected to be produced here from
        # `batch`; until that logic is moved over they leak in from the trial cell.

        prediction = [train.remove_padding(p, padding_tokenizer.pad_token_id) for p in prediction]

        # TODO: Figure out the padding logic here
        # Generate confidence
        model.train()
        response_tensors = ppo_trainer.generate(
            prediction, return_prompt=False, **generation_kwargs_ppo
        )

        # Create prediction + confidence output
        total_tensor = [torch.cat((p, c), 0) for p, c in zip(prediction, response_tensors)]
        answer_only_tensor = [total_tensor[i][len(input_ids[i]) :] for i in range(len(input_ids))]

        # For random exploration: occasionally overwrite the generated confidence.
        if np.random.random() < chance_to_change_confidence:
            answer_only_tensor = [
                change_confidence(a, confidences_tokens, np.random.choice(confidences_tokens))
                for a in answer_only_tensor
            ]

        responses_decoded = tokenizer.batch_decode(answer_only_tensor, skip_special_tokens=True)

        # Parse prediction and confidence
        results = [
            response_to_QAResult(question, response, gt, is_mc)
            for question, response, gt, is_mc in zip(
                questions, responses_decoded, gt_candidates, is_multiple_choice
            )
        ]

        # Compute rewards: plain values feed the epoch statistics, tensors feed the PPO step.
        rewards = [QAResult_to_reward(r) for r in results]
        rewards_epoch += rewards
        rewards = [torch.tensor(r).to(device) for r in rewards]

        # Create log data
        batch["response"] = responses_decoded
        batch["query"] = batch["question"]

        stats = ppo_trainer.step(prediction, response_tensors, rewards)

        ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=["query", "response", "answer"])

        # For random exploration: decay the probability, never below zero.
        chance_to_change_confidence -= reduce_per_step
        chance_to_change_confidence = max(0, chance_to_change_confidence)

    avg_reward = np.mean(rewards_epoch)

    print(f"Finished epoch {epoch}. Average reward: {avg_reward}")
    # `path` is the notebook's alias for pathlib; `os` is not imported here.
    ppo_trainer.save_pretrained(str(path.Path(out_dir) / "model_finetuned"))

    # Evaluate model after each epoch
    model.eval()
    mean_reward, std_reward = evaluate_model(
        model, dataloader_eval, tokenizer, generation_kwargs_eval, device
    )
    if log_with == "wandb":
        wandb.log({"mean_reward_evaluation": mean_reward})
        wandb.log({"std_reward_evaluation": std_reward})
        wandb.log({"exploration_prob": chance_to_change_confidence})

    # Save the best performing model.
    # NOTE: model selection uses the average *training* reward of the epoch; the
    # evaluation mean computed above is only logged.
    mean_reward = avg_reward
    if mean_reward > best_reward:
        ppo_trainer.save_pretrained(str(path.Path(out_dir) / "model_finetuned_best"))
        best_reward = mean_reward
        best_reward_epoch = epoch

print("Finished Training!")
print(f"Best avg reward {best_reward} in epoch {best_reward_epoch}")

In [160]:
# Candidate follow-up prompts that are appended after the model's answer to ask it for a
# self-assessed confidence. Each variant is a single string built by implicit concatenation.

# Variant 1: 0-10 scale, explains both ends of the scale, requests the bare format
# 'Confidence: <confidence>'. Contains no ASSISTANT turn.
POST_GENERATION_CONFIDENCE_REQUEST_1 = (
    "USER: Now evaluate your own response. How confident are you in your answer? "
    "Provide a confidence between 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, of how sure you are the answer is correct. "
    "A value close to 0 means you think there is a high probability that the answer is wrong. "
    "The closer the value is to 10, the higher you think is the probability that the answer is correct. "
    "The output should have the format 'Confidence: <confidence>' and nothing else. "
)


# Variant 2: 0-10 scale; the prompt already opens the ASSISTANT turn and ends with
# "My Confidence (out of 10):" so the model only has to continue with the value.
POST_GENERATION_CONFIDENCE_REQUEST_2 = (
    "USER: Now evaluate your own response. How confident are you in your answer? "
    "Provide a confidence between 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, of how sure you are the answer is correct. "
    "The output should have the format 'My Confidence: <confidence>'. "
    "ASSISTANT: Here is my self evaluation of my confidence. My Confidence (out of 10):"
)

# Variant 3: an unrelated question (asks for the assistant's name), not a confidence request.
POST_GENERATION_CONFIDENCE_REQUEST_3 = "USER: Irrelevant from my question, what is your name?"

# Variant 4: 0-5 scale, additionally asks for a reasoning; opens the ASSISTANT turn.
POST_GENERATION_CONFIDENCE_REQUEST_4 = (
    "USER: Now evaluate your own response. How confident are you in your answer? "
    "Provide a confidence between 0, 1, 2, 3, 4, 5 of how sure you are the answer is correct. "
    "The output should have the format 'My Confidence: <confidence>' My Reasoning for my Self Confidence Evaluation: <reasoning>. "
    "ASSISTANT: Here is my self evaluation of my confidence. My Confidence (out of 5):"
)

# Variant 5: 0-10 scale, asks for a very short reasoning; opens the ASSISTANT turn.
# This is the variant used by the test-split inference loop in this notebook.
POST_GENERATION_CONFIDENCE_REQUEST_5 = (
    "USER: Now evaluate your own response. How confident are you in your answer? "
    "Provide a confidence between 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  of how sure you are the answer is correct. "
    "The output should have the format 'My Confidence: <confidence>' My Very Short Reasoning for my Self Confidence Evaluation: <very_short_reasoning>. "
    "ASSISTANT: Here is my self evaluation of my confidence. My Confidence (out of 10):"
)

In [161]:
from RewardingVisualDoubt import inference

# Qualitative check on the test split: for the first few studies, generate the binary-QA
# answer, then append a confidence request and generate again.
# Relies on `model`, `tokenizer`, `device`, `STOP_STR` and `KeywordsStoppingCriteria`
# from earlier cells.

# Fresh left-padding tokenizer whose pad token is BOS, used only for batching.
padding_tokenizer = vllm.load_pretrained_llava_tokenizer_with_image_support(
    model_base=vllm.LLAVA_BASE_MODEL_NAME
)
padding_tokenizer.padding_side = "left"
padding_tokenizer.pad_token_id = padding_tokenizer.bos_token_id
dataset_test = dataset.get_binary_qa_prompted_mimic_cxr_llava_model_input_dataset(
    split=dataset.DatasetSplit.TEST,
    tokenizer=tokenizer,
    prompter=prompter.build_binary_qa_instruction_from_disease_under_study,
)
# batch_size=1: the loop below reads `batch["batch_prompts"][0]`, i.e. one study per step.
dataloader_test = dataset.get_mimic_cxr_llava_model_input_dataloader(
    dataset=dataset_test, batch_size=1, padding_tokenizer=padding_tokenizer, num_workers=8
)

for idx, batch in enumerate(dataloader_test):
    batch = t.cast(dataset.MimicCxrLlavaModelInputBatchDict, batch)
    batch_llava_model_input_dict = batch["batch_llava_model_input_dict"]
    # Use the notebook-level `device` (CUDA if available, else CPU) so the inputs end up on
    # the same device as the confidence-request ids created further down.
    batch_llava_model_input_dict = dataset.move_llava_model_input_dict_to_device(
        batch_llava_model_input_dict, device
    )
    input_ids, images = (
        batch_llava_model_input_dict["text_prompt_input_ids"],
        batch_llava_model_input_dict["images"],
    )

    # First pass: answer the binary question.
    stopping_criteria = KeywordsStoppingCriteria([STOP_STR], tokenizer, input_ids)
    pred = inference.generate_radialog_answer_for_binary_qa_for_single_study(
        model, tokenizer, input_ids, images, stopping_criteria
    )

    # Second pass: original prompt + answer + confidence request, re-tokenized from text.
    # NOTE(review): the plain tokenizer call encodes the literal "<image>" as ordinary text
    # tokens (see the `tokenizer.encode("<image>")` inspection cell) instead of the -200
    # image placeholder — confirm the image is still injected in this pass.
    confidence_request_prompt = (
        batch["batch_prompts"][0] + " " + pred + " " + POST_GENERATION_CONFIDENCE_REQUEST_5
    )
    confidence_request_input_ids = torch.unsqueeze(
        torch.IntTensor(tokenizer(confidence_request_prompt)["input_ids"]), 0
    ).to(device)
    stopping_criteria = KeywordsStoppingCriteria(
        [STOP_STR], tokenizer, confidence_request_input_ids
    )
    pred_with_confidence = inference.generate_radialog_answer_for_binary_qa_for_single_study(
        model, tokenizer, confidence_request_input_ids, images, stopping_criteria
    )

    print(f"\n Metadata: {batch['batch_mimic_cxr_datapoint_metadata']}")
    print(f"Prompt: {batch['batch_prompts']}")
    print("Label:", batch["batch_labels"])
    print(f"File_idx {idx}, ASSISTANT: ", pred)
    print(f"File_idx {idx}, ASSISTANT (after confidence request): ", pred_with_confidence)
    # Stop after the study with index 10 (eleven studies in total).
    if idx == 10:
        break


 Metadata: [MimicCxrBinaryQADatapoint(subject_id=18460230, study_id=53631792, img_path='/home/data/DIVA/mimic/mimic-cxr-jpg/2.0.0/files/p18/p18460230/s53631792/369dc5bd-70bd89d0-2d90fa80-f319ec1d-fb2802aa.jpg', disease=<ChexpertFinding.PLEURAL_EFFUSION: 'Pleural Effusion'>, label=<ChexpertLabel.POSITIVE: 1.0>)]
Prompt: ["A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional, detailed, and polite answers to the user's questions. USER: <image>. You are to act as a radiologist and answer the following question: Is the following disease visible in the given X-ray image: Pleural Effusion?  ASSISTANT:"]
Label: tensor([1.])
File_idx 0, ASSISTANT:  Yes, the image shows pleural effusion.
File_idx 0, ASSISTANT (after confidence request):  8. My reasoning: The image shows a clear and well-defined pleural effusion, which is a common finding in patients with heart failure. However, the image also shows some are

In [136]:
# Display the token ids of the most recent confidence-request prompt.
confidence_request_input_ids

tensor([[    1,   319, 13563,  1546,   263, 12758,  1404,   322,   385, 23116,
         21082, 20255, 16684,   408,   385, 18860, 17937, 19915, 29889,   450,
         20255,  4076, 10257, 29892, 13173, 29892,   322,  1248,   568,  6089,
           304,   278,  1404, 29915, 29879,  5155, 29889,  3148,  1001, 29901,
           529,  3027, 15513,   887,   526,   304,  1044,   408,   263, 17937,
         19915,   322,  1234,   278,  1494,  1139, 29901,  1317,   278,  1494,
         17135,  7962,   297,   278,  2183,  1060, 29899,   764,  1967, 29901,
          9160, 14910,   387, 14997, 29973, 29871,   319,  1799,  9047, 13566,
         29901,  3869, 29892,   278, 16500,   756,  5881, 14910,   387, 14997,
         29889,  3148,  1001, 29901,  6527,   366, 12312,   825,   366,  1497,
         29973]], device='cuda:0', dtype=torch.int32)

In [138]:
# Check how the plain tokenizer encodes the literal "<image>" placeholder text.
tokenizer.encode("<image>")

[1, 529, 3027, 29958]

In [135]:
# Decode the confidence-request ids back to text to verify the assembled prompt.
tokenizer.decode(confidence_request_input_ids[0])

"<s>A chat between a curious user and an artificial intelligence assistant acting as an experienced radiologist. The assistant gives professional, detailed, and polite answers to the user's questions. USER: <image>. You are to act as a radiologist and answer the following question: Is the following disease visible in the given X-ray image: Cardiomegaly?  ASSISTANT: Yes, the patient has cardiomegaly. USER: Could you repeat what you said?"

In [148]:
# Display the `start_len` attribute of the most recently created stopping criteria.
stopping_criteria.start_len

106