# In [None]:
from data_utils import load_and_resize_images, vizualize_frames, TAGGED_CHAT_TEMPLATE
from transformers import AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration
from tqdm import tqdm
import torch
import copy
import json
import argparse
from termcolor import colored
from datasets import load_dataset

def eval_model(model, processor, test_set, verbose=False, max_new_tokens=500):
    """Generate an answer for every sample in ``test_set`` and collect them.

    Args:
        model: Generation model. Must provide ``eval()``, ``device`` and
            ``generate()``.
        processor: Processor paired with ``model``. Must provide
            ``apply_chat_template()``, be callable on ``text``/``images``
            and provide ``batch_decode()``.
        test_set: Iterable of per-sample mappings. Each sample is read for
            ``'user_message'`` and ``'answer'``; ``'images'`` and
            ``'question_text'`` are additionally read when ``verbose`` is
            true. The whole sample is also handed to
            ``load_and_resize_images``.
        verbose: When true, visualize the frames of each sample and print
            the question, the prediction and the ground truth.
        max_new_tokens: Upper bound on the number of generated tokens per
            sample.

    Returns:
        A list with one deep copy per sample, in which ``'gt'`` holds the
        sample's original ``'answer'`` and ``'answer'`` holds the model's
        predicted text.
    """
    model.eval()
    device = model.device
    predictions = []
    for idefics_sample in tqdm(test_set):
        images = load_and_resize_images(idefics_sample)
        if verbose:
            vizualize_frames(idefics_sample['images'])

        prompt = processor.apply_chat_template(
            idefics_sample['user_message'],
            add_generation_prompt=True,
            chat_template=TAGGED_CHAT_TEMPLATE,
        )
        inputs = processor(text=prompt, images=images, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate
        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

        # The returned sequences start with the prompt tokens. Decode only
        # what comes after them, so the answer is recovered in full even when
        # it spans several lines, and without depending on the literal
        # "Assistant: " prefix of the chat template.
        prompt_length = inputs["input_ids"].shape[1]
        completion_ids = generated_ids[:, prompt_length:]
        predicted_text = processor.batch_decode(
            completion_ids, skip_special_tokens=True
        )[0].strip()

        # Keep the reference answer under 'gt' and store the prediction under
        # 'answer'; the copy leaves the caller's sample untouched.
        prediction = copy.deepcopy(idefics_sample)
        prediction['gt'] = prediction['answer']
        prediction['answer'] = predicted_text
        predictions.append(prediction)
        if verbose:
            print(colored(idefics_sample['question_text'], 'blue'))
            print(colored('Predicted:', 'blue'), predicted_text)
            print(colored('GT:', 'blue'), prediction['gt'])
    return predictions


# In [None]:
# Evaluation run: load the fine-tuned checkpoint in 4-bit, predict on the
# first few test samples and dump the predictions as JSON.
#
# NOTE(review): the checkpoint and test data live under /home/cchang while the
# output path points into /home/jessica — confirm the output location is
# writable on the machine this runs on.
checkpoint_path = "/home/cchang/CS503_VisualIntelligence/thinking-fast-and-furious/experiments/idefics2/models/ignore_index_added/checkpoint-200"
test_data_path = '/home/cchang/CS503_VisualIntelligence/thinking-fast-and-furious/experiments/redcircle/data/nuscenes/test_ext_idefics_redcircle.json'
prediction_data_path = '/home/jessica/EPFL/Course/CS503-VisualIntelligence/Project/thinking-fast-and-furious/experiments/idefics2/outputs/test-eval-idefics2-8b-ignore-index-added-200step.json'

# Number of leading test samples to evaluate.
num_eval_samples = 10

processor = AutoProcessor.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    do_image_splitting=False
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
# Load the fine-tuned checkpoint with the 4-bit quantization config above.
finetuned_model = Idefics2ForConditionalGeneration.from_pretrained(
    checkpoint_path,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
)

test_idefics_dataset = load_dataset('json', data_files=test_data_path, split='train')

# Slicing a Dataset (``dataset[:10]``) returns a dict of columns, so iterating
# it would yield column names rather than samples. ``select`` keeps a Dataset
# whose iteration yields one dict per sample, which is what eval_model expects.
eval_subset = test_idefics_dataset.select(
    range(min(num_eval_samples, len(test_idefics_dataset)))
)

predictions = eval_model(finetuned_model, processor, eval_subset, verbose=True)
with open(prediction_data_path, "w", encoding="utf-8") as f:
    json.dump(predictions, f)