In [1]:
import wandb

with open("/mnt/nlpdata1/home/ismayilz/.wandb.key", "r") as f:
    wandb_key = f.read().strip()
    
wandb.login(key=wandb_key)
wandb.init(
    project="thinking-fast-and-furious",
)
wandb_name=wandb.run.name

ModuleNotFoundError: No module named 'wandb'

In [1]:
import torch
from peft import LoraConfig
from transformers import AutoProcessor, BitsAndBytesConfig, VipLlavaForConditionalGeneration
from transformers.image_utils import load_image
import json
import matplotlib.pyplot as plt
from PIL import Image, ImageOps, ImageDraw
from typing import Dict
from tqdm import tqdm
import copy

DEVICE = "cuda:0"
USE_LORA = False
USE_QLORA = False

In [2]:
IMAGE_DIR = '/mnt/nlpdata1/home/ismayilz/cs503-project/data/train/nuscenes/samples'
IMAGE_PATH_PREFIX = '../nuscenes/samples'
IMAGE_SRC_X, IMAGE_SRC_Y = 1600, 900
IMAGE_TGT_X, IMAGE_TGT_Y = int(IMAGE_SRC_X / 2.5), int(IMAGE_SRC_Y / 2.5)

train_data_path = '/mnt/nlpdata1/home/ismayilz/cs503-project/data/train/nuscenes/v1_1_train_nus_ext.json'
test_data_path = '/mnt/nlpdata1/home/ismayilz/cs503-project/thinking-fast-and-furious/drivelm/challenge/test_eval.json'
vipllava_train_data_path = '/mnt/nlpdata1/home/ismayilz/cs503-project/thinking-fast-and-furious/experiments/vip-llava/data/nuscenes/train_vip_llava.json'
vipllava_test_data_path = '/mnt/nlpdata1/home/ismayilz/cs503-project/thinking-fast-and-furious/experiments/vip-llava/data/nuscenes/test_vip_llava.json'
# checkpoint_dir = f"/home/cchang/CS503_VisualIntelligence/thinking-fast-and-furious/baseline/experiments/eea/models/idefics_redcircle/{wandb_name}"
# checkpoint_dir = "/mnt/nlpdata1/home/ismayilz/cs503-project/models/idefics2-redcircle-prime-music-8-500step/checkpoint-500"
# checkpoint_dir = "/mnt/nlpdata1/home/ismayilz/cs503-project/models/idefics2-redcircle-fearless-morning-10-1650step/checkpoint-1650"
checkpoint_dir = "llava-hf/vip-llava-7b-hf"

In [3]:
processor = AutoProcessor.from_pretrained(
    checkpoint_dir,
    do_image_splitting=False
)


# Three options for training, from the lowest precision training to the highest precision training:
# - QLora
# - Standard Lora
# - Full fine-tuning
if USE_QLORA or USE_LORA:
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        # target_modules='.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$',
        use_dora=False if USE_QLORA else True,
        init_lora_weights="gaussian"
    )
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16
        )
    model = VipLlavaForConditionalGeneration.from_pretrained(
        checkpoint_dir,
        torch_dtype=torch.float16,
        quantization_config=bnb_config if USE_QLORA else None,
    )
    model.add_adapter(lora_config)
    model.enable_adapters()
else:
    model = VipLlavaForConditionalGeneration.from_pretrained(
        checkpoint_dir,
        torch_dtype=torch.float16,
        # _attn_implementation="flash_attention_2", # Only available on A100 or H100
    ).to(DEVICE)



preprocessor_config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/70.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

In [4]:
import pathlib

def vizualize_frames(image_paths):
    y_view_mapping = {"MIDDLE": 1, "LEFT": 0, "RIGHT": 2}
    fig, axes = plt.subplots(2, 3, figsize=(48, 18))
    for i, (image_view, image_path) in enumerate(image_paths.items()):
        # image = Image.open(image_path)
        image=copy.deepcopy(image_path)
        _, x, y = f"{image_view}_MIDDLE".split("_")[:3]
        x_id = int(x == 'BACK')
        axes[x_id][y_view_mapping[y]].imshow(image)
        axes[x_id][y_view_mapping[y]].set_title(image_view)
        axes[x_id][y_view_mapping[y]].axis('off')
    plt.show()
    
def process_scene(scene_id, scene):
    samples = []
    for frame_id, frame in scene['key_frames'].items():
        image_paths = {view_name: view_path.replace(IMAGE_PATH_PREFIX, IMAGE_DIR) for view_name, view_path in frame['image_paths'].items()}
        assert len(image_paths) == 6, "not all views provided"
        question_id = 0
        for question_type, questions in frame['QA'].items():
            for question_info in questions:
                question = question_info['Q']
                answer = question_info['A'] if "A" in question_info else ""
                sample_id = f"{scene_id}_{frame_id}_{question_id}"
                question_id += 1
                question_text = question
                samples.append({
                    "id": sample_id, #change key here from sample_id to id
                    "question_type": question_type,
                    "question_text": question_text,
                    "images": image_paths,
                    "answer": answer,
                    "tag": question_info["tag"]
                })
    return samples


def process_dataset(data_path, output_path=None):
    with open(data_path, "r") as f:
        dataset: Dict[str, str] = json.load(f)
    samples = []
    for scene_id, scene in tqdm(dataset.items()):
        samples.extend(process_scene(scene_id, scene))
    if output_path:
        pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w") as f:
            json.dump(samples, f, indent=4)
    return samples

import re
def objects_to_dict(question):
    # get all objects in the question
    objects = re.findall(r'<[^>]*>', question)
    unique_objects = list(set(objects))
    result = {}
    for obj in unique_objects:
        # Remove '<' and '>' and split by comma
        parts = obj.strip('<>').split(',')
        # The identifier seems to be the second element based on your example
        identifier = parts[1]
        # Coordinates are the last two elements
        coordinates = [float(parts[2]), float(parts[3])]
        # Check if the identifier already exists in the dictionary
        if identifier in result:
            # Append the new coordinates to the existing list
            result[identifier].append(coordinates)
        else:
            # Otherwise, create a new list with the coordinates
            result[identifier] = [coordinates]
    # result will look like {'CAM_BACK': [[1088.3, 497.5]], 'CAM_FRONT': [[1043.2, 82.2]]}
    return result

def draw_circle(image_path,image_key, objects, colors=["red"]):
    image = load_image(image_path)
    assert len(objects) <= len(colors)

    if image_key in objects.keys() and bool(objects):
        for coordinate, color in zip(objects[image_key], colors):
            draw = ImageDraw.Draw(image)
            # Define the radius of the circle and the color
            # Base on paper: we draw red circles over the images, with radius r = 0.06H and thickness t = 0.01H, where H is the shorter side of the image.
            H= min(image.size)
            radius = 0.06 * H
            thickness = 0.01 * H
            x = float(coordinate[0])
            y = float(coordinate[1])
            # Calculate the bounding box of the circle to be drawn
            left_up_point = (int(x - radius), int(y - radius))
            right_down_point = (int(x + radius), int(y + radius))
            draw.ellipse([left_up_point, right_down_point], outline=color, fill=None, width=int(thickness))
            #for checking center
            # radius_center=10
            # left_up_point = (int(x - radius_center), int(y - radius_center))
            # right_down_point = (int(x + radius_center), int(y + radius_center))
            # draw.ellipse([left_up_point, right_down_point],fill='blue')

    return image

def construct_for_viz(image_paths,images):
    for i,key in enumerate(image_paths.keys()):
        image_paths[key]=images[i]
    return image_paths

def eval_model(model, test_set, apply_visual_cue=True, apply_vb=True, verbose=False):
    predictions = []
    for llava_sample in tqdm(test_set):
        image_paths = llava_sample['images']

        if apply_visual_cue:
            objects = objects_to_dict(llava_sample['question_text'])
            # images = [load_image(image_path).resize((IMAGE_TGT_X, IMAGE_TGT_Y)) for image_path in image_paths.values()]
            colors = ["red", "blue", "black", "white"]
            images = [draw_circle(image_paths[image_key], image_key, objects, colors=colors).resize((IMAGE_TGT_X, IMAGE_TGT_Y)) for image_key in image_paths.keys()]
        else:
            images = [load_image(image_path).resize((IMAGE_TGT_X, IMAGE_TGT_Y)) for image_path in image_paths.values()]

        question = llava_sample['question_text']
        image_tokens = " ".join(["<image>"] * len(images))
        prompt = f"A chat between a human driver and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: {image_tokens}\n{question}\n###Assistant:"

        if apply_vb:
            raw_objects = re.findall(r'<[^>]*>', llava_sample['question_text'])
            for object, color in zip(raw_objects, colors[:len(objects)]):
                prompt = prompt.replace(object, f"the object marked with {color} circle")
                prompt = prompt.replace("object the object", "the object")
        
        inputs = processor(text=prompt, images=images, return_tensors="pt")
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

        if verbose:
            image_viz = construct_for_viz(copy.deepcopy(image_paths),images)
            vizualize_frames(image_viz)
            print('objects:',objects)

        # Generate
        generated_ids = model.generate(**inputs, max_new_tokens=500)
        generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
    
        predicted_text = generated_texts[0].split('\n')[-1][len("###Assistant:"):].strip()
        # predicted_text = generated_texts[0]
        prediction = copy.deepcopy(llava_sample)
        prediction['gt'] = prediction['answer']
        prediction['answer'] = predicted_text
        predictions.append(prediction)
        if verbose:
            print(prompt)
            print('Predicted:', predicted_text)
            print('GT:', prediction['gt'])
    return predictions

In [5]:
# train_dataset = process_dataset(train_data_path)
test_dataset = process_dataset(test_data_path)

# pathlib.Path(vipllava_test_data_path).parent.mkdir(parents=True, exist_ok=True)
# with open(vipllava_test_data_path, "w") as f:
#     json.dump(test_dataset, f, indent=4)

100%|██████████| 2/2 [00:00<00:00, 10894.30it/s]


In [8]:
zero_shot_predictions = eval_model(model, test_dataset, apply_visual_cue=True, apply_vb=True)

100%|██████████| 66/66 [01:43<00:00,  1.57s/it]


In [7]:
zero_shot_path = "/mnt/nlpdata1/home/ismayilz/cs503-project/thinking-fast-and-furious/experiments/vip-llava/outputs/test-eval-vip-llava-7b-zero-shot-visual-cue-vb.json"
pathlib.Path(zero_shot_path).parent.mkdir(parents=True, exist_ok=True)
with open(zero_shot_path, "w") as f:
    json.dump(zero_shot_predictions, f, indent=4)