In [15]:
import os
import torch
import logging

# Make 'vqa-sdg' the working directory unless the kernel is already running
# from a directory with that name.
cwd_name = os.path.basename(os.getcwd())
if cwd_name != 'vqa-sdg':
    os.chdir('vqa-sdg')

In [16]:
import json
from PIL import Image
import torchvision 
from torchvision.io import read_image 
from torchvision.utils import draw_bounding_boxes 
from time import time

# Sample images, loaded as tensors with torchvision's read_image.
IMG_1 = read_image("./dataset/img/1072.jpg")
IMG_2 = read_image("./dataset/img/1159835.jpg")
IMG_3 = read_image("./dataset/img/1591896.jpg")

# JSON text is UTF-8 by specification; state the encoding explicitly so the
# load does not depend on the platform's default locale encoding.
with open("./dataset/feat/sceneGraphs.json", encoding="utf-8") as json_file:
    SCENE_GRAPH = json.load(json_file)

In [108]:
def get_filename(long_path: str) -> "tuple[str, str]":
    """Split a path into its file name and the file name without extension.

    Args:
        long_path: Path to a file; only its final component is used.

    Returns:
        A ``(filename, raw_filename)`` pair, e.g. ``("1072.jpg", "1072")``.
        Only the last extension is removed, so ``"a.tar.gz"`` yields
        ``"a.tar"`` as the raw name.
    """
    filename = os.path.basename(long_path)
    raw_filename, _extension = os.path.splitext(filename)

    return filename, raw_filename



def annotate_images(img_path, graph, num_obj=5, min_area_div=100): #TODO: Changing
    """Draw scene-graph bounding boxes on an image.

    Objects are visited in the iteration order of ``graph["objects"]``. An
    object is skipped when ``w * h * min_area_div`` is smaller than the image
    area, i.e. when its box covers less than ``1 / min_area_div`` of the image.

    Args:
        img_path: Path of the image, read with ``torchvision.io.read_image``.
        graph: Scene-graph entry of the image. ``graph["objects"]`` must map to
            dicts providing the keys ``"x"``, ``"y"``, ``"w"``, ``"h"`` and
            ``"name"``.
        num_obj: Maximum number of objects to annotate. The countdown only
            stops when it hits exactly zero, so a value of zero or less keeps
            every qualifying object.
        min_area_div: Divisor of the image area that defines the minimum box
            area an object needs to be annotated.

    Returns:
        A pair ``(annotated_imgs, complete_annot_img_tensor)``:
        ``annotated_imgs`` is a list of ``(name, PIL image)`` tuples, each
        image showing a single red box; ``complete_annot_img_tensor`` is the
        image tensor with all kept boxes drawn, one colour per box. When no
        object qualifies, the second item is an unannotated copy of the image.
    """
    COLORS = ["red", "green", "blue", "yellow", "purple", "orange", "brown", "pink", "gray", "cyan"]
    img = torchvision.io.read_image(img_path)

    img_area = img.size()[-1] * img.size()[-2]
    annotated_imgs = []
    bboxs = []
    to_pil = torchvision.transforms.ToPILImage()

    # Create up to num_obj images, each with only one annotation
    for v in graph["objects"].values():
        x, y, w, h = v["x"], v["y"], v["w"], v["h"]
        if w * h * min_area_div < img_area:
            continue

        # draw_bounding_boxes is given corner coordinates (x1, y1, x2, y2)
        bbox = [x, y, x + w, y + h]
        bboxs.append(bbox)

        img_tensor = torchvision.utils.draw_bounding_boxes(
            img, torch.tensor([bbox]), width=3, colors=["red"]
        )
        annotated_imgs.append((v["name"], to_pil(img_tensor)))

        num_obj -= 1
        if num_obj == 0:
            break

    if not bboxs:
        # torch.tensor([]) is not an (N, 4) box tensor; there is nothing to
        # draw, so hand back an untouched copy instead of calling the drawer.
        return annotated_imgs, img.clone()

    # Cycle through the palette so more than len(COLORS) boxes still get one
    # colour each (a plain slice would supply too few colours).
    box_colors = [COLORS[i % len(COLORS)] for i in range(len(bboxs))]

    # Draw all annotations on the image
    complete_annot_img_tensor = torchvision.utils.draw_bounding_boxes(
        img, torch.tensor(bboxs), width=3, colors=box_colors
    )

    return annotated_imgs, complete_annot_img_tensor



def inference_hf(
    model,
    processor,
    prompt,
    api_key=None,
    boilerplate_prompt=True,
    img_path=None,
    img_raw=None,
    max_new_tokens=1500,
    do_sample=False,
    skip_special_tokens=True,
) -> "tuple[str, float]":
    """Run one image + text generation and time it.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=..., do_sample=...)``.
            If it has a ``device`` attribute, the inputs are moved there;
            otherwise device ``0`` is used.
        processor: Callable ``processor(prompt, image, return_tensors="pt")``
            whose result supports ``.to(device, dtype)``; must also provide
            ``decode``.
        prompt: Instruction text.
        api_key: Accepted for interface compatibility; not used here.
        boilerplate_prompt: When true, wrap ``prompt`` in the
            ``USER: <image>\\n ... \\nASSISTANT:`` template and return only the
            text that follows the first ``ASSISTANT:`` marker.
        img_path: Image path, opened with PIL only when ``img_raw`` is ``None``.
        img_raw: Already-loaded image; takes precedence over ``img_path``.
        max_new_tokens: Forwarded to ``model.generate``.
        do_sample: Forwarded to ``model.generate``.
        skip_special_tokens: Forwarded to ``processor.decode``.

    Returns:
        ``(text, seconds)``. On success ``text`` is the decoded output followed
        by a newline. If the image cannot be opened, ``text`` is the error
        message, so both paths return the same tuple shape.
    """
    start_time = time()
    if img_raw is None:
        try:
            img_raw = Image.open(img_path)
        except Exception as e:
            # Best effort: report the failure as the output text, but keep the
            # (text, seconds) shape so callers can always unpack the result.
            return str(e), time() - start_time

    if boilerplate_prompt:
        prompt = "USER: <image>\n" + prompt + "\nASSISTANT:"

    # Follow the model's device instead of assuming GPU 0.
    device = getattr(model, "device", 0)
    inputs = processor(prompt, img_raw, return_tensors="pt").to(device, torch.float16)
    raw_output = model.generate(
        **inputs, max_new_tokens=max_new_tokens, do_sample=do_sample
    )
    output = processor.decode(raw_output[0], skip_special_tokens=skip_special_tokens)

    if boilerplate_prompt:
        # Keep only what follows the first marker. If the marker is absent the
        # decoded text is returned unchanged rather than raising.
        _, marker, answer = output.partition("ASSISTANT:")
        if marker:
            # Drop the single separator character after the colon, but never a
            # real answer character.
            output = answer[1:] if answer[:1].isspace() else answer

    seconds = time() - start_time

    return output + "\n", seconds

In [109]:
# TODO: Change
def nonvis_inference_runner(
    model,
    processor,
    prompt_primary,
    img_path,
    scene_graph,
    runner_config: dict,
    prompt_inter="",
):
    """Generate one output per annotated object of an image.

    The image is annotated with ``annotate_images`` (at most
    ``runner_config["pair_num"]`` objects); each single-box image is then sent
    to ``inference_hf`` with ``prompt_primary`` formatted with the object's
    1-based ``number`` and its ``name``.

    Args:
        model: Passed through to ``inference_hf``.
        processor: Passed through to ``inference_hf``.
        prompt_primary: Template with ``{number}`` and ``{name}`` placeholders.
        img_path: Path of the image; its extension-less file name is the key
            looked up in ``scene_graph``.
        scene_graph: Mapping from image id to that image's scene graph.
        runner_config: Must contain ``"pair_num"``.
        prompt_inter: Accepted for interface compatibility; not used here.

    Returns:
        A 5-tuple ``(outputs, total_seconds, "", 0, meta)``: the generated
        texts joined by newlines, the summed inference time, two fixed
        placeholder values, and ``meta`` holding the image ``"id"`` and the
        fully annotated image under ``"complete_tensor"``.
    """
    print("nonvis_inference_runner", runner_config['pair_num'])
    img_id_ext, img_id = get_filename(img_path)
    raw_objs, complete_annot_tensor = annotate_images(
        img_path, scene_graph[img_id], num_obj=runner_config["pair_num"]
    )

    logging.info(f"[{img_id_ext}] - Nonvis Inference started...")

    outs = []
    total_sec = 0

    for i, (obj_name, obj_img) in enumerate(raw_objs):
        out, sec = inference_hf(
            model,
            processor,
            prompt_primary.format(number=i + 1, name=obj_name),
            img_raw=obj_img,
        )
        outs.append(out)
        total_sec += sec

    # Report the accumulated time; the per-iteration `sec` covers only the
    # last object and does not exist when no object was annotated.
    logging.info(f"[{img_id_ext}] - Nonvis Inference finished ({total_sec}s)")

    return "\n".join(outs), total_sec, "", 0, {"id": img_id ,"complete_tensor": complete_annot_tensor}

In [110]:
import torch
from PIL import Image
import random
import numpy as np
import torch
from transformers import LlavaForConditionalGeneration, VipLlavaForConditionalGeneration, AutoModel, AutoProcessor, AutoModelForCausalLM, AutoModelForPreTraining

def set_seed(seed: int) -> None:
    """
    Seed the random number generators for reproducible runs.

    The same value is handed, in order, to Python's ``random`` module, to
    ``np.random.seed`` and to ``torch.manual_seed``.

    Args:
    - seed (int): The seed value.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


def load_model(model_path, model_family, low_cpu_mem_usage, device = "cuda", seed = 42):
    """Load a model and its processor for the given model family.

    Args:
        model_path: Identifier or path passed to ``from_pretrained``.
        model_family: Key selecting the loader class (``"llava"``,
            ``"vip_llava"``, ``"auto"``, ``"llama"`` or ``"llava-1.6"``). Any
            family containing ``"openai"`` skips loading entirely.
        low_cpu_mem_usage: Forwarded to ``from_pretrained``.
        device: Device the loaded model is moved to.
        seed: Seed applied through ``set_seed`` before loading.

    Returns:
        ``(model, processor)``; both are ``None`` for ``"openai"`` families.

    Raises:
        KeyError: If ``model_family`` is neither an ``"openai"`` family nor
            one of the supported keys.
    """
    MODEL_LOADER_DICT = {
        "llava" : LlavaForConditionalGeneration,
        "vip_llava": VipLlavaForConditionalGeneration,
        "auto": AutoModel,
        "llama": AutoModelForCausalLM,
        "llava-1.6": AutoModelForPreTraining
    }

    model, processor = None, None
    if "openai" not in model_family:
        # Validate first so a typo fails with a useful message and before the
        # global RNG state is changed.
        if model_family not in MODEL_LOADER_DICT:
            raise KeyError(
                f"Unknown model_family {model_family!r}; "
                f"expected one of {sorted(MODEL_LOADER_DICT)}"
            )

        set_seed(seed)
        model = MODEL_LOADER_DICT[model_family].from_pretrained(
            model_path,
            torch_dtype = torch.float16,
            low_cpu_mem_usage = low_cpu_mem_usage
        ).to(device)
        processor = AutoProcessor.from_pretrained(model_path)

    print(f"Loaded {model_path}")

    return model, processor


# Model, Processor

In [25]:
# Load the checkpoint through the "vip_llava" loader; device and seed keep
# their defaults.
model, processor = load_model(
    model_path="llava-hf/vip-llava-13b-hf",
    model_family="vip_llava",
    low_cpu_mem_usage=1,
)

Loading checkpoint shards: 100%|██████████| 6/6 [00:00<00:00,  8.11it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded llava-hf/vip-llava-13b-hf


# Prompt

In [111]:
# Prompt without any object reference. Placeholder: {number} (how many
# question-answer pairs to ask for).
base_prompt = """
Generate questions that involves complex reasoning with an equal mix of 'WHAT,' 'WHERE', and 'WHO'. For each question, generate a short factoid answer and long answer that contains the reason of short answer pick.
    
What are {number} possible questions-answers about the image?

Present the question-answer pairs in format:
1. <QUESTION-1>
S. <SHORT-ANSWER-1>
L. <LONG-ANSWER-1>
<<line-break>>
"""

# Prompt that refers to an object by the colour of its bounding box.
# Placeholders: {number}, {name}, {color}.
vit_prompt = """
Generate {number} questions that involves complex reasoning using about {name} object
in the {color} bounding box. 

Present the question-answer-reasons in format:
1. <QUESTION>
S. <SHORT-ANSWER>
L. <LONG-ANSWER>
"""

# Prompt that refers to the object in "the bounding box" without a colour.
# Placeholders: {name}, and {number}, which becomes the item index of the
# requested output format.
vit_base_prompt = """
Generate a question that involves complex reasoning using about {name} object in the bounding box. 
For each question, generate a short factoid answer and long answer that contains the reason of short answer pick.

Present the question-answer-reasons in format:
{number}. <your-question>
S. <your-short-answer>
L. <your-long-answer>
"""

# Image, Graph, Runner Config

In [112]:
# Image under test; the path is derived from the numeric id.
IMG_ID = 1159835
IMG_PATH = f"./dataset/img/{IMG_ID}.jpg"

# JSON text is UTF-8 by specification; state the encoding explicitly so the
# load does not depend on the platform's default locale encoding.
with open("./dataset/feat/sceneGraphs.json", encoding="utf-8") as json_file:
    SCENE_GRAPH = json.load(json_file)

# "pair_num" is read by nonvis_inference_runner as the maximum number of
# objects to annotate.
runner_config = {
    "pair_num": 3,
    "is_multistep": 0
}

In [113]:
# Runner used for this experiment; kept behind an alias so the call below
# does not name a specific runner.
inference_runner = nonvis_inference_runner

# Keyword arguments follow the runner's own parameter order.
primary_out, primary_sec, inter_out, inter_sec, meta = inference_runner(
    model=model,
    processor=processor,
    prompt_primary=vit_base_prompt,
    img_path=IMG_PATH,
    scene_graph=SCENE_GRAPH,
    runner_config=runner_config,
    prompt_inter="",
)

nonvis_inference_runner 3


In [117]:
def save_annotated_img(tensor, path):
    """Convert an image tensor to a PIL image and save it at ``path``."""
    to_pil = torchvision.transforms.ToPILImage()
    to_pil(tensor).save(path)

In [126]:
from tqdm import tqdm
out = ""

# NOTE(review): `names`, `colors` and `img` are not defined in the cells
# visible here; confirm they are created earlier so Restart & Run All works.
COL = list(range(len(names)))

for obj_i in tqdm(COL):
    # inference_hf returns (text, seconds); only the text is appended, since
    # adding a str to the tuple itself raises TypeError.
    vit_out, vit_sec = inference_hf(
        model, processor,
        vit_prompt.format(
            number = 1,
            name = names[obj_i],
            color = colors[obj_i]
        ),
        img_raw = img
    )
    out += vit_out + "------------------------------------\n"

100%|██████████| 7/7 [00:27<00:00,  3.94s/it]


In [127]:
# Show every generated block; blocks are separated by a dashed line.
print(out)

1. What is the purpose of the door within red bounding box?
S. To provide access to the outside.
L. The door within red bounding box is a traditional wooden door that likely leads to a porch or deck, providing a means for people to enter and exit the house.
------------------------------------
1. What is the significance of the ornament within the orange bounding box on the Christmas tree within the red rectangle?
S. It is a decoration.
L. The ornament within the orange bounding box is a decorative item that is commonly used to adorn Christmas trees within the red rectangle during the holiday season. It adds a festive touch to the tree and is often chosen for its color, shape, or theme.
------------------------------------
1. What is the name of the movie playing on the television within blue bounding box?
S. Star Wars
L. The movie playing on the television within blue bounding box is Star Wars, as indicated by the visible logo and the spacecraft visible on the screen.
----------------