In [5]:
import argparse
import os
import json
import random
import re
import torch
import numpy as np
from tqdm import tqdm
import shortuuid
from glob import glob 
import pandas as pd 

from cambrian.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from cambrian.conversation import conv_templates, SeparatorStyle
from cambrian.model.builder import load_pretrained_model
from cambrian.utils import disable_torch_init
from cambrian.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from torch.utils.data import Dataset, DataLoader

from PIL import Image
import math



In [8]:

# cambrian-phi3-3b
# conv_mode = "phi3"

# cambrian-8b
#conv_mode = "llama_3" 

# cambrian-34b
#conv_mode = "chatml_direct"

# cambrian-13b
conv_mode = "vicuna_v1"

def process(image, question, tokenizer, image_processor, model_config):
    qs = question

    if model_config.mm_use_im_start_end:
        qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
    else:
        qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    
    image_size = [image.size]
    image_tensor = process_images([image], image_processor, model_config)

    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

    return input_ids, image_tensor, image_size, prompt

import torch
import numpy as np
import random
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

model_path = os.path.expanduser("nyu-visionx/cambrian-13b")
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name)

temperature = 0

00:06:47 [32mbuilder.py:119 [I][0m → Loading Cambrian from nyu-visionx/cambrian-13b




00:06:49 [32mbuilder.py:105 [I][0m → Loading **SigLIP CLIP** Vision Tower: siglip/CLIP-ViT-SO400M-14-384-interp576
00:06:49 [33mbase_encoder.py:44 [W][0m → Unfreezing MM Vision Tower: False
00:06:49 [32mbuilder.py:99 [I][0m → Loading **OpenAI CLIP** Vision Tower: openai/clip-vit-large-patch14-336-interp576
00:06:49 [33mbase_encoder.py:44 [W][0m → Unfreezing MM Vision Tower: False
00:06:49 [32mbuilder.py:116 [I][0m → Loading **DINO Vision Tower: facebook/dinov2-giant-res378-interp576
00:06:49 [33mbase_encoder.py:44 [W][0m → Unfreezing MM Vision Tower: False
00:06:49 [32mbuilder.py:111 [I][0m → Loading **ConvNeXt CLIP** Vision Tower: clip-convnext-XXL-multi-stage-interp9216
00:06:49 [33mbase_encoder.py:44 [W][0m → Unfreezing MM Vision Tower: False
00:06:52 [32mmodeling.py:799 [I][0m → We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards: 100%|██████████| 11/11 [02:43<00:00, 14.88s/it]


00:09:36 [32mfactory.py:212 [I][0m → Loaded hf-hub:timm/ViT-SO400M-14-SigLIP-384 model config.
00:09:50 [32mfactory.py:299 [I][0m → Loading pretrained hf-hub:timm/ViT-SO400M-14-SigLIP-384 weights (/home/mwf62/.cache/huggingface/hub/models--timm--ViT-SO400M-14-SigLIP-384/snapshots/ac16108d567c4389e6cd2b11c9b8585f7474435b/open_clip_pytorch_model.bin).
00:09:55 [32mmodeling.py:799 [I][0m → We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
00:10:20 [33mdino_encoder.py:89 [W][0m → Overriding DinoVisionTower image size of 518 with 378
00:10:20 [32mdino_encoder.py:98 [I][0m → Dino Vision Processor: BitImageProcessor {
  "crop_size": {
    "height": 378,
    "width": 378
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  

In [7]:
anl_set_jun16_w_extra_from_original = glob("../../datasets/anl_set_jun16_w_extra_from_original/*/*/*.jpg")
print(len(anl_set_jun16_w_extra_from_original))
# turn into dataframe 
anl_set_jun16_w_extra_from_original_df = pd.DataFrame(anl_set_jun16_w_extra_from_original, columns=["image_path"])

anl_set_jun16_w_extra_from_original_df["split"] = anl_set_jun16_w_extra_from_original_df["image_path"].apply(lambda x: x.split("/")[-3])
anl_set_jun16_w_extra_from_original_df["class"] = anl_set_jun16_w_extra_from_original_df["image_path"].apply(lambda x: x.split("/")[-2])

anl_set_jun16_w_extra_from_original_df["q0"] = "Does this image show a flooded street?"
anl_set_jun16_w_extra_from_original_df["q1"] = "Does this image show more than a foot of standing water?"
anl_set_jun16_w_extra_from_original_df["q2"] = "Is the street in this image flooded?"
anl_set_jun16_w_extra_from_original_df["q3"] = "Could a car drive through the water in this image?"
anl_set_jun16_w_extra_from_original_df["q4"] = "Does this image show a visible street?"
anl_set_jun16_w_extra_from_original_df["q5"] = "Is there any visible street in this image?"
anl_set_jun16_w_extra_from_original_df["q6"] = "Is the view from windshield in this image too obstructed?"


anl_set_jun16_w_extra_from_original_df 



5246


Unnamed: 0,image_path,split,class,q0,q1,q2
0,../../datasets/anl_set_jun16_w_extra_from_orig...,train,drivable,Can you safely drive through the water in this...,Does this image show more than a foot of stand...,Is the street in this image flooded?
1,../../datasets/anl_set_jun16_w_extra_from_orig...,train,drivable,Can you safely drive through the water in this...,Does this image show more than a foot of stand...,Is the street in this image flooded?
2,../../datasets/anl_set_jun16_w_extra_from_orig...,train,drivable,Can you safely drive through the water in this...,Does this image show more than a foot of stand...,Is the street in this image flooded?
3,../../datasets/anl_set_jun16_w_extra_from_orig...,train,drivable,Can you safely drive through the water in this...,Does this image show more than a foot of stand...,Is the street in this image flooded?
4,../../datasets/anl_set_jun16_w_extra_from_orig...,train,drivable,Can you safely drive through the water in this...,Does this image show more than a foot of stand...,Is the street in this image flooded?
...,...,...,...,...,...,...
5241,../../datasets/anl_set_jun16_w_extra_from_orig...,test,drivable,Can you safely drive through the water in this...,Does this image show more than a foot of stand...,Is the street in this image flooded?
5242,../../datasets/anl_set_jun16_w_extra_from_orig...,test,drivable,Can you safely drive through the water in this...,Does this image show more than a foot of stand...,Is the street in this image flooded?
5243,../../datasets/anl_set_jun16_w_extra_from_orig...,test,drivable,Can you safely drive through the water in this...,Does this image show more than a foot of stand...,Is the street in this image flooded?
5244,../../datasets/anl_set_jun16_w_extra_from_orig...,test,drivable,Can you safely drive through the water in this...,Does this image show more than a foot of stand...,Is the street in this image flooded?


In [9]:
for index, row in tqdm(anl_set_jun16_w_extra_from_original_df.iterrows()):

    for i in range(3):

        image_path = row["image_path"]
        image = Image.open(image_path).convert('RGB')
        question = row["q" + str(i)]

        input_ids, image_tensor, image_sizes, prompt = process(image, question, tokenizer, image_processor, model.config)
        input_ids = input_ids.to(device='cuda', non_blocking=True)
        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=image_tensor,
                image_sizes=image_sizes,
                do_sample=True if temperature > 0 else False,
                temperature=temperature,
                num_beams=1,
                max_new_tokens=512,
                use_cache=True)

        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

        anl_set_jun16_w_extra_from_original_df.loc[index, "response_" + str(i)] = outputs

        # write to csv in case of crash
        anl_set_jun16_w_extra_from_original_df.to_csv("anl_set_jun16_w_extra_from_original_df.csv")

9it [01:21,  9.11s/it]


KeyboardInterrupt: 

: 