In [17]:
from PIL import Image
import requests
import bitsandbytes
import multiprocessing as mp
from tqdm import tqdm
import glob
import os
from transformers import (
    AutoProcessor,
    LlavaConfig,
    LlavaForConditionalGeneration,
    is_torch_available,
    is_vision_available,
)
from transformers.testing_utils import require_bitsandbytes, require_torch, slow, torch_device


In [2]:
model_id = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True)
processor = AutoProcessor.from_pretrained(model_id, pad_token="<pad>")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.15it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def open_image(path):
    try:
        return Image.open(path), path
    except Exception as e:
        print(f"Error opening image {path}: {e}")
        return None

image_paths = glob.glob('/mnt/SSD1/Niantic/data/test/s00525/seq1/*.jpg')

batch_size = 10
all_images = []
for image_batch in tqdm(chunks(image_paths, batch_size), total=len(image_paths)//batch_size):
    with mp.Pool(mp.cpu_count()) as pool:
        images = pool.map(open_image, image_batch)
    images = [img for img in images if img is not None]
    all_images.append(images)


58it [00:12,  4.67it/s]                                                                                                                                       


In [5]:
len(all_images)

58

In [11]:
for image_batch_tuple in all_images[:1]:    
    prompts = [
                "USER: <image>\Write a descriptive caption for the image, highlighting the visible elements and key features.\nASSISTANT:",
            ] * len(image_batch_tuple)
    images = [image_tuple[0] for image_tuple in image_batch_tuple]
    inputs = processor(prompts, images, return_tensors="pt", padding=True)
    output = model.generate(**inputs, max_new_tokens=1024)
    caption_batch = processor.batch_decode(output, skip_special_tokens=True)
    

In [21]:
caption_path = [image_tuple[1].replace("data", "caption").rsplit(".", 1)[0] + ".txt" for image_tuple in image_batch_tuple]
for i ,caption in enumerate(caption_batch):
    os.makedirs(os.path.dirname(caption_path[i]), exist_ok=True)
    with open(caption_path[i], "w") as f:
        f.write(caption.split("\nASSISTANT: ")[-1])

['USER:  \\Write a descriptive caption for the image, highlighting the visible elements and key features.\nASSISTANT: A large gray stone sculpture is situated in a lush green garden, surrounded by bushes and trees. The sculpture is positioned in the middle of the garden, drawing attention to its artistic design. The garden is filled with various vehicles, including cars and trucks, parked around the area. The combination of the sculpture and the parked vehicles creates a unique and visually interesting scene.',
 'USER:  \\Write a descriptive caption for the image, highlighting the visible elements and key features.\nASSISTANT: The image features a large stone sculpture in the middle of a lush green garden. The sculpture is situated in the center of the scene, surrounded by a variety of green plants. The garden is filled with numerous cars parked around it, indicating that it might be a popular spot for visitors. The combination of the stone sculpture and the vibrant garden creates a se

In [2]:
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
lst = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
n = 3
k = chunks(lst,n)

In [3]:
for i in k:
    print(i)

[1, 2, 3]
[4, 5, 6]
[7, 8, 9]
[10]


In [6]:
import pandas as pd
pd.read_csv("csv/scenes_part_7.csv")

Unnamed: 0,/mnt/SSD1/Niantic/data/train/s00164
0,/mnt/SSD1/Niantic/data/train/s00165
1,/mnt/SSD1/Niantic/data/train/s00166
2,/mnt/SSD1/Niantic/data/train/s00167
3,/mnt/SSD1/Niantic/data/train/s00168
4,/mnt/SSD1/Niantic/data/val/s00494
...,...
64,/mnt/SSD1/Niantic/data/val/s00520
65,/mnt/SSD1/Niantic/data/val/s00521
66,/mnt/SSD1/Niantic/data/val/s00522
67,/mnt/SSD1/Niantic/data/val/s00523


In [7]:
import glob
train_scenes = glob.glob("/mnt/SSD1/Niantic/data/train/*")
val_scenes = glob.glob("/mnt/SSD1/Niantic/data/val/*")


scenes = train_scenes + val_scenes

In [8]:
len(scenes)

525