### BLIP 

In [3]:
import os
import csv
from PIL import Image
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# Default instruction text used as the `question` default of generate_description
# in this cell. Adjacent string literals are concatenated into one string.
SYSTEM_PROMPT = (
        "You are an advanced AI trained to generate highly detailed descriptions of images. "
        "Use multi-step reasoning to identify all objects in the image, their relative positions, "
        "and distances. Describe the scene as if explaining it to someone who cannot see it. "
        "Be precise and include relevant spatial relationships."
        # Disabled: an "<image>" placeholder is not appended to this prompt.
        # "<image>"
    )

# Hugging Face model id; its second path component is also used to build the
# default output CSV name in process_images.
MODEL_HF = "Salesforce/blip-image-captioning-base"
# Directory passed as `cache_dir` when loading pretrained artifacts.
CACHE_DIR = "cache"

# Release cached CUDA memory before loading a model in this cell.
torch.cuda.empty_cache()

def generate_description(image_path, model, processor, device, question=SYSTEM_PROMPT):
    """Generate a caption for a single image with a BLIP model.

    Args:
        image_path: Path of the image file to caption.
        model: Loaded generation model; its ``generate`` method is called.
        processor: Processor used to build the model inputs and to decode the
            generated token ids.
        device: Device the processed inputs are moved to before generation.
        question: Prompt text forwarded to the processor. Defaults to
            ``SYSTEM_PROMPT``.

    Returns:
        The first decoded sequence, with special tokens removed and
        surrounding whitespace stripped.
    """
    # Close the file handle deterministically; convert() returns an
    # independent, fully loaded RGB image that outlives the `with` block.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Forward the caller-supplied prompt (the argument used to be ignored in
    # favour of the module-level constant).
    # NOTE(review): BlipProcessor documents `text` as its prompt keyword;
    # confirm that `question` is actually consumed and not silently dropped.
    inputs = processor(images=image, question=question, return_tensors="pt").to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=100)

    description = processor.batch_decode(output, skip_special_tokens=True)[0]
    return description.strip()

def process_images(root_folder="img/bathroom", output_csv="data/ground_truth-"+MODEL_HF.split("/")[1]+".csv"):
    """Caption every image under ``root_folder`` with BLIP and save a CSV.

    The folder is walked recursively. Each file whose name ends with one of
    the accepted image extensions is captioned; a failure on one image is
    printed and that image is skipped. The CSV has a ``path,description``
    header followed by one row per successfully captioned image.

    Args:
        root_folder: Directory tree to scan for images.
        output_csv: Destination CSV path. Missing parent directories are
            created.
    """
    image_extensions = (".png", ".jpg", ".jpeg", ".bmp", ".gif")

    # Create the destination directory up front so a missing folder cannot
    # discard the captions after the whole inference pass has finished.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = BlipProcessor.from_pretrained(MODEL_HF, cache_dir=CACHE_DIR)
    # Use the same cache directory as the processor so both artifacts are
    # stored in one place.
    model = BlipForConditionalGeneration.from_pretrained(MODEL_HF, cache_dir=CACHE_DIR).to(device).eval()

    data = []

    for subdir, _, files in os.walk(root_folder):
        for file in files:
            if not file.lower().endswith(image_extensions):
                continue
            image_path = os.path.join(subdir, file)
            # Best effort: one unreadable image must not abort the whole run.
            try:
                description = generate_description(image_path, model, processor, device)
                data.append([image_path, description])
                print(f"Processed: {image_path}")
            except Exception as e:
                print(f"Error processing {image_path}: {e}")

    with open(output_csv, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["path", "description"])
        writer.writerows(data)

    print(f"Descriptions saved to {output_csv}")

# Entry point: run the BLIP captioning pass with its default folder and CSV path.
if __name__ == "__main__":
    process_images()


Processed: img/bathroom/IMG_1559.jpg
Processed: img/bathroom/dscn2018.jpg
Processed: img/bathroom/p8270261.jpg
Processed: img/bathroom/IMG_3211.jpg
Processed: img/bathroom/interior016.jpg
Processed: img/bathroom/b8.jpg
Processed: img/bathroom/indoor_0196.jpg
Processed: img/bathroom/indoor_0226.jpg
Processed: img/bathroom/indoor_0034.jpg
Processed: img/bathroom/room508.jpg
Processed: img/bathroom/IMG_1341.jpg
Processed: img/bathroom/room471.jpg
Processed: img/bathroom/room315.jpg
Processed: img/bathroom/IMG_9647.jpg
Processed: img/bathroom/IMG_2437.jpg
Processed: img/bathroom/dublin___apartamento___29_03_2007_006.jpg
Processed: img/bathroom/IMG_2439.jpg
Processed: img/bathroom/room318.jpg
Processed: img/bathroom/bathroom35.jpg
Processed: img/bathroom/b12.jpg
Processed: img/bathroom/n190011.jpg
Processed: img/bathroom/indoor_0566.jpg
Processed: img/bathroom/IMG_0076.jpg
Processed: img/bathroom/indoor_0124.jpg
Processed: img/bathroom/indoor_0391.jpg
Processed: img/bathroom/img_0018.jpg
Pr

In [None]:
from ground_truth_gen import load_model, save_results

# BLIP run driven by the helpers from ground_truth_gen.
# NOTE(review): BlipProcessor and BlipForConditionalGeneration are not imported
# in this cell; they must already be in scope from an earlier cell.
model_name = "Salesforce/blip-image-captioning-base"
processor, model = load_model(model_name, BlipProcessor, BlipForConditionalGeneration)  # Adjust if BLIP needs different processor/model classes

# Keyword arguments intended for generation: beam search with 5 beams,
# sampling disabled, up to 512 new tokens.
generation_kwargs = {
    "max_new_tokens": 512,
    "num_beams": 5,
    "do_sample": False
}

image_dir = "img/test_blip"
output_csv = "data/ground_truth_blip-base.csv"

# NOTE(review): `process_images` is not among the names imported from
# ground_truth_gen above, and the process_images functions defined in this
# notebook accept (root_folder, output_csv) only. Confirm which function this
# five-argument call is meant to reach.
results = process_images(image_dir, model, processor, generation_kwargs, output_csv)
save_results(results, output_csv)

### LLAVA-1.5-7B

In [None]:
import os
import csv
from PIL import Image
from transformers import LlavaProcessor, LlavaForConditionalGeneration, AutoTokenizer
import torch


# Hugging Face model id; its second path component is also used to build the
# default output CSV name in process_images.
MODEL_HF = "llava-hf/llava-1.5-7b-hf"
# Directory passed as `cache_dir` when loading pretrained artifacts.
CACHE_DIR = "cache"

# Release cached CUDA memory before loading a model in this cell.
torch.cuda.empty_cache()

def generate_description(image_path, model, processor):
    """Describe one image with a LLaVA model using a fixed prompt.

    Args:
        image_path: Path of the image file to describe.
        model: Loaded generation model; inputs are moved to ``model.device``.
        processor: Processor used to build the model inputs and to decode the
            generated token ids.

    Returns:
        The decoded text that follows the last ``<<SYS>>`` marker (or the
        whole decoded text when the marker is absent), stripped of
        surrounding whitespace.
    """
    system_prompt = (
        "<<SYS>>\n"
        "Please describe the image shown regarding spatial relationships between objects and their colors in great detail:"
        "<image>"
        "<<SYS>>\n"
    )

    # Close the file handle deterministically; convert() returns an
    # independent, fully loaded RGB image that outlives the `with` block.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Inputs must live on the same device as the model.
    inputs = processor(images=image, text=system_prompt, return_tensors="pt", padding=True).to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=100)

    description = processor.batch_decode(output, skip_special_tokens=True)[0]

    # Keep only the text after the last prompt marker. split() returns the
    # whole string as its single element when the marker is absent, so no
    # membership check is needed.
    return description.split("<<SYS>>")[-1].strip()


def process_images(root_folder="img/test", output_csv="data/ground_truth-"+MODEL_HF.split("/")[1]+".csv"):
    """Describe every image under ``root_folder`` with LLaVA and save a CSV.

    The folder is walked recursively. Each file whose name ends with one of
    the accepted image extensions is described; a failure on one image is
    printed and that image is skipped. The CSV has a ``path,description``
    header followed by one row per successfully described image.

    Args:
        root_folder: Directory tree to scan for images.
        output_csv: Destination CSV path. Missing parent directories are
            created.
    """
    image_extensions = (".png", ".jpg", ".jpeg", ".bmp", ".gif")

    # Create the destination directory up front so a missing folder cannot
    # discard the descriptions after the whole inference pass has finished.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = LlavaProcessor.from_pretrained(MODEL_HF, cache_dir=CACHE_DIR)
    # Use the same cache directory as the processor so both artifacts are
    # stored in one place.
    model = LlavaForConditionalGeneration.from_pretrained(MODEL_HF, cache_dir=CACHE_DIR).to(device)
    model = model.eval()
    data = []

    for subdir, _, files in os.walk(root_folder):
        for file in files:
            if not file.lower().endswith(image_extensions):
                continue
            image_path = os.path.join(subdir, file)
            # Best effort: one unreadable image must not abort the whole run.
            try:
                description = generate_description(image_path, model, processor)
                data.append([image_path, description])
                print(f"Processed: {image_path}")
            except Exception as e:
                print(f"Error processing {image_path}: {e}")

    with open(output_csv, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["path", "description"])
        writer.writerows(data)

    print(f"Descriptions saved to {output_csv}")

# Entry point: run the LLaVA description pass with its default folder and CSV path.
if __name__ == "__main__":
    process_images()


In [None]:
from ground_truth_gen import load_model, save_results

# LLaVA-1.5 run driven by the helpers from ground_truth_gen.
# NOTE(review): LlavaProcessor and LlavaForConditionalGeneration are not
# imported in this cell; they must already be in scope from an earlier cell.
model_name = "llava-hf/llava-1.5-7b-hf"
processor, model = load_model(model_name, LlavaProcessor, LlavaForConditionalGeneration)

# Keyword arguments intended for generation: a single beam with sampling
# disabled, up to 1024 new tokens.
generation_kwargs = {
    "max_new_tokens": 1024,
    "num_beams": 1,
    "do_sample": False
}

image_dir = "img/test_llava"
# NOTE(review): this name uses "ground_truth_" (underscore), while the
# SpaceMantis cells read 'data/ground_truth-llava-1.5-7b-hf.csv' (hyphen).
# Confirm which file downstream cells are expected to consume.
output_csv = "data/ground_truth_llava-1.5-7b-hf.csv"

# NOTE(review): `process_images` is not among the names imported from
# ground_truth_gen above, and the process_images functions defined in this
# notebook accept (root_folder, output_csv) only. Confirm which function this
# five-argument call is meant to reach.
results = process_images(image_dir, model, processor, generation_kwargs, output_csv)
save_results(results, output_csv)

### SpaceLLAVA-13B &#9746;

In [None]:
import os
import csv
from PIL import Image
import torch
from transformers import LlavaProcessor, LlavaForConditionalGeneration, AutoFeatureExtractor


# Hugging Face model id; its second path component is also used to build the
# default output CSV name in process_images.
MODEL_HF = "remyxai/SpaceLLaVA"
# Directory passed as `cache_dir` when loading pretrained artifacts.
CACHE_DIR = "cache"

def generate_description(image_path, model, processor):
    """Describe one image with SpaceLLaVA using a fixed prompt.

    The raw RGB image is handed straight to ``processor``. Earlier code
    loaded an ``AutoFeatureExtractor`` on every call and then passed its
    output tensor through the processor again, which repeated the model
    download/lookup per image and preprocessed the pixels twice.

    Args:
        image_path: Path of the image file to describe.
        model: Loaded generation model; inputs are moved to ``model.device``.
        processor: Processor used to build the model inputs (text and image)
            and to decode the generated token ids.

    Returns:
        The first decoded sequence, with special tokens removed and
        surrounding whitespace stripped.
    """
    system_prompt = (
        "<<SYS>>\n"
        "You are an advanced AI trained to generate highly detailed descriptions of images.\n"
        "Use multi-step reasoning to identify all objects in the image, their relative positions, "
        "and distances. Describe the scene as if explaining it to someone who cannot see it.\n"
        "Be precise and include relevant spatial relationships.\n"
        "Describe the following image:\n"
        "<<SYS>>\n"
        "<image>"
    )

    # Close the file handle deterministically; convert() returns an
    # independent, fully loaded RGB image that outlives the `with` block.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # Let the processor do the image preprocessing exactly once.
    inputs = processor(images=image, text=system_prompt, return_tensors="pt").to(model.device)

    # NOTE(review): assumes the processor output exposes `pixel_values`, as
    # the LLaVA processor documentation describes; confirm for this checkpoint.
    print(f"Processed image with shape: {inputs['pixel_values'].shape}")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=256)

    # Decode the generated description
    description = processor.batch_decode(output, skip_special_tokens=True)[0]
    return description.strip()

def process_images(root_folder="img/test", output_csv="data/ground_truth-"+MODEL_HF.split("/")[1]+".csv"):
    """Describe every image under ``root_folder`` with SpaceLLaVA and save a CSV.

    The folder is walked recursively. Each file whose name ends with one of
    the accepted image extensions is described; a failure on one image is
    printed and that image is skipped. The CSV has a ``path,description``
    header followed by one row per successfully described image.

    Args:
        root_folder: Directory tree to scan for images.
        output_csv: Destination CSV path. Missing parent directories are
            created.
    """
    image_extensions = (".png", ".jpg", ".jpeg", ".bmp", ".gif")

    # Create the destination directory up front so a missing folder cannot
    # discard the descriptions after the whole inference pass has finished.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load processor and model from the shared cache directory.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = LlavaProcessor.from_pretrained(MODEL_HF, cache_dir=CACHE_DIR)
    model = LlavaForConditionalGeneration.from_pretrained(MODEL_HF, cache_dir=CACHE_DIR).to(device)
    # model.eval() is intentionally left disabled here, as in the original cell.
    #model.eval()

    data = []
    for subdir, _, files in os.walk(root_folder):
        for file in files:
            if not file.lower().endswith(image_extensions):
                continue
            image_path = os.path.join(subdir, file)
            # Best effort: one unreadable image must not abort the whole run.
            try:
                description = generate_description(image_path, model, processor)
                data.append([image_path, description])
                print(f"Processed: {image_path}")
            except Exception as e:
                print(f"Error processing {image_path}: {e}")

    with open(output_csv, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["path", "description"])
        writer.writerows(data)

    print(f"Descriptions saved to {output_csv}")

# Entry point: run the SpaceLLaVA description pass with its default folder and CSV path.
if __name__ == "__main__":
    process_images()


### SpaceMantis

In [20]:
import torch
from PIL import Image
from models.mllava import MLlavaProcessor, LlavaForConditionalGeneration, chat_mllava
import pandas as pd

# Directory passed as `cache_dir` when loading pretrained artifacts.
CACHE_DIR = "cache"

# Load the SpaceMantis processor and model once for the whole cell; both are
# read as globals by run_inference below.
attn_implementation = None  # or "flash_attention_2"
processor = MLlavaProcessor.from_pretrained("remyxai/SpaceMantis", cache_dir=CACHE_DIR)
# Half-precision weights placed on the CUDA device via device_map.
model = LlavaForConditionalGeneration.from_pretrained("remyxai/SpaceMantis", cache_dir=CACHE_DIR, device_map="cuda", torch_dtype=torch.float16, attn_implementation=attn_implementation)

# Keyword arguments forwarded to chat_mllava: a single beam with sampling
# disabled, up to 1024 new tokens.
generation_kwargs = {
    "max_new_tokens": 1024,
    "num_beams": 1,
    "do_sample": False
}

# Function to run inference
def run_inference(image_path, content):
    """Ask SpaceMantis a single question about one image.

    Uses the module-level ``model``, ``processor`` and ``generation_kwargs``.

    Args:
        image_path: Path of the image file to load.
        content: Prompt text sent to the model together with the image.

    Returns:
        The response returned by ``chat_mllava``; the chat history it also
        returns is discarded.
    """
    # Close the file handle deterministically; convert() returns an
    # independent, fully loaded RGB image that outlives the `with` block.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    # chat_mllava is given the images as a list, so wrap the single image.
    response, _ = chat_mllava(content, [image], model, processor, **generation_kwargs)
    return response

# Read the first-pass descriptions (one row per image: 'path', 'description').
csv_path = 'data/ground_truth-llava-1.5-7b-hf.csv'
df = pd.read_csv(csv_path)

# Refined descriptions are collected as {'path', 'description'} records.
new_descriptions = []

# Ask the model to enrich each existing description with distances and sizes.
for image_path, description in zip(df['path'], df['description']):
    content = (
        f"The following is a first version of the description: \n\n'{description}'.\n\n"
        " Modify the descriptions to add distances between objects and dimensions."
        " Specify the metric you are using e.g. meters, feet."
    )
    # Alternative prompt that ignores the first-pass description:
    #content="Please describe the image shown regarding spatial relationships between objects (specifying measures) in great detail"
    response = run_inference(image_path, content)
    new_descriptions.append({'path': image_path, 'description': response})

# Persist the refined descriptions without the DataFrame index column.
new_csv_path = 'data/ground_truth_SpaceMantis-8B.csv'
new_df = pd.DataFrame(new_descriptions)
new_df.to_csv(new_csv_path, index=False)

print(f"New descriptions saved to {new_csv_path}")





Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


New descriptions saved to data/ground_truth_SpaceMantis-8B.csv


#### W/ Multi-step reasoning

In [None]:
import torch
from PIL import Image
from models.mllava import MLlavaProcessor, LlavaForConditionalGeneration, chat_mllava
import pandas as pd
from tqdm import tqdm  # Import tqdm for progress bar

# Directory passed as `cache_dir` when loading pretrained artifacts.
CACHE_DIR = "cache"

# Load the SpaceMantis processor and model once for the whole cell; both are
# read as globals by run_inference below.
attn_implementation = None  # or "flash_attention_2"
processor = MLlavaProcessor.from_pretrained("remyxai/SpaceMantis", cache_dir=CACHE_DIR)
# Half-precision weights placed on the CUDA device via device_map.
model = LlavaForConditionalGeneration.from_pretrained("remyxai/SpaceMantis", cache_dir=CACHE_DIR, device_map="cuda", torch_dtype=torch.float16, attn_implementation=attn_implementation)

# Keyword arguments forwarded to chat_mllava: a single beam with sampling
# disabled, up to 1024 new tokens.
generation_kwargs = {
    "max_new_tokens": 1024,
    "num_beams": 1,
    "do_sample": False
}

# Function to run multi-step inference
def run_inference(image_path):
    """Describe one image through four chained prompts and return the last answer.

    Each prompt after the first embeds the previous response, so the
    description is refined step by step: basic description, object listing,
    semantic relationships, then distances. Uses the module-level ``model``,
    ``processor`` and ``generation_kwargs``.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The response returned by ``chat_mllava`` for the final (distance)
        prompt.
    """
    # Prompt templates in execution order; "{previous}" is replaced with the
    # response of the preceding step (the first template has no placeholder).
    step_templates = (
        # Step 1: Basic Description
        "Describe the image in great detail. Do not make anything up and do not assume anything. Only generate useful descriptive information.",
        # Step 2: Identify Objects
        "Based on the previous description: \n\n'{previous}'.\n\n Now, modify it by listing all distinct objects in the image, specifying colors and positions in the image.",
        # Step 3: Specify Semantic Relationships
        "Based on the identified objects: \n\n'{previous}'.\n\n Now, modify it by describing the semantic relationships between objects (e.g., one object is on top of another, next to, behind, etc.).",
        # Step 4: Specify Exact Distances
        "Using the previous information: \n\n'{previous}'.\n\n Now, modify it by specifying the distances between objects in meters or feet.",
    )

    # Close the file handle deterministically; convert() returns an
    # independent, fully loaded RGB image that outlives the `with` block.
    with Image.open(image_path) as raw_image:
        images = [raw_image.convert("RGB")]

    response = None
    for template in step_templates:
        # Only the template is parsed by format(); braces inside the previous
        # response are inserted verbatim.
        prompt = template.format(previous=response)
        # Each step is an independent chat turn; the returned history is unused.
        response, _ = chat_mllava(prompt, images, model, processor, **generation_kwargs)

    return response

# Read the first-pass CSV; only its 'path' column is used in this cell.
csv_path = 'data/ground_truth-llava-1.5-7b-hf.csv'
df = pd.read_csv(csv_path)

# Multi-step descriptions are collected as {'path', 'description'} records.
new_descriptions = []

# Run the four-step reasoning chain on every image, showing a progress bar.
for image_path in tqdm(df['path'], total=len(df), desc="Processing Images", unit="image"):
    response = run_inference(image_path)
    new_descriptions.append({'path': image_path, 'description': response})

# Persist the descriptions without the DataFrame index column.
new_csv_path = 'data/ground_truth_SpaceMantis-8B_4.csv'
new_df = pd.DataFrame(new_descriptions)
new_df.to_csv(new_csv_path, index=False)

print(f"New descriptions saved to {new_csv_path}")


In [None]:
from ground_truth_gen import load_model, save_results

# SpaceMantis run driven by the helpers from ground_truth_gen.
# NOTE(review): MLlavaProcessor and LlavaForConditionalGeneration are not
# imported in this cell; they must already be in scope from an earlier cell.
model_name = "remyxai/SpaceMantis"
processor, model = load_model(model_name, MLlavaProcessor, LlavaForConditionalGeneration)

# Keyword arguments intended for generation: a single beam with sampling
# disabled, up to 1024 new tokens.
generation_kwargs = {
    "max_new_tokens": 1024,
    "num_beams": 1,
    "do_sample": False
}

image_dir = "img/test" 
# NOTE(review): same path as the CSV written by the multi-step cell above, so
# running both overwrites one result with the other. Confirm this is intended.
output_csv = "data/ground_truth_SpaceMantis-8B_4.csv"

# NOTE(review): `process_images` is not among the names imported from
# ground_truth_gen above, and this call passes four arguments whereas the
# BLIP and LLaVA cells pass five (including output_csv). Confirm the intended
# function and signature.
results = process_images(image_dir, model, processor, generation_kwargs)
save_results(results, output_csv)