In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
!wget https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-images-small.tar
!tar -xf abo-images-small.tar

--2025-05-14 07:49:16--  https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-images-small.tar
Resolving amazon-berkeley-objects.s3.amazonaws.com (amazon-berkeley-objects.s3.amazonaws.com)... 52.217.96.252, 54.231.235.25, 52.217.75.172, ...
Connecting to amazon-berkeley-objects.s3.amazonaws.com (amazon-berkeley-objects.s3.amazonaws.com)|52.217.96.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3253381120 (3.0G) [application/x-tar]
Saving to: ‘abo-images-small.tar’


2025-05-14 07:50:42 (36.3 MB/s) - ‘abo-images-small.tar’ saved [3253381120/3253381120]



In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# =======================
# STEP 1: Install Required Packages
# =======================
!pip install -q bitsandbytes accelerate transformers --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m125.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m98.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [19]:
# =======================
# STEP 2: Python Script for VQA Inference
# =======================
import argparse
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch
# !rm -rf ~/.cache/huggingface/
# !rm -rf /content/unsloth_compiled_cache

# Constants for script-based compatibility
class Args:
    image_dir = "/content/images/small/"
    csv_path = "/content/drive/MyDrive/images/VQA_dataset_test/merged_listings_test.csv"
    model_name = "unsloth/Qwen2-VL-7B-Instruct-unsloth-bnb-4bit"
args = Args()

def main():
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--image_dir', type=str, required=True, help='Path to image folder')
    # parser.add_argument('--csv_path', type=str, required=True, help='Path to image-metadata CSV')
    # args = parser.parse_args()

    # Load metadata CSV
    df = pd.read_csv(args.csv_path)
    df = df[:5000]  # Sample 10 rows for testing

    # Load model and processor, move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, tokenizer = FastVisionModel.from_pretrained(
    args.model_name,
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
    )
    FastVisionModel.for_inference(model)


    generated_answers = []
    for idx, row in tqdm(df.iterrows(), total=1000):
        image_path = f"{args.image_dir}/{row['image_path']}"
        question = f"{str(row['question'])}. Answer in one word."
        try:
            image = Image.open(image_path).convert("RGB")
            messages = [
                    {"role": "user", "content": [
                        {"type": "image"},
                        {"type": "text", "text": question}
                    ]}
                ]
            input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
            inputs = tokenizer(
                    image,
                    input_text,
                    add_special_tokens = False,
                    return_tensors = "pt",
                ).to(device)
            generated_answer = model.generate(**inputs, max_length=2000)
            answer = tokenizer.decode(generated_answer[0], skip_special_tokens=True)


        except Exception as e:
            print(f"Error processing image {image_path}: {e}")
            answer = "error"
        # Ensure answer is one word and in English (basic post-processing)
        answer = str(answer).split('assistant\n')[-1].lower()
        generated_answers.append(answer)

    df["generated_answer"] = generated_answers
    df.to_csv("results_QWEN7B.csv", index=False)

if __name__ == "__main__":
    main()

==((====))==  Unsloth 2025.5.2: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


1884it [11:08,  3.77it/s]

Error processing image /content/images/small//4f/4f7c30f0.jpg: height:27 and width:256 must be larger than factor:28


2062it [13:57,  4.02it/s]

Error processing image /content/images/small//5d/5d626cd9.jpg: height:27 and width:256 must be larger than factor:28


2325it [15:11,  4.06it/s]

Error processing image /content/images/small//c0/c081c885.jpg: height:22 and width:256 must be larger than factor:28


4233it [26:14,  1.20it/s]

Error processing image /content/images/small//e1/e167d5c9.jpg: height:22 and width:256 must be larger than factor:28


5000it [29:49,  2.79it/s]
