In [None]:
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from peft import PeftModel
from PIL import Image
import torch
from huggingface_hub import login

def paligemma_load_with_lora(
    base_model_name="google/paligemma-3b-pt-224",
    lora_adapter_path="/kaggle/input/paligemma-svqa-fine/finetuned_paligemma/checkpoint-43500",
    hf_token=None,
):
    """Load PaliGemma in float16, attach a fine-tuned LoRA adapter, and load its processor.

    Args:
        base_model_name: Hub id of the base PaliGemma checkpoint. The processor
            is loaded from the same id.
        lora_adapter_path: Folder containing the LoRA adapter checkpoint.
        hf_token: Hugging Face access token. When omitted, the ``HF_TOKEN``
            environment variable is used; when that is unset too, no explicit
            login is performed and the download relies on whatever credentials
            the environment already provides.

    Returns:
        A ``(model, processor)`` tuple: the adapter-wrapped model in eval mode
        and the matching processor.

    Side effects:
        Prints the parameter count reported by ``model.num_parameters()``.
    """
    import os

    # NOTE(review): never commit access tokens. Any token that was previously
    # pasted into this notebook must be treated as leaked and revoked.
    token = hf_token or os.environ.get("HF_TOKEN")
    if token:
        login(token=token)

    # 1) load the base PaliGemma
    base = PaliGemmaForConditionalGeneration.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        revision="float16",
    ).eval()

    # 2) attach fine-tuned LoRA adapter
    model = PeftModel.from_pretrained(
        base,
        lora_adapter_path,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    # Make inference mode explicit on the wrapper as well, so freshly created
    # adapter sub-modules (e.g. LoRA dropout) cannot stay in training mode.
    model.eval()

    # Load the processor from the same checkpoint id as the base model so the
    # two can never drift apart.
    processor = AutoProcessor.from_pretrained(base_model_name, use_fast=True)

    # Report the size in billions, or in millions for sub-1B models.
    n = model.num_parameters() / 1_000_000_000
    if n < 1:
        print(f'Base Model Paramters {n*1000:.0f}M')
    else:
        print(f'Base Model Paramters {n:.1f}B')
    return model, processor

def paligemma_inference(img_path, question_text, model, processor):
    """Answer one visual question about one image with greedy decoding.

    Args:
        img_path: Path of the image file to open.
        question_text: The question, appended verbatim to the one-word prompt.
        model: Generation model exposing ``.device`` and ``.generate``.
        processor: Processor used to build the model inputs and to decode the
            generated token ids.

    Returns:
        The decoded continuation only (prompt tokens are sliced off), with
        special tokens removed.
    """
    # The context manager releases the file handle as soon as the pixels are
    # loaded; convert("RGB") forces that load and normalises grayscale /
    # palette / RGBA files to three channels.
    # NOTE(review): assumes the processor does not convert to RGB itself - confirm.
    with Image.open(img_path) as img_file:
        image = img_file.convert("RGB")

    text = f'<image> Answer the question in exactly one word:{question_text}'
    model_inputs = processor(
        text=text,
        images=image,
        return_tensors="pt",
    ).to(model.device)

    # Length of the prompt, used below to drop the prompt tokens from the output.
    input_len = model_inputs["input_ids"].shape[-1]

    with torch.inference_mode():
        generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)

    # Keep only the newly generated tokens of the single sequence in the batch.
    new_tokens = generation[0][input_len:]
    return processor.decode(new_tokens, skip_special_tokens=True)

2025-05-13 06:47:27.082823: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747118847.255845      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747118847.308775      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Load once per kernel session; later cells reuse `model` and `processor`.
model, processor = paligemma_load_with_lora()

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/62.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/862M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/699 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/40.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.26M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

Base Model Paramters 2.9B


In [3]:
# Smoke test: ask a single question about a single image before the full run.
img_path = "/kaggle/input/abo-small/images/small/00/00000529.jpg"
question_text = "How many wheels are present?"

print(paligemma_inference(img_path, question_text, model, processor))

Four


In [None]:
from tqdm import tqdm
import pandas as pd

df = pd.read_csv('/kaggle/input/sqid-test-vqa/SQID_test_generated_vqa.csv')

# One record per (question, answer) pair, collected in a list and turned into
# a DataFrame once at the end.
results = []

# Each CSV row holds one image plus "|"-separated questions and answers.
# Iterating the three columns directly avoids building a Series per row.
for img, questions, answers in tqdm(
    zip(df["image_path"], df["questions"], df["answers"]),
    total=len(df),
    desc="Images",
):
    qs = questions.split("|")
    gts = answers.split("|")

    # zip() below stops at the shorter list; report the mismatch instead of
    # dropping the unmatched questions/answers silently.
    if len(qs) != len(gts):
        tqdm.write(
            f"Warning: {img} has {len(qs)} questions but {len(gts)} answers; "
            f"only the first {min(len(qs), len(gts))} pairs are evaluated"
        )

    for question, gt_answer in zip(qs, gts):
        pred = paligemma_inference(
            img_path=img,
            question_text=question,
            model=model,
            processor=processor,
        )

        results.append({
            "ground_truth": gt_answer,
            "prediction": pred,
        })

# Pin the column names so the CSV still has a header when no pair was evaluated.
result_df = pd.DataFrame(results, columns=["ground_truth", "prediction"])

# Written relative to the current working directory.
result_df.to_csv("PaliGemma-Finetuned.csv", index=False)

Images: 100%|██████████| 984/984 [17:15<00:00,  1.05s/it]


In [5]:
# Reload the saved predictions from disk to confirm the file was written.
pd.read_csv("/kaggle/working/PaliGemma-Finetuned.csv")

Unnamed: 0,ground_truth,prediction
0,Five,Six
1,Black,Gray
2,Rectangular,Rectangular
3,Metal,Metal
4,Glasses,Eyeglasses
...,...,...
4915,Tundra,Tundra
4916,Black,Black
4917,Two,Two
4918,Rectangular,Rectangle
