In [None]:
!pip install bitsandbytes

In [None]:
!pip install -U datasets

In [None]:
import os

from huggingface_hub import login

# Never commit an access token to the notebook: read it from the HF_TOKEN
# environment variable. When the variable is unset, token=None makes login()
# ask for the token interactively instead.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from transformers import ChameleonProcessor, ChameleonForConditionalGeneration, BitsAndBytesConfig
import torch
from PIL import Image
import requests
from datasets import load_dataset
from itertools import islice
from sklearn.metrics import accuracy_score
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import json
import re

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## Dataset

In [None]:
# Open the "question_answers_v1.0.0" configuration of the "visual_genome"
# dataset in streaming mode.
dataset = load_dataset("visual_genome", "question_answers_v1.0.0", cache_dir=None, streaming=True)

streamed_train = dataset["train"]

# Materialise only the first 2000 streamed training examples as a list.
subset = [example for example in islice(streamed_train, 2000)]

# Model

In [None]:
# 4-bit NF4 quantization settings, with float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type='nf4',
    load_in_4bit=True,
)

In [None]:
processor = ChameleonProcessor.from_pretrained('facebook/chameleon-7b')
# Prompts are padded to a common length for batched generation. Pad on the
# left so every prompt ends exactly where generation starts; the Chameleon
# documentation recommends this for batched generation.
processor.tokenizer.padding_side = "left"

In [None]:
# Load the Chameleon-7B checkpoint with the quantization settings from
# `quantization_config`, placing the weights on the first CUDA device.
model = ChameleonForConditionalGeneration.from_pretrained(
    'facebook/chameleon-7b',
    device_map='cuda:0',
    quantization_config=quantization_config,
)

# Utils

In [None]:
def make_predictions(qa_pairs, device, batch_size=8):
  """Generate an answer for every (image, prompt, answer) triple.

  Uses the notebook-level `processor` and `model` objects.

  Args:
    qa_pairs: Sequence of (image, prompt, ground-truth answer) tuples.
    device: Device the processor outputs are moved to before generation.
    batch_size: Number of examples passed to the model per step.

  Returns:
    A `(true_answers, predicted_answers)` tuple of two equally long lists of
    lower-cased strings, in the same order as `qa_pairs`.
  """
  true_answers = []
  predicted_answers = []

  for start in range(0, len(qa_pairs), batch_size):
    batch = qa_pairs[start:start + batch_size]
    images = [entry[0] for entry in batch]
    questions = [entry[1] for entry in batch]
    ground_truth = [entry[2].lower() for entry in batch]

    inputs = processor(images=images, text=questions, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.float16)

    outputs = model.generate(**inputs, max_new_tokens=32)

    # The generated sequences start with the prompt tokens. Decode only what
    # follows them: removing the prompt from the decoded text by string
    # replacement silently fails (leaving the whole prompt in the "answer")
    # whenever decoding does not reproduce the prompt character for character.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = outputs[:, prompt_length:]
    preds = processor.batch_decode(new_tokens, skip_special_tokens=True)

    true_answers.extend(ground_truth)
    predicted_answers.extend(pred.strip().lower() for pred in preds)

  return true_answers, predicted_answers

In [None]:
def clean_answer(text):
  """Normalise an answer string for exact-match comparison.

  Keeps only the text before the first period, lower-cases it, removes every
  character that is neither a word character nor whitespace, and finally
  trims and collapses whitespace so that leftovers from removed punctuation
  cannot cause spurious mismatches ("Green ." and "green" both give "green").

  Args:
    text: Raw answer string.

  Returns:
    The normalised answer string.
  """
  first_sentence = text.strip().split(".")[0]
  without_punct = re.sub(r'[^\w\s]', '', first_sentence.lower())
  # split() + join trims both ends and squeezes internal whitespace runs.
  return " ".join(without_punct.split())

In [None]:
def cosine_similarity_score(true_answers, predicted_answers):
  """Mean cosine similarity between paired answer embeddings.

  Both lists are embedded with the 'all-distilroberta-v1' SentenceTransformer
  and the i-th true answer is compared with the i-th predicted answer.

  Args:
    true_answers: List of reference answer strings.
    predicted_answers: List of predicted answer strings, same length.

  Returns:
    The mean of the pairwise cosine similarities, or NaN for empty input.

  Raises:
    ValueError: If the two lists differ in length (pairing them would
      otherwise silently drop the surplus answers).
  """
  if len(true_answers) != len(predicted_answers):
    raise ValueError(
        f"Expected equally many true and predicted answers, got "
        f"{len(true_answers)} and {len(predicted_answers)}"
    )
  if len(true_answers) == 0:
    # Nothing to compare; avoid loading the embedding model for no reason.
    return float("nan")

  emb_model = SentenceTransformer('all-distilroberta-v1')

  true_emb = np.asarray(emb_model.encode(true_answers, batch_size=64, show_progress_bar=True))
  pred_emb = np.asarray(emb_model.encode(predicted_answers, batch_size=64, show_progress_bar=True))

  # Row-wise cosine similarity in one vectorised step instead of one
  # library call per pair.
  dots = np.sum(true_emb * pred_emb, axis=1)
  norms = np.linalg.norm(true_emb, axis=1) * np.linalg.norm(pred_emb, axis=1)
  # A zero-norm embedding gives similarity 0 rather than a division by zero.
  cos_sims = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

  mean_cos_sim = np.mean(cos_sims)
  return mean_cos_sim

# VQA

In [None]:
# Read the local 'scene_graphs.json' file into a DataFrame.
data = pd.read_json('scene_graphs.json')

In [None]:
# Keep only the rows whose image id is below 2001, then display the result.
data = data.loc[data['image_id'] < 2001]
data

Unnamed: 0,relationships,image_id,objects
0,"[{'synsets': ['along.r.01'], 'predicate': 'ON'...",1,"[{'synsets': ['clock.n.01'], 'h': 339, 'object..."
1,"[{'synsets': ['wear.v.01'], 'predicate': 'wear...",2,"[{'synsets': [], 'h': 103, 'object_id': 5069, ..."
2,"[{'synsets': ['in.r.01'], 'predicate': 'in fro...",3,"[{'synsets': [], 'h': 79, 'object_id': 5091, '..."
3,"[{'synsets': ['have.v.01'], 'predicate': 'has'...",4,"[{'synsets': ['curtain.n.01'], 'h': 300, 'obje..."
4,"[{'synsets': ['along.r.01'], 'predicate': 'ON'...",5,"[{'synsets': ['floor.n.01'], 'h': 108, 'object..."
...,...,...,...
1995,"[{'synsets': ['in.r.01'], 'predicate': 'IN', '...",1996,"[{'synsets': ['bed.n.01'], 'h': 480, 'object_i..."
1996,"[{'synsets': ['along.r.01'], 'predicate': 'ON'...",1997,"[{'synsets': ['television.n.01'], 'h': 306, 'o..."
1997,"[{'synsets': ['along.r.01'], 'predicate': 'ON'...",1998,"[{'synsets': [], 'h': 75, 'object_id': 3793707..."
1998,"[{'synsets': [], 'predicate': 'ON', 'relations...",1999,"[{'synsets': [], 'h': 110, 'object_id': 432007..."


In [None]:
def get_objects_with_image_id(df):
  """Collect a set of object descriptions for every image in `df`.

  Each object is described by its first name, followed by its
  comma-separated attributes in parentheses when it has any, e.g.
  'clock (green,tall)'. Objects without a name are skipped.

  Args:
    df: DataFrame with an 'image_id' column and an 'objects' column holding,
      per row, a list of dicts with a 'names' list and optionally an
      'attributes' list.

  Returns:
    A list with one dict per row: {'image_id': ..., 'objects': set of str}.
  """
  rows = []

  for image_id, image_objects in zip(df['image_id'], df['objects']):
    descriptions = set()

    for obj in image_objects:
      names = obj.get('names') or []
      if not names:
        # No usable name (key missing, None or empty list): nothing to describe.
        continue
      name = names[0]

      attributes = obj.get('attributes') or []
      if attributes:
        attrs_list = ','.join(attributes)
        descriptions.add(f'{name} ({attrs_list})')
      else:
        descriptions.add(f'{name}')

    rows.append({
        'image_id': image_id,
        'objects': descriptions
    })
  return rows

In [None]:
# Build one record per image (its id plus its object descriptions) and
# tabulate the records as a DataFrame.
objects = get_objects_with_image_id(data)
object_df = pd.DataFrame(objects)
object_df

Unnamed: 0,image_id,objects
0,1,"{arm (raised), shirt (grey), sign (black), pan..."
1,2,"{sidewalk (brick,white), sign (black), buildin..."
2,3,"{dividing screen, wall, desktop (curved), keyb..."
3,4,"{door (glass,sliding), seats (blue), pillow (w..."
4,5,"{floor (wooden,woods,wood,brown), window (squa..."
...,...,...
1995,1996,"{headboard (brown), wall, pillow, pillow (stri..."
1996,1997,"{drawers, wall, arms, lamp (off), desk (work),..."
1997,1998,"{counter, soap dish, soap (unused), net, liste..."
1998,1999,"{bottle, wall, water bottle, papers (white), d..."


In [None]:
qa_pairs = []

# Index the object descriptions by image id once instead of filtering the
# whole DataFrame for every example. The first row per image id wins.
objects_by_image = (
  object_df.drop_duplicates('image_id')
  .set_index('image_id')['objects']
  .to_dict()
)

for item in subset:
  # Skip examples without any question-answer pair.
  if not item.get("qas"):
    continue

  image = item["image"]
  first_qa = item["qas"][0]
  q = first_qa["question"]

  obj_value = objects_by_image.get(item['image_id'])
  if obj_value is None:
    obj_str = "None"
  elif isinstance(obj_value, (set, frozenset)):
    # The descriptions are stored as a set. Join them so the prompt lists
    # plain names rather than a Python set repr, and sort them so the prompt
    # does not depend on set iteration order.
    obj_str = ", ".join(sorted(obj_value))
  elif isinstance(obj_value, (list, tuple)):
    obj_str = ", ".join(obj_value)
  else:
    obj_str = str(obj_value)

  question = (
    f"<image>\n"
    f"Objects: {obj_str}\n"
    f"Q: {q}\n"
    f"A:"
  )

  answer = first_qa["answer"].strip()
  qa_pairs.append((image, question, answer))

In [None]:
# Inspect the first (image, prompt, answer) triple.
qa_pairs[0]

(<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=800x600>,
 "<image>\nObjects: {'arm (raised)', 'shirt (grey)', 'sign (black)', 'pants (gray,grey)', 'tree', 'street (sidewalk)', 'pants (black)', 'van (parked,white)', 'guy', 'headlight (off)', 'jacket (gray,grey)', 'road', 'back', 'man', 'windows', 'chin (raised)', 'trees (sparse)', 'sneakers (grey)', 'tree trunk', 'glasses', 'building (tall,brick,made of bricks)', 'shade', 'bike (parked,far away)', 'car (white,parked)', 'lamp post', 'bike (parked,far away,chained)', 'wall (grey)', 'street (clean)', 'sidewalk (brick)', 'sidewalk', 'work truck (white)', 'car', 'parking meter (orange)', 'shirt (red,orange)', 'bikes', 'clock (green,tall)', 'shoes (brown)'}\nQ: What color is the clock?\nA:",
 'Green.')

In [None]:
# Single-example run on the first QA pair: preprocess, move to the model's
# device as float16, generate up to 50 new tokens and decode them.
inputs = processor(images=qa_pairs[0][0], text=qa_pairs[0][1], return_tensors="pt")
inputs = inputs.to(model.device, dtype=torch.float16)
outputs = model.generate(max_new_tokens=50, **inputs)
preds = processor.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# Run batched inference over all QA pairs, two examples per batch.
true_answers, predicted_answers = make_predictions(qa_pairs=qa_pairs, device=device, batch_size=2)

In [None]:
# Normalise references and predictions with the same cleaning function.
true_answers = list(map(clean_answer, true_answers))
predicted_answers = list(map(clean_answer, predicted_answers))

In [None]:
# Score the predicted answers against the true answers with accuracy_score.
acc = accuracy_score(true_answers, predicted_answers)
print(f"Accuracy: {acc}")

In [None]:
# Mean cosine similarity between the paired true and predicted answers.
mean_cos_sim = cosine_similarity_score(true_answers, predicted_answers)

In [None]:
# Report the mean cosine similarity, rounded to four decimal places.
print(f"Mean cosine similarity: {mean_cos_sim:.4f}")