### Testing

In [None]:
import pandas as pd

# Compare the `image_path` column of two metadata CSV files row by row and
# report how many positions hold different paths.

# Paths to the two CSV files
csv_file1 = "Dataset/metadata/image_data_with_vqa.csv"
csv_file2 = "Dataset/metadata/image_data_with_vqa1.csv"

# Load the CSV files into DataFrames
df1 = pd.read_csv(csv_file1)
df2 = pd.read_csv(csv_file2)

# Check that both dataframes have the same number of rows
if len(df1) != len(df2):
    print("Warning: The two CSV files have different number of rows.")

# Rows are assumed to correspond positionally, so only the common prefix is
# compared. Plain arrays are used so the comparison is purely positional and
# is done in one vectorised step instead of one `.iloc` lookup per row.
n_common = min(len(df1), len(df2))
paths1 = df1["image_path"].to_numpy()[:n_common]
paths2 = df2["image_path"].to_numpy()[:n_common]
mismatch = paths1 != paths2

# List of (path_from_file1, path_from_file2) tuples for every differing row,
# and the number of such rows.
differences = list(zip(paths1[mismatch], paths2[mismatch]))
count = len(differences)

print(f"Count of differing image paths: {count}")
print("First 5 differences:")
print(differences[:5])

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# Quick visual sanity check: read one image file from the dataset folder and
# render it inline.
img = mpimg.imread('Dataset/final_dataset/10496adb.jpg')
# Turn the axis decorations off before drawing the image.
plt.axis('off')
# Ending the cell with an assignment keeps the notebook from echoing a repr.
imgplot = plt.imshow(img)

In [None]:
import os

# Count the entries of the dataset folder whose file extension (compared
# case-insensitively) is one of the recognised image extensions.
folder_path = "Dataset/final_dataset"
image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".gif"}

all_files = os.listdir(folder_path)
image_files = list(
    filter(
        lambda entry: os.path.splitext(entry)[1].lower() in image_extensions,
        all_files,
    )
)

print(f"Total images in {folder_path}: {len(image_files)}")

Total images in Dataset/final_dataset: 19795


In [16]:
# Load the second metadata CSV and report how many of its `image_path`
# entries do not exist on disk. Nothing is filtered out here: the rows are
# only counted.
df = pd.read_csv("Dataset/metadata/image_data_with_vqa1.csv")

# Boolean Series: True where the referenced file is present.
exists = df["image_path"].apply(os.path.exists)
missing_total = (~exists).sum()
print(f"Skipping {missing_total} missing images")

Skipping 0 missing images


### Model Pipeline 

#### Importing Libraries

In [None]:

# ──────────────────────────────────────────────────────────────────────────────
# 1. Install dependencies
# ──────────────────────────────────────────────────────────────────────────────

! pip install -q transformers torch evaluate scikit-learn pillow

In [None]:

# ──────────────────────────────────────────────────────────────────────────────
# 2. Imports & Environment
# ──────────────────────────────────────────────────────────────────────────────

import os
import ast
import pandas as pd
from PIL import Image
import torch
from sklearn.model_selection import train_test_split  
import evaluate                                      
from transformers import Blip2Processor, Blip2ForConditionalGeneration




#### Pre-processing 

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 3. Load & Preprocess Dataset
# ──────────────────────────────────────────────────────────────────────────────
# Produces `df` with one row per (image_path, question, answer) example,
# restricted to images that exist on disk.

# Relative to the notebook's working directory, like the other metadata reads
# in this notebook and like the relative image paths checked further down.
CSV_PATH = "Dataset/metadata/image_data_with_vqa.csv"
df = pd.read_csv(CSV_PATH)

# Optional remap of the image paths when running on Kaggle:
# df["image_path"] = df["image_path"].str.replace(
#     r"^Dataset/final_dataset/",
#     "/kaggle/input/vqa-multimodal-sarvesh-nathan-divyam/final_dataset/final_dataset/",
#     regex=True
# )

# Filter out rows with missing vqa_response. `.copy()` makes the result an
# independent frame so the column assignment below does not trigger
# SettingWithCopyWarning.
df = df[df["vqa_response"].notna()].copy()

# The column holds Python literals serialised as text; parse them back.
df["vqa_response"] = df["vqa_response"].apply(ast.literal_eval)

# One row per entry of each parsed response. An empty response explodes to
# NaN, which cannot be split into question/answer, so those rows are dropped.
df = (
    df.explode("vqa_response")
      .dropna(subset=["vqa_response"])
      .reset_index(drop=True)
)

# NOTE(review): assumes every entry yields exactly two fields, in the order
# (question, answer) — confirm against the CSV contents.
df[["question", "answer"]] = pd.DataFrame(df["vqa_response"].tolist(), index=df.index)

# Drop examples whose image file is not present on disk.
exists = df["image_path"].apply(os.path.exists)
print(f"⚠️  Skipping {(~exists).sum()} rows with missing images")
df = df[exists]
df = df[["image_path", "question", "answer"]].reset_index(drop=True)
print(f"Total examples: {len(df)}")

#### Model Building

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 4. Load BLIP-2 + FLAN-T5-XL (zero-shot)
# ──────────────────────────────────────────────────────────────────────────────
MODEL_NAME = "Salesforce/blip2-flan-t5-xl"

# Half precision only when a GPU is available; fall back to float32 on CPU.
model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

processor = Blip2Processor.from_pretrained(MODEL_NAME)
model     = Blip2ForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=model_dtype,
    device_map="auto"
)

# device_map="auto" already places the weights, so the model is not moved
# again with `.to(...)`; the inputs are sent to wherever the model lives.
model.eval()
device = model.device

# ──────────────────────────────────────────────────────────────────────────────
# 5. Inference over the entire dataset
# ──────────────────────────────────────────────────────────────────────────────
LOG_EVERY = 100  # print one progress sample every LOG_EVERY examples

preds, refs = [], []
with torch.no_grad():
    for step, (idx, row) in enumerate(df.iterrows()):
        # Close the file handle as soon as the pixels are decoded.
        with Image.open(row.image_path) as raw_img:
            img = raw_img.convert("RGB")

        # Move the inputs to the model's device and cast the floating-point
        # tensors (the pixel values) to the model's dtype.
        inputs = processor(images=img, text=row.question, return_tensors="pt").to(device, model_dtype)
        generated_ids = model.generate(**inputs)
        pred = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

        if step % LOG_EVERY == 0:
            print('idx: ',idx)
            print('prediction: ', pred)

        preds.append(pred)
        # Answers parsed from the CSV are not guaranteed to be strings.
        refs.append(str(row.answer).strip())

# ──────────────────────────────────────────────────────────────────────────────
# 6. Compute Accuracy on full set
# ──────────────────────────────────────────────────────────────────────────────
if refs:
    # The "accuracy" metric expects integer class labels; for free-text
    # answers the fraction of exact string matches is used instead. The score
    # is kept under the 'accuracy' key.
    acc = evaluate.load("exact_match")
    results = {"accuracy": acc.compute(references=refs, predictions=preds)["exact_match"]}
    print(f"\n🔍 Zero-Shot Accuracy on entire dataset = {results['accuracy']:.4f}")
else:
    results = {"accuracy": float("nan")}
    print("No examples were evaluated; accuracy is undefined.")
