In [None]:
!pip install transformers accelerate peft bitsandbytes datasets

In [None]:
from transformers import AutoProcessor, LlavaForConditionalGeneration
from tqdm import tqdm
from datasets import load_dataset, Dataset
import pandas as pd
import torch
from PIL import Image
import os
import ast

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Sanity check: render one sample image from the dataset with the axes hidden.
img = mpimg.imread('/kaggle/input/images-with-vqas/final_dataset/final_dataset/10496adb.jpg')
fig, ax = plt.subplots()
ax.set_axis_off()
imgplot = ax.imshow(img)

In [None]:
# Load the table that pairs each image path with its VQA annotations,
# then preview the first rows.
vqa_csv_path = '/kaggle/input/images-with-vqas/merged_image_data_vqa.csv'
vqa_df = pd.read_csv(vqa_csv_path)
vqa_df.head(5)

In [None]:
# How many rows have a VQA annotation (True) versus a missing one (False)?
# A count summary answers this directly; printing the per-row boolean Series
# only shows a truncated dump.
vqa_df['vqa_response'].notna().value_counts()

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
model_id = "llava-hf/bakLlava-v1-hf"

# `device` may be a torch.device or a plain string; normalise it once.
run_device = torch.device(device)
# Use half precision only on CUDA. On CPU, fall back to float32 because
# half-precision kernels are not reliably available (or are very slow) there.
model_dtype = torch.float16 if run_device.type == "cuda" else torch.float32

processor = AutoProcessor.from_pretrained(model_id)
model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=model_dtype)
# Inference only: switch to eval mode and move the weights to the target device.
# Assigning the result keeps the cell from dumping the full model repr.
model = model.eval().to(run_device)

In [None]:
# Sanity check right after loading: report the concrete class of `model` and
# stop here if it is not the LLaVA conditional-generation class used below.
loaded_model_type = type(model)
print(f"Type of 'model' after loading: {loaded_model_type}")
assert isinstance(
    model, LlavaForConditionalGeneration
), "Model is not a LlavaForConditionalGeneration instance!"

In [None]:
import time

# --- Run configuration -------------------------------------------------------
TIME_LIMIT = 42000  # wall-clock budget in seconds; the loop stops once it is exceeded
IMAGE_ROOT = "/kaggle/input/images-with-vqas/final_dataset/final_dataset"
CSV_PATH_PREFIX = "Dataset/final_dataset/"  # prefix stripped from the CSV's image_path values
MAX_NEW_TOKENS = 10
VERBOSE = True  # set to False to silence the per-question debug output

# Results: predictions[i] is scored against refs[i] by the metric cells below.
predictions = []
refs = []


def _raw_reference(row):
    """Return the row's raw `vqa_response` as a stripped string ("" when missing)."""
    raw = row["vqa_response"]
    return str(raw).strip() if pd.notna(raw) else ""


def _record_skipped_row(row, marker):
    """Append a sentinel prediction (and the raw reference) for a row that could not be evaluated."""
    predictions.append(marker)
    refs.append(_raw_reference(row))


def _first_word(text):
    """Return the first whitespace-separated word of `text` without trailing punctuation."""
    words = text.split()
    return words[0].rstrip('.,;:!?') if words else ""


# Follow the model's actual placement/precision instead of hard-coding
# "cuda"/float16, so the loop also works on CPU or with a float32 model.
model_device = model.device
model_dtype = model.dtype

images_done = 0  # rows whose questions were all answered
start_time = time.time()

with torch.no_grad():
    for idx, row in tqdm(vqa_df.iterrows(), total=len(vqa_df)):
        if time.time() - start_time > TIME_LIMIT:
            print(f"\nTime limit of {TIME_LIMIT} seconds exceeded. Aborting loop at index {idx}.")
            print(f'Number of successfully run images - {images_done}')
            break

        image_path = os.path.join(
            IMAGE_ROOT,
            row["image_path"].replace(CSV_PATH_PREFIX, "")
        )

        try:
            # The context manager closes the file handle; convert() returns an
            # independent in-memory copy that stays valid afterwards.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
        except FileNotFoundError:
            print(f"Warning: Image not found at {image_path}, skipping row {idx}")
            _record_skipped_row(row, "IMAGE_NOT_FOUND")
            continue
        except Exception as e:
            print(f"Error loading image {image_path}: {e}, skipping row {idx}")
            _record_skipped_row(row, "IMAGE_ERROR")
            continue

        try:
            # Unpacking here validates that the parsed value really is a
            # sequence of (question, answer) pairs; anything else is skipped
            # as a parse error instead of crashing the whole run.
            vqa_pairs = [(q, a) for q, a in ast.literal_eval(row["vqa_response"])]
        except Exception as e:
            print(f"Failed to parse vqa_response: {e}, skipping row {idx}")
            _record_skipped_row(row, "PARSE_ERROR")
            continue

        for q, gt_answer in vqa_pairs:
            prompt = f"<image>\nBased on the image, answer the following question with a single word. Question: {q} Answer:"

            # Process both text and image
            inputs = processor(text=prompt, images=image, return_tensors="pt")

            # Move everything to the model's device; cast only floating-point
            # tensors (e.g. pixel values) to the model dtype so integer tensors
            # such as input_ids / attention_mask keep their type.
            inputs = {
                k: (v.to(model_device, dtype=model_dtype) if torch.is_floating_point(v) else v.to(model_device))
                for k, v in inputs.items()
            }

            # Generate output
            generated_ids = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

            # generate() returns prompt tokens followed by the new tokens.
            # Decode only the new ones, so a ':' inside the answer (e.g. a
            # time such as "10:30") cannot truncate it.
            prompt_len = inputs["input_ids"].shape[1]
            ans = processor.batch_decode(
                generated_ids[:, prompt_len:], skip_special_tokens=True
            )[0].strip()

            # Clean up the prediction to extract a single word
            pred = _first_word(ans)

            if VERBOSE:
                print(f"\nIdx: {idx}")
                print(f"Original Question: {q}")
                print(f"Prompt Used: '{prompt}'")
                print(f"Full Prediction: '{ans}'")
                print(f"Processed Prediction (Single Word Attempt): '{pred}'")
                print(f"Ground Truth: {gt_answer}")
                print(f"{'-'*50}")

            predictions.append(pred)
            # Answers parsed by literal_eval may be non-strings (numbers, None).
            refs.append("" if gt_answer is None else str(gt_answer).strip())

        images_done += 1

In [None]:
!pip install evaluate bert-score rouge-score rapidfuzz sentence-transformers

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import evaluate
from bert_score import BERTScorer
from rouge_score import rouge_scorer
from rapidfuzz.distance import Levenshtein
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd

# predictions[i] is compared with refs[i]; a length mismatch would be silently
# truncated by zip() and would only fail later when the DataFrame is built, so
# fail early with a clear message.
if len(predictions) != len(refs):
    raise ValueError(
        f"predictions ({len(predictions)}) and refs ({len(refs)}) are not aligned"
    )

# Normalize case
preds_l = [p.lower() for p in predictions]
refs_l = [r.lower() for r in refs]

# Compute exact-match binary metrics: 1 where prediction == reference.
y_pred_bin = [int(p == r) for p, r in zip(preds_l, refs_l)]
y_true_bin = [1] * len(refs)

if y_pred_bin:
    acc = accuracy_score(y_true_bin, y_pred_bin)
    # NOTE(review): y_true_bin is all ones, so there can be no false positives.
    # Precision is therefore 1.0 whenever anything matches, and recall equals
    # accuracy. Kept as-is for output compatibility; accuracy is the
    # informative number here.
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true_bin, y_pred_bin, average="binary", zero_division=0
    )
else:
    # Nothing was predicted (e.g. the inference cell was not run to completion).
    print("No predictions to score; exact-match metrics are undefined.")
    acc = prec = rec = f1 = float("nan")

# Print metrics
print(f"Exact-match Accuracy: {acc:.3f}")
print(f"Exact-match Precision: {prec:.3f}")
print(f"Exact-match Recall:    {rec:.3f}")
print(f"Exact-match F1:        {f1:.3f}\n")

# Save predictions and ground truths to CSV (original casing, one row per pair)
pred_ref_df = pd.DataFrame({
    "Prediction": predictions,
    "Ground_Truth": refs
})
pred_ref_df.to_csv('prediction_output.csv', index=False)

metrics_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1"],
    "Value": [acc, prec, rec, f1]
})
metrics_df.to_csv('exact_metrics.csv', index=False)

In [None]:
# Device for the scoring models (BERTScore, Sentence-BERT): prefer a second
# GPU, then the first GPU, then the CPU. A dedicated name is used so the
# `device` chosen for the LLaVA model earlier in the notebook is not overwritten.
if torch.cuda.device_count() > 1:
    metrics_device = "cuda:1"
elif torch.cuda.is_available():
    metrics_device = "cuda:0"
else:
    metrics_device = "cpu"

ALT_METRICS_CSV = "alternate_metrics.csv"

# Initialize result dictionary
final_metrics = {}

# Pairs in which both prediction and reference are non-empty; the model-based
# metrics (BERTScore, Sentence-BERT) are computed on these only.
valid_pairs = [(p, r) for p, r in zip(preds_l, refs_l) if p and r]

# --- ROUGE ---
print("\n--- ROUGE Scores ---")
rouge_eval_scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
rouge1_scores, rougeL_scores = [], []

for pred, ref in zip(preds_l, refs_l):
    if not pred or not ref:
        # An empty side scores 0 rather than being dropped from the average.
        rouge1_scores.append(0.0)
        rougeL_scores.append(0.0)
        continue
    scores = rouge_eval_scorer.score(ref, pred)
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

if rouge1_scores:
    final_metrics["rouge1_f1"] = np.mean(rouge1_scores)
    final_metrics["rougeL_f1"] = np.mean(rougeL_scores)
    print(f"Average ROUGE-1 F1: {final_metrics['rouge1_f1']:.3f}")
    print(f"Average ROUGE-L F1: {final_metrics['rougeL_f1']:.3f}")
else:
    print("No ROUGE scores to compute.")

# --- BERTScore ---
print("\n--- BERTScore ---")
try:
    if valid_pairs:
        bert_eval_scorer = BERTScorer(lang="en", rescale_with_baseline=True, device=metrics_device)
        bert_preds, bert_refs = (list(side) for side in zip(*valid_pairs))
        P, R, F1 = bert_eval_scorer.score(bert_preds, bert_refs)
        final_metrics["bertscore_precision"] = P.mean().item()
        final_metrics["bertscore_recall"] = R.mean().item()
        final_metrics["bertscore_f1"] = F1.mean().item()
        print(f"Average BERTScore Precision: {final_metrics['bertscore_precision']:.3f}")
        print(f"Average BERTScore Recall:    {final_metrics['bertscore_recall']:.3f}")
        print(f"Average BERTScore F1:        {final_metrics['bertscore_f1']:.3f}")
    else:
        print("Not enough valid pairs for BERTScore.")
except Exception as e:
    # Best effort: a failure here must not prevent the remaining metrics.
    print(f"Could not compute BERTScore: {e}")

# --- Levenshtein Normalized Similarity ---
print("\n--- Levenshtein Normalized Similarity ---")
lev_similarities = []
for pred, ref in zip(preds_l, refs_l):
    if not pred and not ref:
        similarity = 1.0  # both empty: identical
    elif not pred or not ref:
        similarity = 0.0  # exactly one side empty: nothing in common
    else:
        similarity = Levenshtein.normalized_similarity(pred, ref)
    lev_similarities.append(similarity)

if lev_similarities:
    final_metrics["levenshtein_sim"] = np.mean(lev_similarities)
    print(f"Average Levenshtein Normalized Similarity: {final_metrics['levenshtein_sim']:.3f}")
else:
    print("No Levenshtein similarities to compute.")

# --- Sentence-BERT Cosine Similarity ---
print("\n--- Sentence-BERT Cosine Similarity ---")
try:
    if valid_pairs:
        sbert_model = SentenceTransformer('all-MiniLM-L6-v2', device=metrics_device)
        sbert_preds, sbert_refs = zip(*valid_pairs)
        embeddings_preds = sbert_model.encode(list(sbert_preds), convert_to_tensor=True)
        embeddings_refs = sbert_model.encode(list(sbert_refs), convert_to_tensor=True)
        # Only prediction i vs reference i is needed, so compute the row-wise
        # cosine directly instead of materialising the full N x N matrix.
        pairwise_sim = torch.nn.functional.cosine_similarity(
            embeddings_preds, embeddings_refs, dim=1
        )
        final_metrics["sbert_cosine_sim"] = pairwise_sim.mean().item()
        print(f"Average Sentence-BERT Cosine Similarity: {final_metrics['sbert_cosine_sim']:.3f}")
    else:
        print("Not enough valid pairs for Sentence-BERT similarity.")
except Exception as e:
    # Best effort: report the failure and still save whatever was computed.
    print(f"Could not compute Sentence-BERT similarity: {e}")

# Save final metrics to CSV (one row, one column per metric)
df_metrics = pd.DataFrame([final_metrics])
df_metrics.to_csv(ALT_METRICS_CSV, index=False)

# Print summary
print("\n--- Final Metrics ---")
for k, v in final_metrics.items():
    print(f"{k}: {v:.3f}")

print(f"\nFinal metrics saved to '{ALT_METRICS_CSV}'")