In [None]:
import os
import json
from tqdm import tqdm

import huggingface_hub

# Read the Hugging Face access token from the environment instead of
# hard-coding it in the notebook, so the secret never ends up in version
# control or in a shared copy of this file.
# When HF_TOKEN is unset this is None, and `login` falls back to its
# interactive prompt.
HF_TOKEN = os.environ.get("HF_TOKEN")
huggingface_hub.login(token=HF_TOKEN)

from transformers import (
    AutoTokenizer,

)

from vllm import LLM, SamplingParams
from datasets import load_dataset
from openai import OpenAI


# Process-level settings, written to the environment before the engine is built.
os.environ.update(
    {
        "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
        "CUDA_VISIBLE_DEVICES": "2",
    }
)

model_id = "Qwen/Qwen3-30B-A3B-Instruct-2507-FP8"

# Decoding configuration shared by every generation call in this notebook.
samp_params = SamplingParams(
    max_tokens=16384,
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    min_p=0.0,
)

# vLLM engine for the judge model, with a 32768-token context window.
model = LLM(model=model_id, max_model_len=32768)

# Matching tokenizer: pads and truncates on the left, and reuses the EOS token
# as the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    model_id, padding_side="left", truncation_side="left"
)
tokenizer.pad_token = tokenizer.eos_token

# Benchmark data pulled from the Hugging Face Hub (no split is selected here).
dataset_lar = load_dataset("Yeongtak/lar")


In [None]:
def get_judge_prompt(ground_truth, response):
    """Build the judge instruction text for one (reference, response) pair.

    Args:
        ground_truth: Reference statement; it is inserted between the double
            quotes that follow ``User said``.
        response: Generated answer to be graded; it is placed at the very end
            of the prompt.

    Returns:
        The complete prompt string: grading rubric, the fixed question, the
        quoted reference and the generated response.
    """
    # Static template; only the two named placeholders are substituted, and
    # the substituted values are inserted verbatim.
    template = """You are an impartial judge evaluating whether a model's response correctly matches a ground-truth reference.

You will be given:
1. A question asked to the model
2. A ground-truth reference answer (GT)
3. A generated response from the model

Your task is to decide whether the generated response is Correct or Wrong.

Evaluation criteria:
- The generated response is Correct if it semantically includes the core information conveyed by the ground-truth reference.
- The wording does NOT need to match exactly. Paraphrases, rephrasings, or additional details are allowed.
- The generated response may contain extra information beyond the ground-truth reference. This is acceptable.
- The generated response is Wrong if it:
  - Fails to include the core meaning of the ground-truth reference, OR
  - Contradicts the ground-truth reference, OR
  - Provides an unrelated or vague answer that does not clearly convey the same experience or action.

Focus ONLY on whether the essential meaning of the ground-truth reference is present in the generated response.

Do NOT judge based on style, fluency, length, or factual completeness beyond the ground-truth reference.

Output:
- Respond with exactly one word: Correct or Wrong.

---

Question:
What was I doing the last time I told you about my most recent experience with the one in the new image?

Ground-truth reference:
User said "{ground_truth}"

Generated response:
{response}"""
    return template.format(ground_truth=ground_truth, response=response)

In [None]:
batch_size = 20


def _is_correct(judge_text):
    """Return True only if the judge's first verdict word is "Correct".

    The text is lower-cased and split on whitespace, and surrounding
    punctuation/markdown characters are stripped from each word. The first
    word that is a verdict decides the result. "incorrect" is treated as a
    negative verdict, so it is no longer mistaken for "correct" the way a
    plain substring test would. Text without any verdict word counts as wrong.
    """
    for raw_word in judge_text.lower().split():
        word = raw_word.strip(".,:;!?*\"'`()[]")
        if word == "correct":
            return True
        if word in ("wrong", "incorrect"):
            return False
    return False


# `load_dataset` called without `split=` returns a dict of splits, for which
# `len()` counts splits and integer indexing fails. Resolve it to the single
# split it contains so examples can be addressed by row number.
lar_examples = dataset_lar
if isinstance(lar_examples, dict):
    if len(lar_examples) != 1:
        raise ValueError(
            "Expected exactly one split in dataset_lar, found: "
            f"{list(lar_examples.keys())}. Select the split to evaluate."
        )
    lar_examples = next(iter(lar_examples.values()))

# Model responses to grade; entry j is paired with dataset row j.
with open("path_to_your_caption.json", encoding="utf-8") as f:
    responses = json.load(f)

num_examples = len(lar_examples)
if len(responses) < num_examples:
    # Fail before spending GPU time instead of hitting an IndexError mid-run.
    raise ValueError(
        f"Found {len(responses)} responses for {num_examples} dataset examples."
    )

# Raw judge outputs, one stripped string per dataset row, in row order.
evals = []
for start in tqdm(range(0, num_examples, batch_size)):
    stop = min(start + batch_size, num_examples)
    prompts = []
    for j in range(start, stop):
        judge_prompt = get_judge_prompt(
            lar_examples[j]["last_action"], responses[j]["response"]
        )
        messages = [{"role": "user", "content": judge_prompt}]
        # Render the chat template to text; vLLM tokenizes the string itself.
        prompts.append(
            tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        )

    outputs = model.generate(prompts, sampling_params=samp_params, use_tqdm=False)
    evals.extend(output.outputs[0].text.strip() for output in outputs)

# 1 for a "Correct" verdict, 0 for anything else.
corrects = [int(_is_correct(judge_text)) for judge_text in evals]
# Report NaN rather than raising ZeroDivisionError when nothing was evaluated.
accuracy = sum(corrects) / len(corrects) if corrects else float("nan")
print(f"Accuracy: {accuracy:.4f}")