In [None]:
import json
import os
from prompts import *
from utils.chat_api import *

In [None]:
def verify_qa_list(qa_list, dataset_with_agent_answer_verified):
    """Verify agent answers against ground truth and append results to a JSONL file.

    The output file doubles as a checkpoint: the number of lines already in it
    is taken as the number of leading ``qa_list`` entries that a previous run
    has finished, and processing resumes from that offset in batches of 100.

    Args:
        qa_list: List of dicts, each providing the keys ``"question"``,
            ``"answer"`` and ``"agent_answer"``. Entries processed by this
            call are updated in place with a ``"verify_result"`` key.
        dataset_with_agent_answer_verified: Path of the JSONL file that
            results are appended to (one JSON object per line).

    Returns:
        The same ``qa_list`` object that was passed in. Entries skipped
        because the output file already covers them are NOT updated in
        memory; their results live only in the output file.
    """
    batch_size = 100
    model = "gpt-4o-2024-08-06"

    # Count lines already written so an interrupted run can pick up where it
    # stopped. Any read failure (typically: the file does not exist yet) is
    # reported and treated as "nothing processed so far".
    try:
        with open(dataset_with_agent_answer_verified, "r") as f:
            sample_count = sum(1 for _ in f)
    except Exception as e:
        print(f"Error reading dataset_with_agent_answer_verified: {dataset_with_agent_answer_verified}")
        print(e)
        sample_count = 0

    for start in range(sample_count, len(qa_list), batch_size):
        qa_list_batch = qa_list[start:start + batch_size]

        # One prompt per QA pair: the JSON payload, the verification
        # instructions, then the final question to the model.
        inputs = [
            [
                {
                    "type": "text",
                    "content": json.dumps({
                        "question": qa["question"],
                        "ground_truth_answer": qa["answer"],
                        "agent_answer": qa["agent_answer"],
                    }),
                },
                {
                    "type": "text",
                    "content": prompt_agent_verify_answer,
                },
                {
                    "type": "text",
                    "content": "Now answer if the answer from the baseline is correct or not:",
                },
            ] for qa in qa_list_batch
        ]
        messages = [generate_messages(message_parts) for message_parts in inputs]
        responses = parallel_get_response(model, messages)

        # NOTE(review): assumes the first element of the return value holds
        # one reply per message, in order -- confirm against utils.chat_api.
        verify_results = responses[0]
        for qa, verify_result in zip(qa_list_batch, verify_results):
            qa["verify_result"] = verify_result

        # Append right after each batch so finished work survives a crash.
        with open(dataset_with_agent_answer_verified, "a") as f:
            for qa in qa_list_batch:
                f.write(json.dumps(qa) + "\n")

    # Callers rebind their list to this function's result, so hand it back
    # explicitly instead of implicitly returning None.
    return qa_list

In [None]:
# Input: agent answers to be verified. Output: the JSONL file that
# verify_qa_list appends to (and resumes from).
dataset_with_agent_answer = "data/annotations/results/0416/baseline_blindly_answers.jsonl"
dataset_with_agent_answer_verified = "data/annotations/results/0416/baseline_blindly_answers_verified.jsonl"

# Each line of the input file is one JSON-encoded QA record.
qa_list = []
with open(dataset_with_agent_answer, "r") as f:
    qa_list.extend(json.loads(record) for record in f)

# qa_list is rebound to whatever verify_qa_list returns.
qa_list = verify_qa_list(qa_list, dataset_with_agent_answer_verified)

In [7]:
# calculate accuracy
# A sample counts as correct when its "verify_result" text starts with
# "yes" (case-insensitive).
dataset_with_agent_answer_verified = "data/annotations/results/0416/baseline_blindly_answers_verified.jsonl"
total = 0
correct = 0

with open(dataset_with_agent_answer_verified, "r") as f:
    for line in f:
        qa = json.loads(line)
        total += 1
        if qa["verify_result"].lower().startswith("yes"):
            correct += 1

# An empty results file would otherwise raise ZeroDivisionError.
if total:
    print(f"Accuracy: {correct / total}")
else:
    print("Accuracy: n/a (no verified samples found)")

Accuracy: 0.22420634920634921


In [23]:
# calculate pass@k
k = 1

# Keep only the questions whose entry in the 0416 baseline file was
# verified as "yes"; pass@k below is computed over this subset.
filtered_questions = []
with open("data/annotations/results/0416/baseline_blindly_answers_verified.jsonl", "r") as f:
    for line in f:
        sample = json.loads(line)
        if sample["verify_result"].lower().startswith("yes"):
            filtered_questions.append(sample["question"])

print(len(filtered_questions))

# Set copy for constant-time membership tests in the loop below. The list is
# kept as-is because its length is printed above and later cells read it.
# NOTE(review): assumes questions are hashable (strings) -- confirm.
filtered_question_set = set(filtered_questions)

dataset_with_agent_answer_verified = (
    "data/annotations/results/0415/small_test_with_agent_answer_verified_{round}.jsonl"
)

results = [dataset_with_agent_answer_verified.format(round=i) for i in range(k)]

total = 0
correct = 0

# Open the k round files one by one inside try/finally so every handle that
# was opened is closed even if a later open() or a JSON parse fails.
result_files = []
try:
    for result in results:
        result_files.append(open(result, "r"))

    # Read all QA pairs line by line across k files.
    # NOTE(review): this treats line i of every file as the same question;
    # `question` is taken from the last file that still had a line -- confirm
    # the round files are row-aligned.
    while True:
        qa_attempts = []
        # Try to read next line from each file
        for f in result_files:
            line = f.readline()
            if line:
                qa = json.loads(line)
                question = qa["question"]
                qa_attempts.append(qa["verify_result"].lower().startswith("yes"))

        # If we've reached end of files, break
        if not qa_attempts:
            break

        # Count as correct if any attempt was successful
        if question in filtered_question_set:
            total += 1
            if any(qa_attempts):
                correct += 1
finally:
    # Close all files
    for f in result_files:
        f.close()

# No overlap between the two datasets would otherwise raise ZeroDivisionError.
if total:
    print(f"Pass@{k}: {correct / total}")
else:
    print(f"Pass@{k}: n/a (no matching questions found)")

113
Pass@1: 0.7168141592920354


In [24]:
# Accuracy of the Gemini evaluation results, restricted to the questions in
# `filtered_questions` (defined by the pass@k cell).
with open("data/annotations/results/0416/gemini_eval.json", "r") as f:
    videos = json.load(f)

# Set copy for constant-time membership tests inside the nested loop.
# NOTE(review): assumes questions are hashable (strings) -- confirm.
filtered_question_set = set(filtered_questions)

total = 0
correct = 0

for video in videos["video_results"]:
    qas = video["qa_results"]
    for qa in qas:
        if qa["question"] in filtered_question_set:
            total += 1
            if qa["is_correct"]:
                correct += 1

# No matching question would otherwise raise ZeroDivisionError.
if total:
    print(correct/total)
else:
    print("n/a (no matching questions found)")

0.8230088495575221
