In [2]:
import json
import os
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
from mmagent.prompts import *
from mmagent.utils.chat_api import *

from evaluation.qa import verify_qa_list, verify_qa_list_with_reasoning

In [None]:
# Flatten the per-video evaluation file into one record per question and
# dump the records as JSONL.
with open(
    "/mnt/bn/videonasi18n/ypan/Project/3_MM_Agents/evaluation/merged_file/baseline_gemini_eval.json",
    "r",
) as src:
    videos = json.load(src)

# NOTE(review): the agent answer is read from the "gpt4o_answer" key even
# though both file names mention gemini -- confirm this is the intended field.
samples = [
    {
        "video_path": video["path"],
        "question": qa["question"],
        "answer": qa["answer"],
        "agent_answer": qa["gpt4o_answer"],
    }
    for video in videos
    for qa in video["qa_list"]
]

print(len(samples))

with open("data/annotations/results/0418/small_test_gemini.jsonl", "w") as dst:
    dst.writelines(json.dumps(sample) + "\n" for sample in samples)

In [None]:
def refine_answer(sample):
    """Replace ``sample["agent_answer"]`` with a model-refined answer.

    The question and the current agent answer are serialised as a JSON
    object, substituted into ``prompt_refine_answer`` and sent to the chat
    model. The first element of the response is stored back on the sample.

    Args:
        sample: Mapping with at least ``"question"`` and ``"agent_answer"``.

    Returns:
        The same ``sample`` object, mutated in place.
    """
    qa_pair = json.dumps(
        {"question": sample["question"], "answer": sample["agent_answer"]}
    )
    prompt_parts = [
        {"type": "text", "content": prompt_refine_answer.format(qa_pair=qa_pair)}
    ]
    messages = generate_messages(prompt_parts)
    # presumably element 0 of the response is the generated text -- confirm
    # against mmagent.utils.chat_api.get_response_with_retry
    response = get_response_with_retry("gpt-4o-2024-11-20", messages)
    sample["agent_answer"] = response[0]
    return sample


def refine_qa_list(qa_list, batch_size=100, max_workers=16):
    """Run ``refine_answer`` over ``qa_list`` in a process pool, batch by batch.

    Each batch is mapped through ``ProcessPoolExecutor.map`` and wrapped in
    its own ``tqdm`` progress bar. ``Executor.map`` yields results in input
    order, so the returned list is aligned with ``qa_list``.

    Args:
        qa_list: Sequence of samples accepted by ``refine_answer``.
        batch_size: Number of samples submitted per progress bar (default 100,
            the previously hard-coded value).
        max_workers: Size of the process pool (default 16, the previously
            hard-coded value).

    Returns:
        List of the values returned by ``refine_answer``, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        RuntimeError: If creating the pool or refining any batch fails; the
            original exception is chained as ``__cause__``.
    """
    if batch_size < 1:
        # range() would raise on 0 and silently yield nothing on a negative
        # step; reject both up front with a clear message.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if not qa_list:
        # Nothing to refine: avoid starting a process pool for no work.
        return []

    results = []
    batch_start = 0
    try:
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            for batch_start in range(0, len(qa_list), batch_size):
                qa_list_batch = qa_list[batch_start : batch_start + batch_size]
                results.extend(
                    tqdm(
                        executor.map(refine_answer, qa_list_batch),
                        total=len(qa_list_batch),
                        desc=f"Refining answers {batch_start}/{len(qa_list)}",
                    )
                )
    except Exception as e:
        # Report which batch was in flight so a failed run can be resumed.
        raise RuntimeError(
            f"Error refining qa_list batch starting at index {batch_start}"
        ) from e
    return results


# Load the agent answers, refine each one, and write the refined samples
# back out as JSONL.
with open(
    "data/annotations/results/full_retrieval/small_test_with_agent_answer_0.jsonl", "r"
) as src:
    qa_list = list(map(json.loads, src))

new_qa_list = refine_qa_list(qa_list)

with open(
    "data/annotations/results/0418/small_test_with_refined_agent_answer.jsonl", "w"
) as dst:
    dst.writelines(json.dumps(qa) + "\n" for qa in new_qa_list)

In [8]:
# Input files (one per round) and the matching files passed to verify_qa_list.
# Rounds 1 and 2 are currently disabled.
dataset_with_agent_answer_list = [
    "data/annotations/results/full_retrieval/small_test_with_agent_answer_0.jsonl",
    # "data/annotations/results/full_retrieval/small_test_with_agent_answer_1.jsonl",
    # "data/annotations/results/full_retrieval/small_test_with_agent_answer_2.jsonl"
]
dataset_with_agent_answer_verified_list = [
    "data/annotations/results/full_retrieval/small_test_with_agent_answer_verified_0.jsonl",
    # "data/annotations/results/full_retrieval/small_test_with_agent_answer_verified_1.jsonl",
    # "data/annotations/results/full_retrieval/small_test_with_agent_answer_verified_2.jsonl"
]

for dataset_with_agent_answer, dataset_with_agent_answer_verified in zip(
    dataset_with_agent_answer_list, dataset_with_agent_answer_verified_list
):
    with open(dataset_with_agent_answer, "r") as f:
        qa_list = [json.loads(line) for line in f]

    # presumably verify_qa_list writes its results to the second argument --
    # confirm in evaluation.qa
    qa_list = verify_qa_list(qa_list, dataset_with_agent_answer_verified)

100%|██████████| 6/6 [04:25<00:00, 44.23s/it]


In [3]:
# calculate accuracy: share of rows whose verify_result starts with "yes"
dataset_with_agent_answer_verified = (
    "data/annotations/results/0418/small_test_with_refined_agent_answer_verified.jsonl"
)

with open(dataset_with_agent_answer_verified, "r") as f:
    verdicts = [
        json.loads(line)["verify_result"].lower().startswith("yes") for line in f
    ]

total = len(verdicts)
correct = sum(verdicts)

print(f"Accuracy: {correct / total}")

Accuracy: 0.7103174603174603


In [6]:
# calculate pass@k
k = 1

# Collect the 0-based row indices whose verify_result starts with "yes" in
# either of the two files below, then de-duplicate them.
# (An earlier, commented-out variant collected the question text instead of
# the row index and read the second file from a different path.)
with open(
    "/mnt/bn/videonasi18n/longlin.kylin/mmagent/data/annotations/results/0416/baseline_blindly_answers_verified.jsonl",
    "r",
) as f:
    filtered_questions = [
        i
        for i, line in enumerate(f)
        if json.loads(line)["verify_result"].lower().startswith("yes")
    ]

with open(
    "/mnt/bn/videonasi18n/longlin.kylin/mmagent/data/annotations/results/0417/verified_small_test.json",
    "r",
) as f:
    samples = json.load(f)

filtered_questions.extend(
    i
    for i, sample in enumerate(samples)
    if sample["verify_result"].lower().startswith("yes")
)

filtered_questions = list(set(filtered_questions))

print(len(filtered_questions))

# Score pass@k over the k verified result files of one experiment.
# A question counts as correct when any of its k attempts has a
# verify_result starting with "yes". Rows whose "flag" is truthy are left out
# of the score and tallied in filtered_count instead.
exp = "5_rounds_threshold_0_5_no_planning"
dataset_with_agent_answer_verified = (
    "data/annotations/results/{exp}/small_test_with_agent_answer_verified_{round}.jsonl"
)

results = [
    dataset_with_agent_answer_verified.format(round=i, exp=exp) for i in range(k)
]
# results = ["data/annotations/results/0418/small_test_with_refined_agent_answer_verified.jsonl"]

total = 0
correct = 0
filtered_count = 0
idx = 0  # number of rows processed; not read in this cell

result_files = []
try:
    # Open inside the try block so that a failure while opening or parsing
    # still closes every file that was opened successfully.
    for result in results:
        result_files.append(open(result, "r"))

    # Read the files in lockstep, one line from each per iteration.
    # Assumes line n of every file refers to the same question -- TODO confirm.
    while True:
        qa_attempts = []
        for f in result_files:
            line = f.readline()
            if line:
                qa = json.loads(line)
                # When several files supply a row, the last one read wins.
                flag = qa["flag"]
                qa_attempts.append(qa["verify_result"].lower().startswith("yes"))

        # No file produced a line: every file is exhausted.
        if not qa_attempts:
            break

        if flag:
            filtered_count += 1
        else:
            total += 1
            if any(qa_attempts):
                correct += 1

        idx += 1
finally:
    for f in result_files:
        f.close()

# Report nan instead of raising ZeroDivisionError when nothing was scored.
pass_at_k = correct / total if total else float("nan")
print(f"Pass@{k}: {pass_at_k}")
print(filtered_count)

164
Pass@1: 0.6235294117647059
164


In [5]:
# calculate pass@k
k = 1

# Collect the 0-based row indices whose verify_result starts with "yes" in
# either of the two files below, then de-duplicate them.
# (An earlier, commented-out variant collected the question text instead of
# the row index and read the second file from a different path.)
with open(
    "/mnt/bn/videonasi18n/longlin.kylin/mmagent/data/annotations/results/0416/baseline_blindly_answers_verified.jsonl",
    "r",
) as f:
    filtered_questions = [
        i
        for i, line in enumerate(f)
        if json.loads(line)["verify_result"].lower().startswith("yes")
    ]

with open(
    "/mnt/bn/videonasi18n/longlin.kylin/mmagent/data/annotations/results/0417/verified_small_test.json",
    "r",
) as f:
    samples = json.load(f)

filtered_questions.extend(
    i
    for i, sample in enumerate(samples)
    if sample["verify_result"].lower().startswith("yes")
)

filtered_questions = list(set(filtered_questions))

print(len(filtered_questions))

# Score pass@k over the k verified result files of one experiment.
# A question counts as correct when any of its k attempts has a
# verify_result starting with "yes". Rows whose "flag" is truthy are left out
# of the score and tallied in filtered_count instead.
exp = "5_rounds_threshold_0_5_no_planning_qwen"
dataset_with_agent_answer_verified = "data/annotations/results/{exp}/small_test_qwen_with_agent_answer_verified_{round}.jsonl"

results = [
    dataset_with_agent_answer_verified.format(round=i, exp=exp) for i in range(k)
]
# results = ["data/annotations/results/0418/small_test_with_refined_agent_answer_verified.jsonl"]

total = 0
correct = 0
filtered_count = 0
idx = 0  # number of rows processed; not read in this cell

result_files = []
try:
    # Open inside the try block so that a failure while opening or parsing
    # still closes every file that was opened successfully.
    for result in results:
        result_files.append(open(result, "r"))

    # Read the files in lockstep, one line from each per iteration.
    # Assumes line n of every file refers to the same question -- TODO confirm.
    while True:
        qa_attempts = []
        for f in result_files:
            line = f.readline()
            if line:
                qa = json.loads(line)
                # When several files supply a row, the last one read wins.
                flag = qa["flag"]
                qa_attempts.append(qa["verify_result"].lower().startswith("yes"))

        # No file produced a line: every file is exhausted.
        if not qa_attempts:
            break

        if flag:
            filtered_count += 1
        else:
            total += 1
            if any(qa_attempts):
                correct += 1

        idx += 1
finally:
    for f in result_files:
        f.close()

# Report nan instead of raising ZeroDivisionError when nothing was scored.
pass_at_k = correct / total if total else float("nan")
print(f"Pass@{k}: {pass_at_k}")
print(filtered_count)

164


Pass@1: 0.6205882352941177
164


In [10]:
# Accuracy of the moviechat baseline on the questions that are NOT filtered.
# `filtered_questions` (row indices into small_test.jsonl) must already be
# defined by one of the pass@k cells.
with open(
    "/mnt/bn/videonasi18n/ypan/Project/3_MM_Agents/evaluation/updated_prompt/moviechat_eval.json",
    "r",
) as f:
    videos = json.load(f)

# Translate the filtered row indices into question text, since the eval file
# is grouped per video and cannot be matched by row index.
filtered_index_set = set(filtered_questions)
filtered_questions_ = []

with open("data/annotations/archived/small_test.jsonl", "r") as f:
    for i, line in enumerate(f):
        d = json.loads(line)
        if i in filtered_index_set:
            filtered_questions_.append(d["question"])

# Set lookup instead of scanning the list once per question.
filtered_question_set = set(filtered_questions_)

total = 0
correct = 0
idx = 0  # number of QA pairs visited; not read in this cell

for video in videos:
    for qa in video["qa_list"]:
        if qa["question"] not in filtered_question_set:
            total += 1
            # A missing "is_correct" key counts as incorrect. This used to be
            # handled by a bare `except:` that also hid unrelated errors and
            # skipped the idx increment.
            if qa.get("is_correct"):
                correct += 1
        idx += 1

print(total)
# Report nan instead of raising ZeroDivisionError when nothing was scored.
print(correct / total if total else float("nan"))

340
0.05588235294117647


In [8]:
# Accuracy of the verified gemini answers on the questions that are NOT
# filtered. `filtered_questions` (row indices into small_test.jsonl) must
# already be defined by one of the pass@k cells.
with open("data/annotations/archived/small_test.jsonl", "r") as f:
    filtered_questions_ = [
        d["question"]
        for i, d in enumerate(map(json.loads, f))
        if i in filtered_questions
    ]

total = 0
correct = 0
idx = 0

with open("data/annotations/results/0418/small_test_gemini_verified.jsonl", "r") as f:
    for sample in map(json.loads, f):
        if sample["question"] in filtered_questions_:
            continue
        total += 1
        if sample["verify_result"].lower().startswith("yes"):
            correct += 1

print(total)
print(correct / total)

340
0.5735294117647058


In [None]:
import json
import matplotlib.pyplot as plt

# Plot two histograms for one experiment's round-0 results and print how many
# samples have each session length.
exp = "large_retrieval"

with open(
    f"data/annotations/results/{exp}/small_test_with_agent_answer_0.jsonl", "r"
) as f:
    samples = [json.loads(line) for line in f]

# Per sample: summed len() of every entry in session[0], and the number of
# entries in session[0].
# presumably session[0] holds the retrieved memory entries -- confirm
mem_lens = [sum(len(mem) for mem in sample["session"][0]) for sample in samples]
lens = [len(d["session"][0]) for d in samples]

for values, xlabel, title in (
    (mem_lens, "Clip Length", "Distribution of Clip Lengths"),
    (lens, "Session Length", "Distribution of Session Lengths"),
):
    plt.figure(figsize=(10, 6))
    plt.hist(values, bins=30, edgecolor="black")
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.show()

# Tally how many samples share each session length.
length_counts = {}
for length in lens:
    length_counts[length] = 1 + length_counts.get(length, 0)

# Report the tally in ascending order of length.
for length in sorted(length_counts):
    print(f"Length {length}: {length_counts[length]} samples")

In [1]:
# Print the question of every SFT sample that contains at least one "search"
# action whose content mentions "<character_". Each question is printed once.
with open("data/annotations/sft/small_train_selected.jsonl", "r") as f:
    for line in f:
        sample = json.loads(line)
        has_character_search = any(
            action["action_type"] == "search"
            and "<character_" in action["action_content"]
            for action in sample["session"][1]
        )
        if has_character_search:
            print(sample["question"])

NameError: name 'json' is not defined

In [2]:
import json

# Count the rows of small_test.jsonl whose "flag" field is truthy.
with open("data/annotations/small_test.jsonl", "r") as f:
    count = sum(1 for sample in map(json.loads, f) if sample["flag"])
print(count)

164


In [2]:
import pandas as pd
from tqdm import tqdm
from mmagent.retrieve import verify_qa

# Re-judge every row of the evaluator sheet with verify_qa and store the
# verdict in a new column, then overwrite the same CSV.
with open("data/annotations/results/0421/gpt_evaluator.csv", "r") as csv_file:
    df = pd.read_csv(csv_file)

for index, row in tqdm(df.iterrows()):
    result = verify_qa(row["question"], row["answer"], row["model_answer"])
    if result.lower().startswith("yes"):
        verdict = "TRUE"
    else:
        verdict = "FALSE"
    # Row-wise assignment: the column only appears once a row has been judged.
    df.loc[index, "gpt4_evaluator (w/ updated prompt)"] = verdict

df.to_csv("data/annotations/results/0421/gpt_evaluator.csv", index=False)

0it [00:00, ?it/s]

100it [01:31,  1.09it/s]


In [5]:
import pandas as pd

# Reload the evaluator sheet into the module-level frame `df`.
with open("data/annotations/results/0421/gpt_evaluator.csv", "r") as csv_file:
    df = pd.read_csv(csv_file)


def get_accuracy(col, gt, frame=None):
    """Return the fraction of rows in which column ``col`` equals column ``gt``.

    Args:
        col: Name of the column holding the evaluator's verdicts.
        gt: Name of the column holding the reference verdicts.
        frame: DataFrame to score. Defaults to the notebook-level ``df`` so
            existing two-argument calls keep working.

    Returns:
        Agreement rate as a float in [0, 1], or ``nan`` when the frame has no
        rows (the row-by-row version raised ``ZeroDivisionError`` there).

    Raises:
        KeyError: If ``col`` or ``gt`` is not a column of the frame.
    """
    data = df if frame is None else frame
    if len(data) == 0:
        return float("nan")
    # Element-wise comparison; the mean of the boolean result is the share of
    # matching rows. Missing values never compare equal.
    return float((data[col] == data[gt]).mean())


# Agreement of each automatic evaluator column with each human reference
# column: three lines per reference, in the order listed below.
for reference in ("human_evaluator", "human_reasoning_evaluator"):
    for evaluator in (
        "gpt4_evaluator",
        "gpt4_reasoning_evaluator",
        "gpt4_evaluator (w/ updated prompt)",
    ):
        print(get_accuracy(evaluator, reference))

0.92
0.86
0.93
0.86
0.87
0.88
