In [6]:
import pandas as pd
import numpy as np
import json

csv_path = "data/annotations/CZ_1.csv"
# File stem of the CSV (text before the first "."); it prefixes every
# video_id and names the output JSON file.
marker = csv_path.split("/")[-1].split(".")[0]


def _clean_column_name(name):
    r"""Return ``name`` without surrounding whitespace or literal ``\n`` markers.

    ``str.strip("\\n")`` interprets its argument as a *set* of characters, so
    it would also remove a legitimate leading/trailing ``n`` or backslash from
    the header text. Only whole backslash-n sequences are removed here, and
    whitespace is re-trimmed after each removal.
    """
    literal_newline = "\\n"
    name = name.strip()
    while name.endswith(literal_newline):
        name = name[: -len(literal_newline)].rstrip()
    while name.startswith(literal_newline):
        name = name[len(literal_newline):].lstrip()
    return name


# Normalise all headers in one pass instead of re-binding df once per column.
df = pd.read_csv(csv_path).rename(columns=_clean_column_name)

# Replace NaN with "" so that blank cells are falsy in the test below and are
# written to the JSON as empty strings.
df = df.replace(np.nan, "")

video = None
video_list = []

for _, row in df.iterrows():
    # A non-empty 视频序号 cell starts a new video; rows where it is blank are
    # additional QA rows for the video opened most recently.
    if row["视频序号"]:
        if video is not None:
            video_list.append(video)
        video = {
            "video_id": marker + "_" + str(int(row["视频序号"])),
            "video_url": row["视频_url"],
            "video_duration": row["视频长度（min）"],
            "video_type": row["视频类型"],
            "qa_list": [],
        }
    if video is None:
        # Previously this surfaced as an opaque "'NoneType' object is not
        # subscriptable" error; fail with an explanation instead.
        raise ValueError(
            f"{csv_path}: found a QA row before any row with a 视频序号 value"
        )
    video["qa_list"].append(
        {
            "question": row["问题"],
            "answer": row["答案"],
            "question_type": row["问题类型"],
            "knowledge": row["若涉及通用知识推理，则写出推理步骤（知识点）"],
            "reasoning": row["依据（解题思路）"],
        }
    )

# Flush the last video. Skipped when the CSV had no rows, so that a stray
# None is not written into the JSON.
if video is not None:
    video_list.append(video)

print(len(video_list))
if video_list:
    print(video_list[0])

# Save the video_list to a JSON file.
# NOTE(review): the file is opened with the platform default encoding while
# ensure_ascii=False emits raw non-ASCII text; consider passing
# encoding="utf-8" here AND in every cell that reads this file back.
with open(f"data/annotations/{marker}.json", "w") as f:
    json.dump(video_list, f, ensure_ascii=False, indent=4)

100
{'video_id': 'CZ_1_1', 'video_url': 'https://www.youtube.com/watch?v=Efk3K4epEzg', 'video_duration': '20:13', 'video_type': '综艺 - 纪实（真人秀）', 'qa_list': [{'question': 'Which collection has the highest starting price？', 'answer': 'Pirate Ship Float', 'question_type': '多线索推理', 'knowledge': None, 'reasoning': '一共五件藏品线性出现，模型需要在多个时间片段获得藏品的要价信息进行比对。'}, {'question': 'What did the authentication expert do after examining the ink marks on the Led Zeppelin album? ', 'answer': 'Compare the handwriting.', 'question_type': '多跳推理', 'knowledge': None, 'reasoning': '模型要先定位到鉴定专辑墨迹的环节，在分析鉴定专家的行为，从而知道他再比对笔迹'}, {'question': "Which collection is Boss's favorite", 'answer': 'the album of Led Zeppelin', 'question_type': '多跳推理,人物属性建模', 'knowledge': None, 'reasoning': '模型应首先从第五位卖家片段得知Rick是老板，再搜索Rick对不同藏品的评价和反应，从Rick要将专辑收藏而非出售知道他最喜欢齐柏林飞艇的专辑'}, {'question': "Whether Rick trusts Trump's abilities", 'answer': 'NO', 'question_type': '多线索推理,人物属性建模', 'knowledge': None, 'reasoning': '从Rick给Trump发短信指挥他工作，并在和他人对话中说tru

In [7]:
annotaions_path = "data/annotations/CZ_1.json"
# File stem of the annotations file; used to name the error report.
marker = annotaions_path.split("/")[-1].split(".")[0]
# NOTE(review): read with the platform default encoding — keep this in sync
# with whatever encoding the cell that writes this JSON uses.
with open(annotaions_path, "r") as f:
    data = json.load(f)

# Collect every URL that is not a string starting with the expected YouTube
# prefix. Non-string values (e.g. a JSON null) are treated as errors instead
# of raising AttributeError on .startswith.
error_urls = [
    video["video_url"]
    for video in data
    if not (
        isinstance(video["video_url"], str)
        and video["video_url"].startswith("https://www.youtube.com/")
    )
]
count = len(error_urls)

# Write the report once, truncating any previous content. The earlier version
# re-opened the file in append mode for every bad URL, so re-running the cell
# duplicated entries and the file no longer matched the printed count.
# As before, no file is created when there is nothing to report.
if error_urls:
    with open(f"data/annotations/{marker}_error_video_url.txt", "w") as f:
        f.writelines(f"{url}\n" for url in error_urls)

print(f"Total error video urls: {count}")

Total error video urls: 1


In [9]:
from utils.chat_api import generate_messages, parallel_get_response
from prompts import prompt_refine_qa_list
from utils.general import validate_and_fix_python_list
import json

annotaions_path = "data/annotations/CZ_1.json"
# File stem of the annotations file; used to name the refined output.
marker = annotaions_path.split("/")[-1].split(".")[0]
# NOTE(review): read with the platform default encoding — keep this in sync
# with whatever encoding the cell that writes this JSON uses.
with open(annotaions_path, "r") as f:
    data = json.load(f)

# Build one prompt per video containing only the question/answer fields.
# The loop variable is deliberately NOT called `input`: at cell level that
# would rebind the builtin input() for the rest of the kernel session.
inputs = []
for video in data:
    qa_pairs = [
        {"question": qa["question"], "answer": qa["answer"]}
        for qa in video["qa_list"]
    ]
    inputs.append(
        [{"type": "text", "content": prompt_refine_qa_list.format(qa_list=qa_pairs)}]
    )

messages = [generate_messages(prompt_input) for prompt_input in inputs]
model = "gpt-4o-2024-05-13"

# Only the first element of the helper's return value is used here.
responses = parallel_get_response(model, messages)[0]

for video, response in zip(data, responses):
    refined_qa_list = validate_and_fix_python_list(response)

    # zip() stops at the shorter sequence, so a response with a different
    # number of items would be applied partially and possibly misaligned,
    # silently pairing questions with the wrong refined text. Keep the
    # original QA pairs for such a video and report it instead.
    # NOTE(review): assumes validate_and_fix_python_list returns a list or
    # tuple on success — confirm against the helper.
    if not isinstance(refined_qa_list, (list, tuple)) or len(refined_qa_list) != len(
        video["qa_list"]
    ):
        print(
            f"WARNING: skipping {video.get('video_id')}: expected "
            f"{len(video['qa_list'])} refined QA pairs, got {refined_qa_list!r}"
        )
        print("=" * 100)
        continue

    for qa, refined_qa in zip(video["qa_list"], refined_qa_list):
        print(f"Original question: {qa['question']}")
        print(f"Refined question: {refined_qa['question']}")
        print(f"Original answer: {qa['answer']}")
        print(f"Refined answer: {refined_qa['answer']}")
        # Overwrite in place so the other fields of each QA entry are kept.
        qa["question"] = refined_qa["question"]
        qa["answer"] = refined_qa["answer"]
        print("-" * 100)
    print("=" * 100)

with open(f"data/annotations/{marker}_refined.json", "w") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

Original question: Which collection has the highest starting price？
Refined question: Which collection has the highest starting price?
Original answer: Pirate Ship Float
Refined answer: Pirate Ship Float.
----------------------------------------------------------------------------------------------------
Original question: What did the authentication expert do after examining the ink marks on the Led Zeppelin album? 
Refined question: What did the authentication expert do after examining the ink marks on the Led Zeppelin album?
Original answer: Compare the handwriting.
Refined answer: Compared the handwriting.
----------------------------------------------------------------------------------------------------
Original question: Which collection is Boss's favorite
Refined question: Which collection is Boss's favorite?
Original answer: the album of Led Zeppelin
Refined answer: The album of Led Zeppelin.
-------------------------------------------------------------------------------------