In [None]:
import pandas as pd
import numpy as np
import json

csv_paths = ["data/annotations/CZ_1.csv"]

# The literal two-character sequence backslash + "n" that may surround a CSV
# header (this is NOT a real newline character).
_LITERAL_NEWLINE = "\\n"


def _clean_column_name(name):
    r"""Strip surrounding whitespace and literal ``\n`` markers from a header.

    ``str.strip("\\n")`` treats its argument as a *set of characters* and would
    also eat a legitimate leading/trailing ``n`` or backslash (for example
    ``"question"`` -> ``"questio"``), so the marker is removed as a whole
    sequence instead.
    """
    cleaned = name.strip()
    while True:
        if cleaned.startswith(_LITERAL_NEWLINE):
            cleaned = cleaned[len(_LITERAL_NEWLINE):].lstrip()
        elif cleaned.endswith(_LITERAL_NEWLINE):
            cleaned = cleaned[: -len(_LITERAL_NEWLINE)].rstrip()
        else:
            return cleaned


# Convert each annotation CSV into a JSON list of videos, each carrying its own
# list of question/answer records.
for csv_path in csv_paths:
    # File name without directory or extension, e.g. "CZ_1".
    marker = csv_path.split("/")[-1].split(".")[0]

    df = pd.read_csv(csv_path)
    # Normalise every header in one pass.
    df = df.rename(columns=_clean_column_name)

    # replace nan with "" so that empty cells can be detected below
    df = df.replace(np.nan, "")

    video = None
    video_list = []

    for row_index, row in df.iterrows():
        video_no = row["视频序号"]
        # A non-empty video number starts a new video; rows with an empty one
        # only contribute an additional QA pair to the current video.
        # Compare against "" explicitly so that a video numbered 0 still counts.
        starts_new_video = not (isinstance(video_no, str) and video_no == "")
        if starts_new_video:
            if video is not None:
                video_list.append(video)
            video = {
                "video_id": marker + "_" + str(int(video_no)),
                "video_url": row["视频_url"],
                "video_duration": row["视频长度（min）"],
                "video_type": row["视频类型"],
                "qa_list": [],
            }
        elif video is None:
            # Without this guard the append below fails with an opaque
            # "'NoneType' object is not subscriptable".
            raise ValueError(
                f"{csv_path}: row {row_index} has no 视频序号 and no preceding video row"
            )

        video["qa_list"].append(
            {
                "question": row["问题"],
                "answer": row["答案"],
                "question_type": row["问题类型"],
                "knowledge": row["若涉及通用知识推理，则写出推理步骤（知识点）"],
                "reasoning": row["依据（解题思路）"],
            }
        )

    # Flush the last video; skip when the CSV had no rows at all so the output
    # is [] rather than [null].
    if video is not None:
        video_list.append(video)

    print(f"Detected {len(video_list)} videos from {marker}")

    # save the video_list to a json file
    # NOTE(review): this relies on the platform default text encoding; if an
    # explicit encoding is introduced, apply it to every reader of this file too.
    with open(f"data/annotations/{marker}.json", "w") as f:
        json.dump(video_list, f, ensure_ascii=False, indent=4)

In [13]:
import os
import json

# Load the processing configuration; the context manager closes the file
# handle instead of leaving it to the garbage collector.
with open("configs/processing_config.json") as config_file:
    processing_config = json.load(config_file)
log_dir = processing_config["log_dir"]
os.makedirs(log_dir, exist_ok=True)
error_log_path = os.path.join(log_dir, "annotation_processing_error.log")

# Running total of video URLs that do not point at www.youtube.com.
count = 0
annotaions_paths = ["data/annotations/CZ_1.json"]

for annotaions_path in annotaions_paths:
    # File name without directory or extension, e.g. "CZ_1".
    marker = annotaions_path.split("/")[-1].split(".")[0]
    with open(annotaions_path, "r") as f:
        data = json.load(f)

    # Collect every URL that is not a string starting with the expected prefix.
    # The isinstance check reports non-string values instead of raising
    # AttributeError on ``.startswith``.
    bad_urls = [
        video["video_url"]
        for video in data
        if not (
            isinstance(video["video_url"], str)
            and video["video_url"].startswith("https://www.youtube.com/")
        )
    ]
    count += len(bad_urls)

    # Append to the log only when there is something to report, so a clean run
    # still does not create the file; open it once per annotation file rather
    # than once per bad URL.
    if bad_urls:
        with open(error_log_path, "a") as log_file:
            for url in bad_urls:
                log_file.write(f"Error video url from {marker}: {url}" + "\n")

print(f"Total error video urls: {count}")

SyntaxError: unterminated string literal (detected at line 14) (1444185002.py, line 14)

In [None]:
from utils.chat_api import generate_messages, parallel_get_response
from prompts import prompt_refine_qa_list
from utils.general import validate_and_fix_python_list
import json

annotaions_paths = ["data/annotations/CZ_1.json"]
# Model identifier passed to parallel_get_response; constant across files.
model = "gpt-4o-2024-05-13"

# For every annotation file: send each video's question/answer pairs through
# the refinement prompt, overwrite the originals with the refined text, and
# save the result next to the input as "<marker>_refined.json".
for annotaions_path in annotaions_paths:
    # File name without directory or extension, e.g. "CZ_1".
    marker = annotaions_path.split("/")[-1].split(".")[0]
    with open(annotaions_path, "r") as f:
        data = json.load(f)

    # One prompt payload per video, in the same order as ``data``.
    inputs = []
    for video in data:
        qa_pairs = [
            {"question": qa["question"], "answer": qa["answer"]}
            for qa in video["qa_list"]
        ]
        inputs.append(
            [
                {
                    "type": "text",
                    "content": prompt_refine_qa_list.format(qa_list=qa_pairs),
                }
            ]
        )

    # Named ``prompt_input`` rather than ``input`` so the builtin is not
    # shadowed for the rest of the kernel session.
    messages = [generate_messages(prompt_input) for prompt_input in inputs]

    # NOTE(review): element 0 of the return value is presumably the responses
    # in request order - confirm against utils.chat_api.
    responses = list(parallel_get_response(model, messages)[0])

    # zip() would silently drop trailing videos (or pair them with the wrong
    # response) on a count mismatch, so fail before anything is written.
    if len(responses) != len(data):
        raise ValueError(
            f"Expected {len(data)} responses for {marker}, got {len(responses)}"
        )

    for video, response in zip(data, responses):
        refined_qa_list = validate_and_fix_python_list(response)

        # The model may return a malformed list or a different number of items.
        # Pairing it with zip() would refine only part of the video or
        # misalign pairs, so leave this video untouched and report it.
        if not isinstance(refined_qa_list, list) or len(refined_qa_list) != len(
            video["qa_list"]
        ):
            print(
                f"Skipping refinement for {video.get('video_id')}: "
                f"expected {len(video['qa_list'])} refined QA pairs"
            )
            print("=" * 100)
            continue

        for qa, refined_qa in zip(video["qa_list"], refined_qa_list):
            print(f"Original question: {qa['question']}")
            print(f"Refined question: {refined_qa['question']}")
            print(f"Original answer: {qa['answer']}")
            print(f"Refined answer: {refined_qa['answer']}")
            qa["question"] = refined_qa["question"]
            qa["answer"] = refined_qa["answer"]
            print("-" * 100)
        print("=" * 100)

    with open(f"data/annotations/{marker}_refined.json", "w") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)