In [1]:
import pandas as pd
import numpy as np
import json

# Convert each annotation CSV into a JSON list of videos, each carrying its QA pairs.
csv_paths = ["data/annotations/ZZ_4_2.csv", "data/annotations/ZZ_5.csv"]
for csv_path in csv_paths:
    # The file stem (e.g. "ZZ_4_2") prefixes every video id and names the output JSON.
    marker = csv_path.split("/")[-1].split(".")[0]

    df = pd.read_csv(csv_path)
    # Normalise all headers in one pass: trim whitespace, then trim any leading or
    # trailing backslash / "n" characters ("\\n" is the two-character literal, not a
    # newline; real newlines are already removed by the first strip()).
    df = df.rename(columns=lambda col: col.strip().strip("\\n"))

    # replace nan with "" so that empty cells are falsy and serialise as empty strings
    df = df.replace(np.nan, "")

    video = None
    video_list = []

    for row_number, (_, row) in enumerate(df.iterrows(), start=1):
        # A row with a video index starts a new video; any other row only adds a
        # QA pair to the video that is currently open.
        if row["视频序号"]:
            if video is not None:
                video_list.append(video)
            video = {
                "video_id": marker + "_" + str(int(row["视频序号"])),
                "video_url": row["视频_url"],
                "video_duration": row["视频长度（min）"],
                "video_type": row["视频类型"],
                "qa_list": [],
            }
        elif video is None:
            # Previously this surfaced as an opaque TypeError on None["qa_list"].
            raise ValueError(
                f"{csv_path}: data row {row_number} has no video index and no "
                "preceding video row to attach its QA pair to"
            )
        qa = {
            "question": row["问题"],
            "answer": row["答案"],
            "question_type": row["问题类型"],
            "knowledge": row["若涉及通用知识推理，则写出推理步骤（知识点）"],
            "reasoning": row["依据（解题思路）"],
        }
        video["qa_list"].append(qa)

    # Flush the last open video; skip when the CSV had no rows so that None is
    # never written into the output list.
    if video is not None:
        video_list.append(video)

    print(f"Detected {len(video_list)} videos from {marker}")

    # save the video_list to a json file
    with open(f"data/annotations/{marker}.json", "w") as f:
        json.dump(video_list, f, ensure_ascii=False, indent=4)

Detected 30 videos from ZZ_4_2
Detected 100 videos from ZZ_5


In [6]:
import os
import json

# Load the processing settings; the context manager closes the config file
# instead of leaving the handle to the garbage collector.
with open("configs/processing_config.json") as config_file:
    processing_config = json.load(config_file)
log_dir = processing_config["log_dir"]
os.makedirs(log_dir, exist_ok=True)
error_log_path = os.path.join(log_dir, "annotation_processing_error.log")

count = 0
annotaions_paths = ["data/annotations/ZZ_4_2.json", "data/annotations/ZZ_5.json"]

# Every video URL is expected to be a YouTube watch link; anything else is
# counted and appended to the error log.
for annotaions_path in annotaions_paths:
    marker = annotaions_path.split("/")[-1].split(".")[0]
    with open(annotaions_path, "r") as f:
        data = json.load(f)

    bad_urls = [
        video["video_url"]
        for video in data
        if not video["video_url"].startswith("https://www.youtube.com/watch?v=")
    ]
    if bad_urls:
        count += len(bad_urls)
        # Open the log once per annotation file (and only when there is something
        # to report) rather than once per offending URL.
        with open(error_log_path, "a") as log_file:
            for url in bad_urls:
                log_file.write(f"Error video url from {marker}: {url}" + "\n")

print(f"Total error video urls: {count}")

Total error video urls: 0


In [3]:
from utils.chat_api import generate_messages, parallel_get_response
from prompts import prompt_refine_qa_list
from utils.general import validate_and_fix_python_list
import json

annotaions_paths = ["data/annotations/ZZ_4_2.json", "data/annotations/ZZ_5.json"]
# The model is the same for every annotation file, so it is set once up front.
model = "gpt-4o-2024-05-13"

for annotaions_path in annotaions_paths:
    marker = annotaions_path.split("/")[-1].split(".")[0]
    with open(annotaions_path, "r") as f:
        data = json.load(f)

    # Build one prompt per video from the question/answer text of its QA pairs only.
    inputs = []
    for video in data:
        qa_list = [
            {"question": qa["question"], "answer": qa["answer"]}
            for qa in video["qa_list"]
        ]
        inputs.append(
            [{"type": "text", "content": prompt_refine_qa_list.format(qa_list=qa_list)}]
        )

    # `prompt_input` rather than `input`, which would shadow the builtin.
    messages = [generate_messages(prompt_input) for prompt_input in inputs]

    # NOTE(review): element 0 of the return value is treated as the responses,
    # presumably one per message in the same order - confirm against utils.chat_api.
    responses = parallel_get_response(model, messages)[0]

    for video, response in zip(data, responses):
        # NOTE(review): assumed to return a list of {"question", "answer"} dicts - confirm.
        refined_qa_list = validate_and_fix_python_list(response)
        if len(refined_qa_list) != len(video["qa_list"]):
            # zip() below stops at the shorter list, so surface the mismatch
            # instead of silently leaving some pairs unrefined.
            print(
                f"Warning: {video['video_id']} has {len(video['qa_list'])} QA pairs "
                f"but {len(refined_qa_list)} refined pairs were returned; "
                "unmatched pairs are left unchanged"
            )
        for qa, refined_qa in zip(video["qa_list"], refined_qa_list):
            print(f"Original question: {qa['question']}")
            print(f"Refined question: {refined_qa['question']}")
            print(f"Original answer: {qa['answer']}")
            print(f"Refined answer: {refined_qa['answer']}")
            # Overwrite in place so the remaining annotation fields are kept.
            qa["question"] = refined_qa["question"]
            qa["answer"] = refined_qa["answer"]
            print("-" * 100)
        print("=" * 100)

    with open(f"data/annotations/{marker}_refined.json", "w") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

Original question: Which one has better remote control car operation skills, Hal or Benji?
Refined question: Who has better remote control car operation skills, Hal or Benji?
Original answer: Hal.
Refined answer: Hal.
----------------------------------------------------------------------------------------------------
Original question: Is Hal proficient in assembling remote-controlled cars?
Refined question: Is Hal proficient in assembling remote-controlled cars?
Original answer: Yes.
Refined answer: Yes.
----------------------------------------------------------------------------------------------------
Original question: What is the marked price of the car that Benji chose?
Refined question: What is the marked price of the car that Benji chose?
Original answer: Twenty-nine dollars and ninety-nine cents.
Refined answer: Twenty-nine dollars and ninety-nine cents.
----------------------------------------------------------------------------------------------------
Original question: Is R

In [None]:
import json
import os

def _pick_local_youtube_videos(videos, limit):
    """Return up to `limit` videos, in order, that are usable for the test set.

    A video is kept when its "video_url" starts with the YouTube watch prefix
    and the file named by its "path" exists on disk.
    """
    picked = []
    for video in videos:
        # Checked before appending so that a limit of 0 selects nothing.
        if len(picked) >= limit:
            break
        if not video["video_url"].startswith("https://www.youtube.com/watch?v="):
            continue
        if os.path.exists(video["path"]):
            picked.append(video)
    return picked


small_test = []
samples_per_file = 50
with open("data/annotations/CZ_1_refined.json", "r") as f:
    CZ = json.load(f)
with open("data/annotations/ZZ_1_refined.json", "r") as f:
    ZZ = json.load(f)

# Same selection rule for both sources; CZ entries come first, then ZZ.
for source_videos in (CZ, ZZ):
    small_test.extend(_pick_local_youtube_videos(source_videos, samples_per_file))

with open("data/annotations/small_test.json", "w") as f:
    json.dump(small_test, f, indent=4, ensure_ascii=False)

In [None]:
import json

data_list = [
    "data/annotations/CZ_1_refined.json",
    "data/annotations/CZ_2_refined.json",
    "data/annotations/CZ_3_refined.json",
    "data/annotations/ZZ_1_refined.json",
    "data/annotations/ZZ_2_refined.json",
    "data/annotations/ZZ_3_refined.json",
    "data/annotations/ZZ_4_refined.json",
]
# Context manager so the test-set file handle is closed deterministically.
with open("data/annotations/small_test.json", "r") as f:
    test_set = json.load(f)
test_id = [video["video_id"] for video in test_set]
# Set copy for constant-time membership checks; `test_id` stays a list so the
# length printed below is unchanged.
test_id_lookup = set(test_id)

# The training set is every video from the refined files that is not in the test set.
small_training_set = []
for data_path in data_list:
    with open(data_path, "r") as f:
        videos = json.load(f)
    small_training_set.extend(
        video for video in videos if video["video_id"] not in test_id_lookup
    )

print(len(test_id))
print(len(small_training_set))
with open("data/annotations/small_train.json", "w") as f:
    json.dump(small_training_set, f)

In [1]:
import json
from utils.general import *

# Read the processing settings once; the context manager closes the file handle.
with open("configs/processing_config.json") as config_file:
    processing_config = json.load(config_file)
# Directories that add_paths() probes for per-video outputs and clips.
save_dir = processing_config["save_dir"]
clip_dir = processing_config["input_dir"]


def add_paths(data):
    """Add "mem_path" and "clip_path" to every video in the JSON file `data`.

    The file is read, each video entry is updated, and the result is written
    back to the same path. A field is set to the candidate path when that path
    exists on disk, otherwise to an empty string.

    Args:
        data: Path to a JSON file holding a list of video dicts that each have
            "path" and "video_id" keys.
    """
    with open(data, "r") as src:
        entries = json.load(src)

    for entry in entries:
        # Stem of the video file: last "/" component, cut at its first ".".
        stem = entry["path"].rsplit("/", 1)[-1].partition(".")[0]
        # The first four characters of the id select the sub-directory.
        group = entry["video_id"][:4]

        candidate_mem = os.path.join(
            save_dir, group, generate_file_name(entry["path"]) + ".pkl"
        )
        candidate_clip = os.path.join(clip_dir, group, stem)

        entry["mem_path"] = candidate_mem if os.path.exists(candidate_mem) else ""
        entry["clip_path"] = candidate_clip if os.path.exists(candidate_clip) else ""

    with open(data, "w") as dst:
        json.dump(entries, dst, indent=4, ensure_ascii=False)


# Annotate the test split first, then the training split.
for annotation_file in (
    "data/annotations/small_test.json",
    "data/annotations/small_train.json",
):
    add_paths(annotation_file)

In [2]:
import json


def convert_to_jsonl(data):
    """Flatten a per-video JSON annotation file into one JSON line per QA pair.

    Each output record repeats the video's id, url, path, clip path and memory
    path next to one question/answer pair. The output is written to
    data/annotations/<stem>.jsonl, where <stem> is the input file name up to
    its first ".".

    Args:
        data: Path to a JSON file holding a list of video dicts.
    """
    with open(data, "r") as src:
        videos = json.load(src)

    records = [
        {
            "video_id": video["video_id"],
            "video_url": video["video_url"],
            "video_path": video["path"],
            "clip_path": video["clip_path"],
            "mem_path": video["mem_path"],
            "question": qa["question"],
            "answer": qa["answer"],
        }
        for video in videos
        for qa in video["qa_list"]
    ]

    stem = data.split("/")[-1].split(".")[0]
    with open(f"data/annotations/{stem}.jsonl", "w") as out:
        out.writelines(json.dumps(record) + "\n" for record in records)


# Flatten the test split first, then the training split.
for annotation_file in (
    "data/annotations/small_test.json",
    "data/annotations/small_train.json",
):
    convert_to_jsonl(annotation_file)

In [3]:
def check_if_contains_mem(data):
    """Count the videos in the JSON file `data` that have a non-empty "mem_path".

    Args:
        data: Path to a JSON file holding a list of video dicts, each with a
            "mem_path" key.

    Returns:
        The number of entries whose "mem_path" value is truthy.
    """
    with open(data, "r") as src:
        entries = json.load(src)
    return sum(1 for entry in entries if entry["mem_path"])


# Report how many videos have a memory file: training split first, then test split.
for annotation_file in (
    "data/annotations/small_train.json",
    "data/annotations/small_test.json",
):
    print(check_if_contains_mem(annotation_file))

526
100
