In [1]:
import pandas as pd
import numpy as np
import json

# Convert the raw annotation spreadsheets into one JSON file per CSV.
# Each CSV row carries one QA pair; a row with a non-empty "视频序号" cell
# additionally starts a new video, and rows without it belong to the most
# recently started video.
csv_paths = ["data/annotations/raw/CZ_3.csv", "data/annotations/raw/CZ_4.csv"]

for csv_path in csv_paths:
    # File stem (e.g. "CZ_3"); prefixes every video id and names the output file.
    marker = csv_path.split("/")[-1].split(".")[0]

    df = pd.read_csv(csv_path)

    # Normalise the headers in a single rename. Note that str.strip("\\n")
    # removes any leading/trailing backslash or "n" *characters*.
    # NOTE(review): presumably meant to drop a literal "\n" left in the
    # exported headers -- confirm against the raw CSVs.
    df = df.rename(columns={col: col.strip().strip("\\n") for col in df.columns})

    # Empty cells become "" so that they are falsy and JSON-serialisable.
    df = df.replace(np.nan, "")

    video = None
    video_list = []

    for row_number, row in df.iterrows():
        # A truthy id cell opens a new video (an id of 0 is treated like an
        # empty cell, exactly as before).
        if row["视频序号"]:
            if video is not None:
                video_list.append(video)
            video = {
                "video_id": marker + "_" + str(int(row["视频序号"])),
                "video_url": row["视频_url"],
                "video_duration": row["视频长度（min）"],
                "video_type": row["视频类型"],
                "qa_list": [],
            }

        if video is None:
            # Previously this surfaced as "'NoneType' object is not subscriptable".
            raise ValueError(
                f"{csv_path}: row {row_number} has a QA pair but no video has been started yet"
            )

        video["qa_list"].append(
            {
                "question": row["问题"],
                "answer": row["答案"],
                "question_type": row["问题类型"],
                "knowledge": row["若涉及通用知识推理，则写出推理步骤（知识点）"],
                "reasoning": row["依据（解题思路）"],
            }
        )

    # Flush the last video; skipped for an empty sheet so no None is stored.
    if video is not None:
        video_list.append(video)

    print(f"Detected {len(video_list)} videos from {marker}")

    # ensure_ascii=False emits raw non-ASCII text, so pin the encoding instead
    # of relying on the platform default.
    with open(f"data/annotations/raw/{marker}.json", "w", encoding="utf-8") as f:
        json.dump(video_list, f, ensure_ascii=False, indent=4)

Detected 100 videos from CZ_3
Detected 60 videos from CZ_4


In [2]:
import os
import json

# Sanity-check the annotation files: every video URL must be a standard
# YouTube watch URL. Offending URLs are appended to a log file in `log_dir`
# and counted.
with open("configs/processing_config.json", "r", encoding="utf-8") as config_file:
    processing_config = json.load(config_file)
log_dir = processing_config["log_dir"]
os.makedirs(log_dir, exist_ok=True)

count = 0
annotations_paths = [
    "data/annotations/raw/CZ_3.json",
    "data/annotations/raw/CZ_4.json",
]
error_log_path = os.path.join(log_dir, "annotation_processing_error.log")

for annotations_path in annotations_paths:
    marker = annotations_path.split("/")[-1].split(".")[0]
    with open(annotations_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    bad_urls = [
        video["video_url"]
        for video in data
        if not video["video_url"].startswith("https://www.youtube.com/watch?v=")
    ]
    count += len(bad_urls)

    # Open the log once per annotation file, and only when there is something
    # to report (the log is still created lazily, as before).
    if bad_urls:
        with open(error_log_path, "a", encoding="utf-8") as log_file:
            for url in bad_urls:
                log_file.write(f"Error video url from {marker}: {url}" + "\n")

print(f"Total error video urls: {count}")

Total error video urls: 0


In [3]:
from utils.chat_api import generate_messages, parallel_get_response
from prompts import prompt_refine_qa_list
from utils.general import validate_and_fix_python_list
import json
from tqdm import tqdm

# Send every video's QA list through the refinement prompt and overwrite the
# question / answer / reasoning fields with the model's output. The result is
# written next to the input as "<marker>_refined.json".
annotations_paths = [
    "data/annotations/raw/CZ_3.json",
    "data/annotations/raw/CZ_4.json",
]
model = "gpt-4o-2024-08-06"

for annotations_path in tqdm(annotations_paths):
    marker = annotations_path.split("/")[-1].split(".")[0]
    with open(annotations_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # One prompt per video, carrying only the three fields that get rewritten.
    # (Named `prompt_input` so the builtin `input` is not shadowed kernel-wide.)
    inputs = []
    for video in data:
        qa_subset = [
            {
                "question": qa["question"],
                "answer": qa["answer"],
                "reasoning": qa["reasoning"],
            }
            for qa in video["qa_list"]
        ]
        prompt_input = [
            {
                "type": "text",
                "content": prompt_refine_qa_list.format(qa_list=qa_subset),
            }
        ]
        inputs.append(prompt_input)

    messages = [generate_messages(prompt_input) for prompt_input in inputs]

    # Only the first element of the helper's return value is used.
    responses = list(parallel_get_response(model, messages)[0])
    if len(responses) != len(data):
        print(
            f"WARNING [{marker}]: got {len(responses)} responses for {len(data)} videos; "
            "unmatched videos keep their original text"
        )

    for video, response in zip(data, responses):
        translated_qa_list = list(validate_and_fix_python_list(response))
        # zip() stops at the shorter sequence, so a mismatch would otherwise
        # leave some QA pairs unrefined without any trace.
        if len(translated_qa_list) != len(video["qa_list"]):
            print(
                f"WARNING [{video['video_id']}]: expected {len(video['qa_list'])} refined QA "
                f"pairs, got {len(translated_qa_list)}"
            )
        for qa, translated_qa in zip(video["qa_list"], translated_qa_list):
            qa["question"] = translated_qa["question"]
            qa["answer"] = translated_qa["answer"]
            qa["reasoning"] = translated_qa["reasoning"]

    with open(f"data/annotations/raw/{marker}_refined.json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

100%|██████████| 2/2 [00:58<00:00, 29.44s/it]


In [4]:
import os
import json
from tqdm import tqdm

def parse_url(url):
    """Split a YouTube watch URL into its canonical form and its video id.

    Everything from the first "&" onwards is dropped, and the video id is the
    text following "watch?v=" in what remains.

    Args:
        url: URL string taken from the annotation file.

    Returns:
        A ``(pruned_url, video_id)`` tuple, or ``None`` when ``url`` is not on
        https://www.youtube.com/ or has no "watch?v=" component (shorts,
        embeds, playlists, ...). The latter case used to raise IndexError.
    """
    if not url.startswith("https://www.youtube.com/"):
        return None
    # str.split returns the whole string when the separator is absent, so no
    # explicit "&" check is needed.
    pruned_url = url.split("&")[0]
    parts = pruned_url.split("watch?v=")
    if len(parts) < 2:
        return None
    return pruned_url, parts[1]

# Attach the expected local download path to every video and normalise its
# URL. Videos whose URL is not a usable YouTube watch URL get an empty path
# and are counted as errors. Each annotation file is rewritten in place.
annotations_paths = [
    "data/annotations/raw/CZ_3_refined.json",
    "data/annotations/raw/CZ_4_refined.json",
]
base_save_dir = "/mnt/hdfs/foundation/longlin.kylin/mmagent/data/raw_videos"
refined_suffix = "_refined"

error_count = 0

for annotations_path in annotations_paths:
    stem = annotations_path.split("/")[-1].split(".")[0]
    # Remove the suffix explicitly: str.strip("_refined") strips a *set of
    # characters* from both ends and would mangle stems such as
    # "dev_1_refined" (-> "v_1").
    marker = stem[: -len(refined_suffix)] if stem.endswith(refined_suffix) else stem
    save_dir = os.path.join(base_save_dir, marker)
    os.makedirs(save_dir, exist_ok=True)
    with open(annotations_path, "r", encoding="utf-8") as f:
        videos = json.load(f)

    for video in tqdm(videos):
        url = video["video_url"]
        parsed = None
        if url.startswith("https://www.youtube.com/watch?v="):
            parsed = parse_url(url)
        # Treat an unparsable URL or an empty video id as an error instead of
        # producing a path like "<save_dir>/.mp4".
        if parsed is None or not parsed[1]:
            print(url)
            video["path"] = ""
            error_count += 1
            continue
        pruned_url, video_id = parsed
        video["path"] = os.path.join(save_dir, video_id + ".mp4")
        video["video_url"] = pruned_url

    with open(annotations_path, "w", encoding="utf-8") as f:
        json.dump(videos, f, indent=4, ensure_ascii=False)

print(error_count)

100%|██████████| 100/100 [00:00<00:00, 419430.40it/s]
100%|██████████| 60/60 [00:00<00:00, 258641.56it/s]

0





In [None]:
import json
import os

# Build the small test split: from each of the two source files take the
# first `samples_per_file` videos that have a valid watch URL and whose video
# file exists on disk.
small_test = []
samples_per_file = 50
with open("data/annotations/raw/CZ_1_refined.json", "r", encoding="utf-8") as f:
    CZ = json.load(f)
with open("data/annotations/raw/ZZ_1_refined.json", "r", encoding="utf-8") as f:
    ZZ = json.load(f)

# Same selection rule for both sources, so run it once per source instead of
# keeping two copies of the loop.
for source_videos in (CZ, ZZ):
    count = 0
    for video in source_videos:
        if not video["video_url"].startswith("https://www.youtube.com/watch?v="):
            continue
        if os.path.exists(video["path"]):
            small_test.append(video)
            count += 1
            if count >= samples_per_file:
                break

with open("data/annotations/small_test.json", "w", encoding="utf-8") as f:
    json.dump(small_test, f, indent=4, ensure_ascii=False)

In [None]:
import json

# Build the small training split: every video from the listed annotation
# files whose id is not part of the small test split.
data_list = [
    "data/annotations/raw/CZ_1_refined.json",
    "data/annotations/raw/CZ_2_refined.json",
    "data/annotations/raw/CZ_3_refined.json",
    "data/annotations/raw/ZZ_1_refined.json",
    "data/annotations/raw/ZZ_2_refined.json",
    "data/annotations/raw/ZZ_3_refined.json",
    "data/annotations/raw/ZZ_4_refined.json",
]
with open("data/annotations/small_test.json", "r", encoding="utf-8") as f:
    test_set = json.load(f)
test_id = [video["video_id"] for video in test_set]
# Set copy for O(1) membership tests; `test_id` stays a list so the count
# printed below is unchanged.
test_id_lookup = set(test_id)

small_training_set = []
for data_path in data_list:
    with open(data_path, "r", encoding="utf-8") as f:
        candidates = json.load(f)
    small_training_set.extend(
        video for video in candidates if video["video_id"] not in test_id_lookup
    )

print(len(test_id))
print(len(small_training_set))
with open("data/annotations/small_train.json", "w", encoding="utf-8") as f:
    json.dump(small_training_set, f)

In [None]:
import json
from utils.video_processing import get_video_info

# Select up to `max_train_videos` training videos whose height is exactly
# `required_height`, skipping repeated dedup keys and videos whose info cannot
# be read.
required_height = 720
max_train_videos = 500

with open("data/annotations/small_train.json", "r", encoding="utf-8") as f:
    videos = json.load(f)

filtered_videos = []
filtered_video_ids = []
seen_video_ids = set()  # O(1) lookup companion of `filtered_video_ids`
skipped_count = 0

for video in videos:
    try:
        video_info = get_video_info(video["path"])
        # NOTE(review): this parses "path" as if it were a watch URL. In the
        # cell that sets "path" it is a local file path, in which case the key
        # is simply the whole path -- confirm whether "video_url" was intended.
        video_id = video["path"].split("&")[0].split("watch?v=")[-1]
        if video_info["height"] == required_height and video_id not in seen_video_ids:
            filtered_videos.append(video)
            filtered_video_ids.append(video_id)
            seen_video_ids.add(video_id)
    except Exception:
        # Still best-effort (e.g. missing or unreadable file), but keep a
        # count so that a mass failure is visible instead of silent.
        skipped_count += 1
        continue

if skipped_count:
    print(f"Skipped {skipped_count} videos whose info could not be read")
print(len(filtered_videos))
with open("data/annotations/train_500.json", "w", encoding="utf-8") as f:
    json.dump(filtered_videos[:max_train_videos], f)

In [None]:
import json
from utils.general import *

# Directories used by add_paths(), read from the shared processing config.
# The config file is closed deterministically instead of leaking the handle.
with open("configs/processing_config.json", "r", encoding="utf-8") as config_file:
    processing_config = json.load(config_file)
save_dir = processing_config["save_dir"]  # joined with marker and a ".pkl" name
clip_dir = processing_config["input_dir"]  # joined with marker and video id


def add_paths(data):
    """Annotate each video in a JSON file with its memory and clip paths.

    The file is rewritten in place. For every video:

    * ``mem_path`` is ``<save_dir>/<marker>/<generate_file_name(path)>.pkl``
      if that file exists, otherwise ``""``.
    * ``clip_path`` is ``<clip_dir>/<marker>/<video file stem>`` if it exists,
      otherwise ``""``.

    Args:
        data: Path to a JSON file holding a list of video dicts with at least
            the keys ``"path"`` and ``"video_id"``.
    """
    with open(data, "r", encoding="utf-8") as f:
        videos = json.load(f)

    for video in videos:
        # File stem of the downloaded video, e.g. ".../abc.mp4" -> "abc".
        video_file_stem = video["path"].split("/")[-1].split(".")[0]
        # Video ids have the form "<marker>_<number>", so dropping the last
        # "_"-separated field recovers the marker whatever its length; a fixed
        # [:4] slice only works for four-character markers.
        marker = video["video_id"].rsplit("_", 1)[0]

        save_path = os.path.join(
            save_dir, marker, generate_file_name(video["path"]) + ".pkl"
        )
        video["mem_path"] = save_path if os.path.exists(save_path) else ""

        # With an empty "path" the joined clip path would collapse to the
        # marker directory itself, which usually exists; never report that
        # directory as this video's clips.
        clip_path = os.path.join(clip_dir, marker, video_file_stem)
        if video_file_stem and os.path.exists(clip_path):
            video["clip_path"] = clip_path
        else:
            video["clip_path"] = ""

    with open(data, "w", encoding="utf-8") as f:
        json.dump(videos, f, indent=4, ensure_ascii=False)


# Annotate every split with memory / clip paths, in the same order as before.
for split_file in (
    "data/annotations/small_test.json",
    "data/annotations/small_train.json",
    "data/annotations/train_500.json",
):
    add_paths(split_file)

In [None]:
import json


def convert_to_jsonl(data):
    """Flatten a per-video JSON file into one JSON line per QA pair.

    Every QA pair of every video becomes one record that repeats the video's
    id, URL and paths next to the question, answer and reasoning. The output
    is always written to ``data/annotations/<input file stem>.jsonl``,
    whatever directory the input file lives in.

    Args:
        data: Path to a JSON file holding a list of video dicts with the keys
            ``video_id``, ``video_url``, ``path``, ``clip_path``, ``mem_path``
            and ``qa_list``.

    Raises:
        KeyError: If a video or QA pair lacks one of the required keys.
    """
    # The annotation files contain raw non-ASCII text, so decode explicitly
    # rather than relying on the platform default encoding.
    with open(data, "r", encoding="utf-8") as f:
        videos = json.load(f)

    qas = [
        {
            "video_id": video["video_id"],
            "video_url": video["video_url"],
            "video_path": video["path"],
            "clip_path": video["clip_path"],
            "mem_path": video["mem_path"],
            "question": qa["question"],
            "answer": qa["answer"],
            "reasoning": qa["reasoning"],
        }
        for video in videos
        for qa in video["qa_list"]
    ]

    output_stem = data.split("/")[-1].split(".")[0]
    with open(f"data/annotations/{output_stem}.jsonl", "w", encoding="utf-8") as f:
        for qa in qas:
            f.write(json.dumps(qa) + "\n")


# Produce the per-question .jsonl companion of every split, in the same order.
for split_file in (
    "data/annotations/small_test.json",
    "data/annotations/small_train.json",
    "data/annotations/train_500.json",
):
    convert_to_jsonl(split_file)

In [None]:
def check_if_contains_mem(data):
    """Count the videos in a JSON annotation file that have a memory path.

    Args:
        data: Path to a JSON file holding a list of video dicts, each with a
            ``"mem_path"`` key.

    Returns:
        The number of videos whose ``"mem_path"`` is truthy (non-empty).

    Raises:
        KeyError: If a video has no ``"mem_path"`` key.
    """
    # Decode explicitly: the annotation files contain raw non-ASCII text.
    with open(data, "r", encoding="utf-8") as f:
        videos = json.load(f)
    return sum(1 for video in videos if video["mem_path"])


# Report, one line per split, how many videos already have a memory file.
for split_file in (
    "data/annotations/small_train.json",
    "data/annotations/small_test.json",
    "data/annotations/train_500.json",
):
    print(check_if_contains_mem(split_file))

In [2]:
import json

# Flag the questions in small_test.jsonl whose verification result starts
# with "yes" in either of two result files, then rewrite small_test.jsonl
# with a boolean "flag" field on every record.
# NOTE(review): records are matched purely by position, so both result files
# are assumed to list questions in the same order as small_test.jsonl --
# confirm.
baseline_results_path = "/mnt/bn/videonasi18n/longlin.kylin/mmagent/data/annotations/results/0416/baseline_blindly_answers_verified.jsonl"
verified_results_path = "/mnt/bn/videonasi18n/longlin.kylin/mmagent/data/annotations/results/0417/verified_small_test.json"
small_test_path = "data/annotations/small_test.jsonl"

# A set gives the union of both sources and O(1) lookups below.
flagged_indices = set()

with open(baseline_results_path, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        sample = json.loads(line)
        if sample["verify_result"].lower().startswith("yes"):
            flagged_indices.add(i)

with open(verified_results_path, "r", encoding="utf-8") as f:
    samples = json.load(f)
for i, sample in enumerate(samples):
    if sample["verify_result"].lower().startswith("yes"):
        flagged_indices.add(i)

filtered_questions = list(flagged_indices)

print(len(filtered_questions))

# Read everything before reopening the same file for writing.
data_with_flags = []
with open(small_test_path, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        sample = json.loads(line)
        sample["flag"] = i in flagged_indices
        data_with_flags.append(sample)

with open(small_test_path, "w", encoding="utf-8") as f:
    for sample in data_with_flags:
        f.write(json.dumps(sample) + "\n")

164
