In [2]:
from utils import *
from prompts import *
import tempfile
import ffmpeg
import cv2
import base64
import os
import numpy as np

FileNotFoundError: [Errno 2] No such file or directory: 'api_config.json'

In [None]:
def get_video_info(video_path):
    """Read basic metadata for a video file through OpenCV.

    Args:
        video_path (str): Path to the video file.

    Returns:
        dict: ``fps``, ``frames``, ``duration`` (``frames / fps``), ``path``,
        ``name`` (final path component), ``width``, ``height``, and the raw
        OpenCV property values stored under ``codec``
        (``CAP_PROP_CODEC_PIXEL_FORMAT``), ``format`` (``CAP_PROP_FORMAT``)
        and ``fourcc`` (``CAP_PROP_FOURCC``).

    Raises:
        ValueError: If the file cannot be opened, or if the reported frame
            rate is not a positive number (the duration cannot be computed).
    """
    video = cv2.VideoCapture(video_path)
    try:
        if not video.isOpened():
            raise ValueError("Could not open video file")

        fps = video.get(cv2.CAP_PROP_FPS)
        # `not fps > 0` also rejects NaN, which would otherwise slip through
        # and produce a meaningless duration.
        if not fps > 0:
            raise ValueError(
                f"Could not determine a valid FPS for video file: {video_path}"
            )
        frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

        video_info = {
            "fps": fps,
            "frames": frames,
            "duration": frames / fps,
            "path": video_path,
            "name": os.path.basename(video_path),
            "width": int(video.get(cv2.CAP_PROP_FRAME_WIDTH)),
            "height": int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)),
            "codec": video.get(cv2.CAP_PROP_CODEC_PIXEL_FORMAT),
            "format": video.get(cv2.CAP_PROP_FORMAT),
            "fourcc": video.get(cv2.CAP_PROP_FOURCC),
        }
    finally:
        # Release the capture handle on every path, including failures.
        video.release()

    # print video info
    print(video_info)

    return video_info


def process_video_clip(video_path, start_time, interval):
    """Cut one segment out of a video and return it as a Base64 string.

    The segment is re-encoded by ffmpeg (H.264 video, AAC audio, MP4
    container) into a temporary file, which is read back, Base64-encoded and
    then deleted.

    Args:
        video_path (str): Path to the source video.
        start_time (float): Segment start, passed to ffmpeg as ``ss``.
        interval (float): Segment length, passed to ffmpeg as ``t``.

    Returns:
        str: Base64 text (UTF-8 decoded) of the encoded MP4 segment.

    Raises:
        ffmpeg.Error: If ffmpeg fails; its stderr (when available) is printed
            before the exception is re-raised.
    """
    # Only the path is needed: ffmpeg writes the file itself, so the handle is
    # closed immediately and the file is removed manually in `finally`.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as handle:
        clip_path = handle.name

    try:
        # Extract video segment
        stream = ffmpeg.input(video_path, ss=start_time, t=interval)
        stream = ffmpeg.output(
            stream, clip_path, format="mp4", acodec="aac", vcodec="libx264"
        )
        # capture_stderr=True is what populates `Error.stderr`; without it the
        # attribute is None and the handler below had nothing to report.
        ffmpeg.run(stream, overwrite_output=True, capture_stderr=True)

        # Read the clip and convert to Base64
        with open(clip_path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    except ffmpeg.Error as e:
        stderr_text = (
            e.stderr.decode(errors="replace") if e.stderr else "<no stderr captured>"
        )
        print("FFmpeg Error:", stderr_text)
        raise

    finally:
        # Clean up on success and on failure so failed runs do not leak files.
        if os.path.exists(clip_path):
            os.remove(clip_path)

In [None]:
def answer_question(video_clips, qa, model, summarize_model="gpt-4o-2024-11-20"):
    """Answer one question by extracting evidence per clip, then summarising.

    Each clip is sent, together with the extraction prompt and the question,
    to ``model`` via ``parallel_get_response``. Replies that are blank or
    start with "none" (case-insensitive, ignoring surrounding whitespace) are
    discarded. If nothing remains the answer is ``"Unanswerable."``; otherwise
    the remaining replies are summarised into a final answer by
    ``summarize_model`` via ``get_response_with_retry``.

    Args:
        video_clips (list): Base64-encoded video clips, one request per clip.
        qa (dict): Must contain ``"question"``. Updated in place with
            ``"extracted_information"`` and ``"answer_baselines"``.
        model (str): Model used for the per-clip extraction step.
        summarize_model (str): Model used for the final summarisation step.

    Returns:
        dict: The same ``qa`` object that was passed in.
    """
    question = qa["question"]

    clip_prompts = [
        [
            {"type": "video_base64", "content": clip},
            {"type": "text", "content": prompt_baseline_answer_clipwise_extract},
            {"type": "text", "content": f"Question: {question}"},
            {"type": "text", "content": "Extracted information:"},
        ]
        for clip in video_clips
    ]
    messages = [generate_messages(parts) for parts in clip_prompts]
    responses = parallel_get_response(model, messages)

    # The first element of the return value is iterated as the per-clip
    # replies. Non-string entries are skipped rather than crashing on
    # `.lower()` -- presumably these are failed requests; TODO confirm
    # against parallel_get_response.
    extracted_information = [
        reply
        for reply in responses[0]
        if isinstance(reply, str)
        and reply.strip()
        and not reply.strip().lower().startswith("none")
    ]
    qa["extracted_information"] = extracted_information

    if not extracted_information:
        answer = "Unanswerable."
    else:
        summary_prompt = [
            {"type": "text", "content": prompt_baseline_answer_clipwise_summarize},
            {"type": "text", "content": f"Question: {question}"},
            {
                "type": "text",
                "content": f"Extracted information: {extracted_information}",
            },
            {"type": "text", "content": "Answer:"},
        ]
        response = get_response_with_retry(
            summarize_model, generate_messages(summary_prompt)
        )
        answer = response[0]

    qa["answer_baselines"] = answer
    return qa


def verify_answers(qas, model="gpt-4o-2024-11-20"):
    """Ask a judge model whether each baseline answer is correct.

    One request is built per QA item (the item itself, the verification
    prompt and a closing instruction) and all requests are sent through
    ``parallel_get_response``. A verdict counts as correct when it starts
    with "yes" (case-insensitive, ignoring leading whitespace).

    Args:
        qas (list): QA items to verify.
        model (str): Judge model name.

    Returns:
        tuple: ``(accuracy, results)`` where ``accuracy`` is the fraction of
        "yes" verdicts and ``results`` is the first element returned by
        ``parallel_get_response``. For an empty ``qas`` this is
        ``(0.0, [])`` and no request is sent.
    """
    # NOTE(review): `qa` is passed as the content of a "text" part as-is;
    # whether generate_messages serialises non-string content is not visible
    # here -- confirm.
    verification_prompts = [
        [
            {
                "type": "text",
                "content": qa,
            },
            {
                "type": "text",
                "content": prompt_benchmark_verify_answer,
            },
            {
                "type": "text",
                "content": "Now answer if the answer from the baseline is correct or not:",
            },
        ]
        for qa in qas
    ]
    if not verification_prompts:
        # Nothing to verify: avoid an empty request batch and a 0/0 division.
        return 0.0, []

    messages = [generate_messages(parts) for parts in verification_prompts]
    responses = parallel_get_response(model, messages)

    results = responses[0]
    if not results:
        return 0.0, results

    # calculate the accuracy of the answers
    correct = sum(
        1
        for result in results
        if isinstance(result, str) and result.strip().lower().startswith("yes")
    )
    accuracy = correct / len(results)

    return accuracy, results


def process_video(video, interval_seconds, model="gemini-1.5-pro-002"):
    """Split a video into fixed-length clips and answer its questions.

    The video is cut into consecutive clips of ``interval_seconds`` each.
    Every question in ``video["qa_list"]`` is then answered with
    ``answer_question`` against all clips. Questions are processed in batches
    sized from ``config[model]["qpm"]`` divided by the number of clips (at
    least one question per batch), each batch running in a thread pool.

    Args:
        video (dict): Must contain ``"path"`` (video file path) and
            ``"qa_list"`` (list of QA dicts). Updated in place.
        interval_seconds (float): Clip length, and spacing between clip
            start times, in seconds.
        model (str): Model used for per-clip extraction; also the key looked
            up in ``config`` for the ``qpm`` value.

    Returns:
        dict: The same ``video`` object, with ``"qa_list"`` replaced by the
        answered QA items in their original order.

    Raises:
        ValueError: If no clip could be extracted (e.g. zero duration).
    """
    video_path = video["path"]
    qa_list = video["qa_list"]

    # get_video_info already prints the metadata, so it is not printed again.
    video_info = get_video_info(video_path)

    # Process each interval; start times are handed over as plain floats.
    clips = [
        process_video_clip(video_path, float(start_time), interval_seconds)
        for start_time in np.arange(0, video_info["duration"], interval_seconds)
    ]
    if not clips:
        raise ValueError(f"No clips could be extracted from video: {video_path}")

    qpm = config[model]["qpm"]
    # Each question issues one request per clip. Clamp to 1 so a video with
    # more clips than `qpm` does not yield a batch size of 0 (an invalid
    # range step).
    # NOTE(review): nothing here waits between batches, so batching alone
    # does not enforce a per-minute limit -- confirm whether the response
    # helpers throttle.
    qa_batch_size = max(1, qpm // len(clips))
    qa_batches = [
        qa_list[i : i + qa_batch_size] for i in range(0, len(qa_list), qa_batch_size)
    ]

    answered_qa_list = []

    for qa_batch in qa_batches:
        # parallel question answering with multiple threads
        with ThreadPoolExecutor(max_workers=len(qa_batch)) as executor:
            futures = [
                executor.submit(answer_question, clips, qa, model) for qa in qa_batch
            ]
            # Collect in submission order so the saved QA order is stable.
            answered_qa_list.extend(future.result() for future in futures)

    video["qa_list"] = answered_qa_list
    return video

In [3]:
import json

ANNOTATIONS_IN = "data/annotations/video_list_CZ_modified.json"
ANNOTATIONS_OUT = "data/annotations/video_list_CZ_answer_clipwise.json"
CLIP_INTERVAL_SECONDS = 180

# The annotations contain non-ASCII (Chinese) text, so read and write them
# explicitly as UTF-8 instead of relying on the platform default encoding.
with open(ANNOTATIONS_IN, "r", encoding="utf-8") as f:
    data = json.load(f)

# process_video updates each video dict in place (it replaces "qa_list"),
# so `data` holds the answered questions once the loop finishes.
for video in data:
    process_video(video, CLIP_INTERVAL_SECONDS)

# ensure_ascii=False keeps the non-ASCII text readable in the output file
# rather than escaping it to \uXXXX sequences.
with open(ANNOTATIONS_OUT, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

{'video_id': 'CZ_1', 'video_url': 'https://www.youtube.com/watch?v=PnvZZwlN2yk', 'video_duration': '29:04', 'video_type': '综艺 - 游戏', 'qa_list': [{'question': 'Is Stewart Thompson a person who pursues a high-quality life?', 'answer': 'Yes.', 'question_type': '多线索推理,人物属性建模', 'knowledge': None, 'reasoning': '从Stewart讲述自己旅行方式倾向于商务舱可以判断。'}, {'question': 'What decade of age is J.F. Harris approximately in?', 'answer': 'He is approximately in his forties.', 'question_type': '多线索推理,人物属性建模', 'knowledge': None, 'reasoning': 'J.F. Harris提到过自己2003年曾经在芝加哥，后来又说20岁左右在芝加哥，综合推理得知现在J.F. Harris现在大概四十多岁。'}, {'question': 'Who had a different opinion from the others when voting out the person who was not a millionaire in the first round?', 'answer': 'Aaron.', 'question_type': '多跳推理,多模态/多语言推理', 'knowledge': None, 'reasoning': '首先需要定位到第一轮投票这个片段，再结合视觉听觉对五个人的发言判断。'}, {'question': 'What economic conditions do the families of children who have a trust fund usually come from?', 'answer': 'The family possesses subs