In [1]:
from utils import *
from prompts import *
import tempfile
import ffmpeg
import cv2
import base64
import os
import numpy as np

In [2]:
def get_video_info(video_path):
    """Read basic metadata for a video file through OpenCV.

    Args:
        video_path (str): Path to the video file.

    Returns:
        dict: Keys ``fps``, ``frames``, ``duration`` (``frames / fps``, in
        seconds), ``path``, ``name`` (final path component), ``width``,
        ``height``, ``codec``, ``format`` and ``fourcc``. The last three are
        the raw numeric values OpenCV reports for
        ``CAP_PROP_CODEC_PIXEL_FORMAT``, ``CAP_PROP_FORMAT`` and
        ``CAP_PROP_FOURCC``; they are not decoded here.

    Raises:
        ValueError: If the file cannot be opened, or if the reported frame
            rate is not a positive number (the duration would be undefined).
    """
    video = cv2.VideoCapture(video_path)
    try:
        if not video.isOpened():
            raise ValueError("Could not open video file")

        fps = video.get(cv2.CAP_PROP_FPS)
        # `not fps > 0` also rejects NaN, which a plain `fps <= 0` would let through.
        if not fps > 0:
            raise ValueError(f"Could not determine a valid frame rate (got {fps!r})")

        frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        video_info = {
            "fps": fps,
            "frames": frames,
            "duration": frames / fps,
            "path": video_path,
            "name": os.path.basename(video_path),
            "width": int(video.get(cv2.CAP_PROP_FRAME_WIDTH)),
            "height": int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)),
            "codec": video.get(cv2.CAP_PROP_CODEC_PIXEL_FORMAT),
            "format": video.get(cv2.CAP_PROP_FORMAT),
            "fourcc": video.get(cv2.CAP_PROP_FOURCC),
        }
    finally:
        # Release the capture handle on every path, including the error ones.
        video.release()

    # print video info
    print(video_info)

    return video_info


def process_video_clip(video_path, start_time, interval):
    """Cut one segment out of a video and return it as a Base64 string.

    The segment is re-encoded by ffmpeg (H.264 video, AAC audio, MP4
    container) into a temporary file, which is read back, Base64-encoded and
    then deleted.

    Args:
        video_path (str): Path to the source video.
        start_time (float): Segment start offset, passed to ffmpeg as ``ss``.
        interval (float): Segment length, passed to ffmpeg as ``t``.

    Returns:
        str: Base64 (UTF-8 text) encoding of the MP4 segment.

    Raises:
        ffmpeg.Error: Re-raised after printing ffmpeg's stderr when the
            extraction fails.
    """
    # Only the path is needed: reserve it, then close the handle right away so
    # ffmpeg can write to it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp:
        clip_path = tmp.name

    try:
        # Extract video segment
        stream = ffmpeg.input(video_path, ss=start_time, t=interval)
        stream = ffmpeg.output(
            stream,
            clip_path,
            format="mp4",
            acodec="aac",
            vcodec="libx264",
            loglevel="quiet",  # Suppress ffmpeg logs
        )
        ffmpeg.run(stream, overwrite_output=True, quiet=True)  # Run quietly

        # Read the clip and convert to Base64
        with open(clip_path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    except ffmpeg.Error as e:
        # stderr may be None/empty when nothing was captured; never fail while
        # reporting the original failure.
        stderr_text = e.stderr.decode(errors="replace") if e.stderr else ""
        print("FFmpeg Error:", stderr_text)
        raise

    finally:
        # Always remove the temp file, so failed extractions do not pile up
        # orphaned .mp4 files in the temp directory.
        if os.path.exists(clip_path):
            os.remove(clip_path)

In [3]:
def answer_question(video_clips, qa, model, summary_model="gpt-4o-2024-11-20"):
    """Answer one question by querying every clip, then summarizing the hits.

    Each clip is sent to ``model`` together with the extraction prompt and the
    question. Responses that start with "none" are discarded; the remaining
    ones are handed to ``summary_model`` to produce the final answer. When no
    clip yields information the answer is the literal ``"Unanswerable."``.

    Args:
        video_clips (list[str]): Base64-encoded clips, one request per clip.
        qa (dict): Must contain ``"question"``. It is mutated in place: the
            keys ``"extracted_information"`` and ``"answer_baselines"`` are set.
        model (str): Model name used for the per-clip extraction requests.
        summary_model (str): Model name used for the final summarization.
            Defaults to the value that was previously hard-coded.

    Returns:
        dict: The same ``qa`` object, updated.

    Note:
        ``generate_messages``, ``parallel_get_response`` and
        ``get_response_with_retry`` are defined outside this cell; only element
        ``[0]`` of their return values is used here — presumably the response
        text(s), TODO confirm against ``utils``.
    """
    question = qa["question"]

    clip_inputs = [
        [
            {"type": "video_base64", "content": clip},
            {"type": "text", "content": prompt_baseline_answer_clipwise_extract},
            {"type": "text", "content": f"Question: {question}"},
            {"type": "text", "content": "Extracted information:"},
        ]
        for clip in video_clips
    ]
    clip_messages = [generate_messages(parts) for parts in clip_inputs]
    responses = parallel_get_response(model, clip_messages)

    # Drop clips with nothing relevant. Non-string entries (e.g. a failed
    # request) are skipped instead of crashing on `.lower()`, and leading
    # whitespace no longer hides a "None" verdict.
    extracted_information = [
        response
        for response in responses[0]
        if isinstance(response, str)
        and not response.strip().lower().startswith("none")
    ]
    qa["extracted_information"] = extracted_information

    if len(extracted_information) == 0:
        answer = "Unanswerable."
    else:
        summary_input = [
            {"type": "text", "content": prompt_baseline_answer_clipwise_summarize},
            {"type": "text", "content": f"Question: {question}"},
            {
                "type": "text",
                "content": f"Extracted information: {extracted_information}",
            },
            {"type": "text", "content": "Answer:"},
        ]
        summary_messages = generate_messages(summary_input)
        response = get_response_with_retry(summary_model, summary_messages)
        answer = response[0]

    qa["answer_baselines"] = answer
    return qa


def verify_answers(qas, model="gpt-4o-2024-11-20"):
    """Ask a judge model whether each baseline answer is correct.

    One request is built per QA entry (the entry itself, the verification
    prompt, and a closing instruction). A verdict counts as correct when the
    response starts with "yes" (case-insensitive, ignoring leading whitespace).

    Args:
        qas (iterable): QA entries; each one is passed through unchanged as the
            content of the first text part of its request.
        model (str): Judge model name. Defaults to the value that was
            previously hard-coded.

    Returns:
        tuple: ``(accuracy, results)`` where ``accuracy`` is the fraction of
        "yes" verdicts (``0.0`` when there is nothing to verify) and
        ``results`` is the list of raw verdicts.

    Note:
        ``parallel_get_response`` is defined outside this cell; element ``[0]``
        of its return value is used as the list of responses — TODO confirm.
    """
    qas = list(qas)
    if not qas:
        # Nothing to judge: skip the API round-trip and the 0/0 division.
        return 0.0, []

    verify_inputs = [
        [
            {
                "type": "text",
                "content": qa,
            },
            {
                "type": "text",
                "content": prompt_benchmark_verify_answer,
            },
            {
                "type": "text",
                "content": "Now answer if the answer from the baseline is correct or not:",
            },
        ]
        for qa in qas
    ]
    messages = [generate_messages(parts) for parts in verify_inputs]
    responses = parallel_get_response(model, messages)

    results = responses[0]

    # calculate the accuracy of the answers; non-string verdicts count as wrong
    correct = sum(
        1
        for result in results
        if isinstance(result, str) and result.strip().lower().startswith("yes")
    )
    accuracy = correct / len(results) if results else 0.0

    return accuracy, results


def process_video(video, interval_seconds, model="gemini-1.5-pro-002"):
    """Split a video into fixed-length clips and answer all of its questions.

    Args:
        video (dict): Must contain ``"path"`` (video file path) and
            ``"qa_list"`` (list of QA dicts). ``"qa_list"`` is replaced in
            place with the answered entries.
        interval_seconds (float): Length of each clip in seconds; clips start
            at 0, ``interval_seconds``, 2 * ``interval_seconds``, ... up to the
            video duration.
        model (str): Model used for per-clip extraction; also the key looked up
            in ``config`` for its ``"qpm"`` value. Defaults to the value that
            was previously hard-coded.

    Returns:
        dict: The same ``video`` object, with ``"qa_list"`` holding the
        answered questions in their original order.

    Raises:
        ValueError: If ``interval_seconds`` is not positive, or if no clip
            could be produced (zero-length video).
    """
    # Imported locally so the function does not rely on a star-import
    # happening to re-export it.
    from concurrent.futures import ThreadPoolExecutor

    if interval_seconds <= 0:
        raise ValueError("interval_seconds must be positive")

    video_path = video["path"]
    qa_list = video["qa_list"]

    # get_video_info already prints the metadata, so it is not printed again.
    video_info = get_video_info(video_path)

    # Process each interval
    clips = [
        process_video_clip(video_path, start_time, interval_seconds)
        for start_time in np.arange(0, video_info["duration"], interval_seconds)
    ]
    if not clips:
        raise ValueError(f"No clips could be extracted from {video_path}")

    # Each question costs one request per clip, so a batch of N questions
    # issues N * len(clips) requests. `config` comes from outside this cell —
    # presumably `qpm` is a queries-per-minute budget; TODO confirm.
    qpm = config[model]["qpm"]
    # Clamp to 1: with more clips than `qpm` the integer division gives 0,
    # which would make the batching `range` fail on a zero step.
    qa_batch_size = max(1, qpm // len(clips))
    qa_batches = [
        qa_list[i : i + qa_batch_size] for i in range(0, len(qa_list), qa_batch_size)
    ]

    answered_qa_list = []

    for qa_batch in qa_batches:
        # parallel question answering with multiple threads; `map` yields the
        # results in submission order, so the QA order stays stable.
        with ThreadPoolExecutor(max_workers=len(qa_batch)) as executor:
            answered_qa_list.extend(
                executor.map(lambda qa: answer_question(clips, qa, model), qa_batch)
            )

    video["qa_list"] = answered_qa_list
    return video

In [4]:
import json

# Tunables for this run.
ANNOTATIONS_IN = "data/annotations/video_list_CZ_modified.json"
ANNOTATIONS_OUT = "data/annotations/video_list_CZ_answer_clipwise.json"
CLIP_INTERVAL_SECONDS = 180

with open(ANNOTATIONS_IN, "r") as f:
    data = json.load(f)

for video in data:
    # process_video updates `video` (and therefore `data`) in place.
    process_video(video, CLIP_INTERVAL_SECONDS)

    # Checkpoint after every video so an interruption of this long-running
    # loop does not throw away the videos already answered. Write to a
    # sibling file first and swap it in, so a crash mid-write cannot leave a
    # truncated JSON behind.
    checkpoint_path = ANNOTATIONS_OUT + ".tmp"
    with open(checkpoint_path, "w") as f:
        json.dump(data, f, indent=4)
    os.replace(checkpoint_path, ANNOTATIONS_OUT)

{'fps': 23.976023976023978, 'frames': 41829, 'duration': 1744.617875, 'path': 'data/videos/raw/360p/PnvZZwlN2yk.mp4', 'name': 'PnvZZwlN2yk.mp4', 'width': 640, 'height': 360, 'codec': 808596553.0, 'format': 0.0, 'fourcc': 875967080.0}
{'fps': 23.976023976023978, 'frames': 41829, 'duration': 1744.617875, 'path': 'data/videos/raw/360p/PnvZZwlN2yk.mp4', 'name': 'PnvZZwlN2yk.mp4', 'width': 640, 'height': 360, 'codec': 808596553.0, 'format': 0.0, 'fourcc': 875967080.0}


ffmpeg version 5.1.6-0+deb12u1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 12 (Debian 12.2.0-14)
  configuration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enab

frame= 3977 fps=337 q=-1.0 Lsize=    8727kB time=00:02:46.30 bitrate= 429.9kbits/s speed=14.1x    
video:5997kB audio:2606kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 1.447679%
[libx264 @ 0x55da670f7f40] frame I:68    Avg QP:16.50  size: 14254
[libx264 @ 0x55da670f7f40] frame P:1262  Avg QP:21.93  size:  2805
[libx264 @ 0x55da670f7f40] frame B:2647  Avg QP:25.59  size:   616
[libx264 @ 0x55da670f7f40] consecutive B-frames:  5.2% 14.8% 10.0% 70.0%
[libx264 @ 0x55da670f7f40] mb I  I16..4: 32.9% 32.5% 34.6%
[libx264 @ 0x55da670f7f40] mb P  I16..4:  1.7%  4.3%  1.5%  P16..4: 21.5% 10.2%  4.5%  0.0%  0.0%    skip:56.3%
[libx264 @ 0x55da670f7f40] mb B  I16..4:  0.1%  0.2%  0.2%  B16..8: 21.0%  3.2%  0.5%  direct: 0.4%  skip:74.4%  L0:45.9% L1:48.0% BI: 6.1%
[libx264 @ 0x55da670f7f40] 8x8 transform intra:46.7% inter:49.0%
[libx264 @ 0x55da670f7f40] coded y,uvDC,uvAC intra: 46.1% 45.4% 20.7% inter: 3.9% 2.7% 0.2%
[libx264 @ 0x55da670f7f40] i16 v,h,dc,p: 35% 26% 14% 25%

KeyboardInterrupt: 