In [None]:
from utils import *
from prompts import *
import tempfile
import ffmpeg
import cv2
import base64
import os
import numpy as np

In [None]:
def get_video_info(video_path):
    """Read basic metadata of a video file through OpenCV and print it.

    Args:
        video_path (str): Path to the video file.

    Returns:
        dict: Metadata with the keys ``fps``, ``frames``, ``duration``
        (seconds, computed as ``frames / fps``; ``0.0`` when OpenCV reports a
        non-positive fps), ``path``, ``name`` (basename of ``video_path``),
        ``width``, ``height``, ``codec``, ``format`` and ``fourcc``. The last
        three hold the raw values returned by ``VideoCapture.get`` for the
        corresponding ``CAP_PROP_*`` properties.

    Raises:
        ValueError: If the video file cannot be opened.
    """
    video = cv2.VideoCapture(video_path)
    try:
        if not video.isOpened():
            raise ValueError("Could not open video file")

        fps = video.get(cv2.CAP_PROP_FPS)
        frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        video_info = {
            "fps": fps,
            "frames": frames,
            # Some containers report fps == 0; avoid a ZeroDivisionError and
            # report an unknown duration as 0.0 instead.
            "duration": frames / fps if fps > 0 else 0.0,
            "path": video_path,
            "name": os.path.basename(video_path),
            "width": int(video.get(cv2.CAP_PROP_FRAME_WIDTH)),
            "height": int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)),
            # NOTE: despite the key name, this is the CODEC_PIXEL_FORMAT
            # property, not a codec identifier (see "fourcc" for that).
            "codec": video.get(cv2.CAP_PROP_CODEC_PIXEL_FORMAT),
            "format": video.get(cv2.CAP_PROP_FORMAT),
            "fourcc": video.get(cv2.CAP_PROP_FOURCC),
        }
    finally:
        # Release the capture handle on every path, including errors.
        video.release()

    # print video info
    print(video_info)

    return video_info


def extract_frames(video_path, start_time=None, interval=None, sample_fps=10):
    """Sample frames from a video segment and return them as base64 JPEGs.

    Args:
        video_path (str): Path to the video file.
        start_time (float | None): Segment start in seconds. Defaults to the
            beginning of the video.
        interval (float | None): Segment length in seconds. Defaults to the
            remainder of the video after ``start_time``.
        sample_fps (float): Target number of sampled frames per second of
            video. When it exceeds the video's own fps, every frame is taken.

    Returns:
        list[str]: Base64-encoded (UTF-8 decoded) JPEG images, in frame order.
        Sampling stops early at the first frame that cannot be read.

    Raises:
        ValueError: If ``sample_fps`` is not positive, or if the video cannot
            be opened by ``get_video_info``.
    """
    if sample_fps <= 0:
        raise ValueError("sample_fps must be positive")

    video_info = get_video_info(video_path)
    # Default each bound independently so that passing only one of them works.
    if start_time is None:
        start_time = 0
    if interval is None:
        interval = video_info["duration"] - start_time

    video_fps = video_info["fps"]
    total_frames = video_info["frames"]
    # Clamp to 1: int(video_fps / sample_fps) is 0 when sample_fps > video_fps,
    # and range() rejects a step of 0.
    frame_interval = max(1, int(video_fps / sample_fps))

    start_frame = int(start_time * video_fps)
    end_frame = min(int((start_time + interval) * video_fps), total_frames)

    frames = []
    segment_video = cv2.VideoCapture(video_path)
    try:
        for frame_idx in range(start_frame, end_frame, frame_interval):
            # Seek explicitly because sampled frames are not contiguous.
            segment_video.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = segment_video.read()
            if not ret:
                break
            _, buffer = cv2.imencode(".jpg", frame)
            frames.append(base64.b64encode(buffer).decode("utf-8"))
    finally:
        segment_video.release()
    return frames


def process_video_clip(video_path, start_time, interval, fps):
    """Cut one segment out of a video and return it as base64 payloads.

    The segment is re-encoded with ffmpeg to an MP4 (libx264 / aac) temporary
    file, its audio track is converted to an MP3 temporary file, and both are
    read back and base64-encoded. Frames are sampled separately from the
    original ``video_path`` via ``extract_frames``.

    Args:
        video_path (str): Path to the source video.
        start_time (float): Segment start in seconds.
        interval (float): Segment length in seconds.
        fps (float): Sampling rate passed to ``extract_frames``.

    Returns:
        tuple: ``(video_b64, frames, audio_b64)`` where ``video_b64`` and
        ``audio_b64`` are base64 strings and ``frames`` is the list returned
        by ``extract_frames``.

    Raises:
        ffmpeg.Error: Re-raised after printing diagnostics when an ffmpeg
            invocation fails.
    """
    temp_paths = {}
    try:
        # Create temporary files; only the paths are needed, so close each
        # handle right away to let ffmpeg write to them.
        for key, suffix in (("video", ".mp4"), ("audio", ".mp3")):
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                temp_paths[key] = tmp.name

        # Extract video segment
        stream = ffmpeg.input(video_path, ss=start_time, t=interval)
        stream = ffmpeg.output(
            stream, temp_paths["video"], format="mp4", acodec="aac", vcodec="libx264"
        )
        ffmpeg.run(stream, overwrite_output=True)

        # Extract audio
        audio_stream = ffmpeg.input(temp_paths["video"])
        audio_stream = ffmpeg.output(
            audio_stream, temp_paths["audio"], acodec="libmp3lame"
        )
        ffmpeg.run(audio_stream, overwrite_output=True)

        # Read files and convert to Base64
        base64_data = {}
        for key, path in temp_paths.items():
            with open(path, "rb") as f:
                base64_data[key] = base64.b64encode(f.read()).decode("utf-8")

        base64_data["frames"] = extract_frames(video_path, start_time, interval, fps)

        return base64_data["video"], base64_data["frames"], base64_data["audio"]

    except ffmpeg.Error as e:
        # stderr is None unless ffmpeg.run() was asked to capture it, so guard
        # against masking the real failure with an AttributeError.
        details = e.stderr.decode(errors="replace") if e.stderr else str(e)
        print("FFmpeg Error:", details)
        raise

    finally:
        # Always clean up, including when ffmpeg or reading the files failed.
        for path in temp_paths.values():
            try:
                os.remove(path)
            except OSError:
                pass

In [None]:
def answer_question(video_clips, qa, model):
    """Answer one question clip-by-clip, then summarize into a final answer.

    Each clip is sent to ``model`` together with the extraction prompt and the
    question. Responses that start with "none" (case-insensitive) are dropped.
    If nothing remains the answer is "Unanswerable."; otherwise the remaining
    extractions are summarized by "gpt-4o-2024-05-13".

    Args:
        video_clips: Iterable of base64-encoded video clips.
        qa (dict): Must contain the keys "question" and "answer".
        model (str): Model name used for the per-clip extraction step.

    Returns:
        dict: The same ``qa`` object, mutated in place with the key
        "answer_baselines" set to the produced answer.
    """
    question = qa["question"]
    # Not used below; the lookup is kept so a missing "answer" key still fails.
    ground_truth = qa["answer"]

    question_line = f"Question: {question}"

    # One extraction request per clip.
    clip_messages = []
    for clip in video_clips:
        clip_prompt = [
            {"type": "video_base64", "content": clip},
            {"type": "text", "content": prompt_baseline_answer_clipwise_extract},
            {"type": "text", "content": question_line},
            {"type": "text", "content": "Extracted information:"},
        ]
        clip_messages.append(generate_messages(clip_prompt))

    clip_responses = parallel_get_response(model, clip_messages)

    # NOTE(review): index 0 is presumably the list of response texts — confirm
    # against parallel_get_response in utils.
    extracted_information = []
    for text in clip_responses[0]:
        if text.lower().startswith("none"):
            continue
        extracted_information.append(text)

    if not extracted_information:
        answer = "Unanswerable."
    else:
        summary_prompt = [
            {"type": "text", "content": prompt_baseline_answer_clipwise_summarize},
            {"type": "text", "content": question_line},
            {
                "type": "text",
                "content": f"Extracted information: {extracted_information}",
            },
            {"type": "text", "content": "Answer:"},
        ]
        summary_messages = generate_messages(summary_prompt)
        summary_model = "gpt-4o-2024-05-13"
        summary_response = get_response_with_retry(summary_model, summary_messages)
        answer = summary_response[0]

    qa["answer_baselines"] = answer
    return qa


def verify_answers(qas):
    """Ask "gpt-4o-2024-05-13" to judge each QA item and compute accuracy.

    Args:
        qas: Iterable of QA items; each item is passed as the content of the
            first text part of its verification prompt.

    Returns:
        tuple: ``(accuracy, results)`` where ``results`` are the per-item
        verdict texts and ``accuracy`` is the fraction of verdicts that start
        with "yes" (case-insensitive). Returns ``(0.0, [])`` without calling
        the model when ``qas`` is empty.
    """
    qas = list(qas)
    if not qas:
        # Nothing to verify: avoid both a pointless model call and a
        # division by zero below.
        return 0.0, []

    inputs = [
        [
            {
                "type": "text",
                "content": qa,
            },
            {
                "type": "text",
                "content": prompt_benchmark_verify_answer,
            },
            {
                "type": "text",
                "content": "Now answer if the answer from the baseline is correct or not:",
            },
        ]
        for qa in qas
    ]
    messages = [generate_messages(prompt) for prompt in inputs]
    model = "gpt-4o-2024-05-13"
    responses = parallel_get_response(model, messages)

    # NOTE(review): index 0 is presumably the list of response texts — confirm
    # against parallel_get_response in utils.
    results = responses[0]

    # calculate the accuracy of the answers
    correct = sum(1 for result in results if result.lower().startswith("yes"))
    accuracy = correct / len(results) if results else 0.0

    return accuracy, results


def process_video(video_path, interval_seconds, fps, qa_list):
    """Split a video into clips and answer every question against them.

    Args:
        video_path (str): Path to the video file
        interval_seconds (float): Length of each clip in seconds; also the
            step between consecutive clip start times
        fps (float): Frames per second to extract from each segment
        qa_list (list): QA items, each passed to ``answer_question``

    Returns:
        list: The QA items returned by ``answer_question``, in completion
        order within each batch (not necessarily the order of ``qa_list``).

    Raises:
        ValueError: If no clip could be produced from the video (for example
            when its reported duration is 0).
    """
    # Imported locally so the function does not depend on these names leaking
    # in through a star import.
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # get_video_info already prints the metadata, so it is not printed again.
    video_info = get_video_info(video_path)

    clips = []

    # Process each interval
    for start_time in np.arange(0, video_info["duration"], interval_seconds):

        base64_video, _, _ = process_video_clip(
            video_path, start_time, interval_seconds, fps
        )
        clips.append(base64_video)

    if not clips:
        raise ValueError("No clips could be extracted from the video")

    model = "gemini-1.5-pro-002"
    qpm = config[model]["qpm"]
    # Each question issues one request per clip, so size batches to stay within
    # the per-minute quota. Clamp to 1 so that having more clips than the quota
    # does not produce a batch size (and range step) of 0.
    qa_batch_size = max(1, qpm // len(clips))
    qa_batches = [
        qa_list[i : i + qa_batch_size] for i in range(0, len(qa_list), qa_batch_size)
    ]

    answered_qa_list = []

    for qa_batch in qa_batches:
        # parallel question answering with multiple threads
        with ThreadPoolExecutor(max_workers=len(qa_batch)) as executor:
            futures = [
                executor.submit(answer_question, clips, qa, model) for qa in qa_batch
            ]
            for future in as_completed(futures):
                qa = future.result()
                answered_qa_list.append(qa)

    return answered_qa_list