In [1]:
import os

import numpy as np
from tqdm import tqdm

from videograph import VideoGraph
from utils.general import *
from utils.video_processing import *
from utils.chat_api import *
from prompts import *

from face_processing import process_faces
from voice_processing import process_voices
from memory_processing import (
    process_captions,
    generate_captions_and_thinkings_with_ids,
)



In [2]:
def process_segment(video_graph, base64_video, base64_frames, base64_audio):
    """Run the voice, face and caption stages for one clip and store the results.

    The stages run in a fixed order: voices, then faces, then caption
    generation, then the two caption-ingestion calls (episodic first,
    semantic second).

    Args:
        video_graph: Graph object handed to every processing stage.
        base64_video: Clip payload forwarded to the caption generator
            (presumably base64-encoded video -- confirm against the producer).
        base64_frames: Sized collection of frames; passed to face processing
            and to the caption generator.
        base64_audio: Audio payload; passed to voice processing and to the
            caption generator.

    Returns:
        None. Results are handed to ``process_captions`` together with
        ``video_graph``.
    """
    # Voice stage runs before anything is reported about the frames.
    voice_lookup = process_voices(video_graph, base64_audio)
    print("Finish processing voices")

    frame_total = len(base64_frames)
    print(f"processing {frame_total} frames...")

    # Face stage; the keys of its result are echoed for inspection.
    face_lookup = process_faces(video_graph, base64_frames)
    print(face_lookup.keys())
    print("Finish processing faces")

    # The generator yields exactly two caption collections: episodic, semantic.
    episodic, semantic = generate_captions_and_thinkings_with_ids(
        base64_video, base64_frames, base64_audio, face_lookup, voice_lookup
    )

    for caption_kind, captions in (("episodic", episodic), ("semantic", semantic)):
        process_captions(video_graph, captions, type=caption_kind)

    print("Finish processing segment")


def _clip_sort_key(filename):
    """Build a sort key from the digits in a clip's base name, ignoring its extension.

    Digits that belong to the extension (the "4" in ".mp4") must not take part
    in the ordering, otherwise "2.avi" (2) would sort before "1.mp4" (14).
    Names without any digit sort after all numbered clips, ordered by name,
    instead of failing on ``int("")``.
    """
    stem, _extension = os.path.splitext(filename)
    # isdecimal() keeps only characters that int() is able to parse.
    digits = "".join(ch for ch in stem if ch.isdecimal())
    if digits:
        return (0, int(digits), filename)
    return (1, 0, filename)


def streaming_process_video(
    video_graph, video_path, interval_seconds, fps, segment_limit=None
):
    """Process a video segment by segment and feed every segment to the graph.

    Two input layouts are supported:

    * ``video_path`` is a file: the video is cut into consecutive windows of
      ``interval_seconds``. A trailing window that would run past the reported
      duration is skipped.
    * ``video_path`` is a directory: every ``.mp4`` / ``.avi`` / ``.mov`` file
      in it is treated as one segment, ordered by the number in its base name.

    Segments for which ``process_video_clip`` returns no frames are not passed
    to ``process_segment`` but still count towards ``segment_limit``.

    Args:
        video_graph (VideoGraph): Graph object to store video information
        video_path (str): Path to the video file or directory containing clips
        interval_seconds (float): Length of each segment in seconds; forwarded
            to ``process_video_clip`` as the clip duration
        fps (float): Frames per second to extract from each segment
        segment_limit (int, optional): Maximum number of segments to load.
            ``None`` (default) means no limit; ``0`` or less loads nothing.

    Returns:
        None: Updates video_graph in place with processed segments

    Raises:
        FileNotFoundError: If ``video_path`` is neither a file nor a directory.
    """
    if os.path.isfile(video_path):
        # Process single video file
        video_info = get_video_info(video_path)
        print(video_info)
        duration = video_info["duration"]

        count = 0
        for start_time in np.arange(0, duration, interval_seconds):
            # Check the limit before loading so that segment_limit=0 loads nothing.
            if segment_limit is not None and count >= segment_limit:
                break
            # Only full-length windows are processed; drop the trailing remainder.
            if start_time + interval_seconds > duration:
                break

            print("=" * 20)
            count += 1

            print(f"Loading {count}-th clip starting at {start_time} seconds...")
            base64_video, base64_frames, base64_audio = process_video_clip(
                video_path, start_time, interval_seconds, fps, audio_format="wav"
            )

            # Process frames for this interval
            if base64_frames:
                print(
                    f"Starting processing {count}-th clip starting at {start_time} seconds..."
                )
                process_segment(
                    video_graph,
                    base64_video,
                    base64_frames,
                    base64_audio,
                )

    elif os.path.isdir(video_path):
        # Process directory of numbered clips
        video_extensions = (".mp4", ".avi", ".mov")
        video_files = sorted(
            (f for f in os.listdir(video_path) if f.endswith(video_extensions)),
            key=_clip_sort_key,
        )

        for count, video_file in enumerate(video_files, 1):
            # count is 1-based, so stop once it exceeds the limit.
            if segment_limit is not None and count > segment_limit:
                break

            print("=" * 20)
            full_path = os.path.join(video_path, video_file)
            print(f"Processing clip {count}: {full_path}")

            # Each clip file is read from its own start (offset 0).
            base64_video, base64_frames, base64_audio = process_video_clip(
                full_path, 0, interval_seconds, fps, audio_format="wav"
            )

            if base64_frames:
                process_segment(
                    video_graph,
                    base64_video,
                    base64_frames,
                    base64_audio,
                )

    else:
        # A mistyped or relative path used to be a silent no-op that left the
        # graph empty; fail loudly instead.
        raise FileNotFoundError(
            f"video_path is neither a file nor a directory: {video_path!r}"
        )

In [3]:
# Directory of pre-cut, numbered clips. streaming_process_video also accepts
# the path of a single video file, which it then cuts into
# interval_seconds-long windows itself.
video_path = "data/videos/clipped/5 Poor People vs 1 Secret Millionaire"
video_graph = VideoGraph()

# segment_limit=1 restricts this run to the first clip only.
streaming_process_video(
    video_graph=video_graph,
    video_path=video_path,
    interval_seconds=60,
    fps=5,
    segment_limit=1,
)

Processing clip 1: data/videos/clipped/5 Poor People vs 1 Secret Millionaire/1.mp4
{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso2avc1mp41', 'encoder': 'Lavf61.1.100'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1280, 720], 'bitrate': 1044, 'fps': 23.976023976023978, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'Metadata': '', 'handler_name': 'VideoHandler', 'vendor_id': '[0][0][0][0]', 'encoder': 'Lavc61.3.100 libx264'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 44100, 'bitrate': 129, 'metadata': {'Metadata': '', 'handler_name': 'SoundHandler', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 30.0, 'bitrate': 1179, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 

{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso2avc1mp41', 'encoder': 'Lavf61.1.100'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1280, 720], 'bitrate': 1044, 'fps': 23.976023976023978, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'Metadata': '', 'handler_name': 'VideoHandler', 'vendor_id': '[0][0][0][0]', 'encoder': 'Lavc61.3.100 libx264'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 44100, 'bitrate': 129, 'metadata': {'Metadata': '', 'handler_name': 'SoundHandler', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 30.0, 'bitrate': 1179, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile': '(High)', 'video_size': [1280, 720], 'video_bitrate': 1044, 'video

2025-03-22 12:02:38,390 - httpx - INFO - HTTP Request: POST https://search-va.byteintl.net/gpt/openapi/online/multimodal/crawl/openai/deployments/gemini-1.5-pro-002/chat/completions?api-version=2024-03-01-preview "HTTP/1.1 200 OK"
2025-03-22 12:02:38,438 - root - ERROR - [laplace] matx_inference ex
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/laplace/client/rpc_func.py", line 310, in __call__
    rsp = self._rpc_client.matx_inference(req)
  File "/usr/local/lib/python3.9/dist-packages/euler/client.py", line 344, in call
    return self._compose_middlewares(ctx, fn)(*args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/euler/client.py", line 613, in new_next
    return middleware(ctx, *args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/euler/client.py", line 62, in close_connection_middleware
    return ctx.next(*args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/euler/client.py", line 613, in new_next
    return mid

[{'start_time': '00:00', 'end_time': '00:07', 'speaker': '<speaker_1>', 'asr': "We got Danny, Herm, Aaron, and JC, and five people who say they're a millionaire, but only one of them is, and the other four are lying.", 'duration': 7}, {'start_time': '00:07', 'end_time': '00:10', 'speaker': '<speaker_1>', 'asr': "Let's just start by just appearances.", 'duration': 3}, {'start_time': '00:11', 'end_time': '00:13', 'speaker': '<speaker_1>', 'asr': "You're broke.", 'duration': 2}, {'start_time': '00:14', 'end_time': '00:20', 'speaker': '<speaker_2>', 'asr': "Nah, bro. That's not right. That's not right, bro.", 'duration': 6}, {'start_time': '00:20', 'end_time': '00:28', 'speaker': '<speaker_1>', 'asr': "'Cause you was in the guest of Black Presidents, and I think we had to pay him. Number five, your shoes are elite.", 'duration': 8}, {'start_time': '00:28', 'end_time': '00:37', 'speaker': '<speaker_3>', 'asr': "When a white man doesn't wear socks, that means very comfortable with", 'duratio

RPCError: code: -1, message: Traceback (most recent call last):
  File "/laplace/python/euler_server.py", line 99, in matx_inference
    output = endpoint_handler(feed_dict)
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/qs_service/model_compiled/audio_embedding/handler.py", line 63, in __call__
    completion = self.generate(wav.decode("utf-8"))
  File "/usr/local/lib/python3.9/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/qs_service/model_compiled/audio_embedding/handler.py", line 54, in generate
    emb = self.get_embedding(wav_file)
  File "/qs_service/model_compiled/audio_embedding/handler.py", line 48, in get_embedding
    return compute_embedding(wav)
  File "/qs_service/model_compiled/audio_embedding/handler.py", line 42, in compute_embedding
    wav = load_wav(wav_file)
  File "/qs_service/model_compiled/audio_embedding/handler.py", line 32, in load_wav
    wav, fs = torchaudio.load(wav_file)
  File "/usr/local/lib/python3.9/dist-packages/torchaudio/_backend/utils.py", line 205, in load
    return backend.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size)
  File "/usr/local/lib/python3.9/dist-packages/torchaudio/_backend/soundfile.py", line 27, in load
    return soundfile_backend.load(uri, frame_offset, num_frames, normalize, channels_first, format)
  File "/usr/local/lib/python3.9/dist-packages/torchaudio/_backend/soundfile_backend.py", line 221, in load
    with soundfile.SoundFile(filepath, "r") as file_:
  File "/usr/local/lib/python3.9/dist-packages/soundfile.py", line 690, in __init__
    self._file = self._open(file, mode_int, closefd)
  File "/usr/local/lib/python3.9/dist-packages/soundfile.py", line 1265, in _open
    raise LibsndfileError(err, prefix="Error opening {0!r}: ".format(self.name))
soundfile.LibsndfileError: Error opening <_io.BytesIO object at 0x7f803cdfaea0>: Format not recognised.
