In [1]:
import numpy as np
from tqdm import tqdm
import os
import json

from videograph import VideoGraph
from utils.general import *
from utils.video_processing import *
from utils.chat_api import *
from prompts import *

from face_processing import process_faces
from voice_processing import process_voices
from memory_processing import (
    process_captions,
    generate_captions_and_thinkings_with_ids,
)



In [None]:
def process_segment(video_graph, base64_video, base64_frames, base64_audio):
    """Run the voice, face and caption stages for one clip.

    The stages run in a fixed order: voices, faces, caption generation, then
    caption storage (episodic first, semantic second). Each stage receives
    ``video_graph``; the id mappings returned by the voice and face stages
    are forwarded to the caption generator.

    Args:
        video_graph: Graph object handed to every processing stage.
        base64_video: Encoded clip, passed to the voice and caption stages.
        base64_frames: Sized collection of encoded frames for the clip.
        base64_audio: Encoded audio track for the clip.

    Returns:
        None. Progress messages are printed to stdout.
    """
    # Stage 1: voices (note the audio-before-video argument order).
    voice_lookup = process_voices(video_graph, base64_audio, base64_video)
    print("Finish processing voices")

    # Stage 2: faces.
    frame_total = len(base64_frames)
    print(f"processing {frame_total} frames...")
    face_lookup = process_faces(video_graph, base64_frames)
    print("Finish processing faces")

    # Stage 3: generate both caption sets using the id lookups from above.
    episodic, semantic = generate_captions_and_thinkings_with_ids(
        video_graph,
        base64_video,
        base64_frames,
        base64_audio,
        face_lookup,
        voice_lookup,
    )

    # Stage 4: store the captions, episodic before semantic.
    for memory_kind, captions in (("episodic", episodic), ("semantic", semantic)):
        process_captions(video_graph, captions, type=memory_kind)

    print("Finish processing segment")


# Clip extensions recognised in directory mode (compared case-insensitively).
_VIDEO_EXTENSIONS = (".mp4", ".avi", ".mov")


def _clip_sort_key(filename):
    """Return a sort key ordering clips by the number in their file stem.

    Only the stem is inspected, so digits in the extension (the "4" in
    ".mp4") no longer leak into the number. Names without any digit sort
    after all numbered clips, alphabetically, instead of raising ValueError.
    """
    stem = os.path.splitext(filename)[0]
    # isdecimal() only accepts characters that int() can parse.
    digits = "".join(ch for ch in stem if ch.isdecimal())
    if digits:
        return (0, int(digits), filename)
    return (1, 0, filename)


def streaming_process_video(
    video_graph, video_path, interval_seconds, fps, segment_limit=None
):
    """Process a video clip by clip, feeding each clip to ``process_segment``.

    Two input layouts are supported:

    * A single file: the video is cut into consecutive windows of
      ``interval_seconds``. A trailing window that would run past the video
      duration is skipped.
    * A directory: every file ending in .mp4/.avi/.mov (any letter case) is
      treated as one clip and processed whole, ordered by the number found
      in its file stem. Files without a number come last, by name.

    Args:
        video_graph (VideoGraph): Graph object to store video information
        video_path (str): Path to the video file or directory containing clips
        interval_seconds (float): Length of each segment in seconds (only
            used when ``video_path`` is a single file)
        fps (float): Frames per second to extract from each segment
        segment_limit (int | None): Stop after this many clips have been
            loaded. Clips that yield no frames still count. The limit is
            checked after each clip, so at least one clip is always
            attempted. ``None`` means no limit.

    Returns:
        None: Updates video_graph in place with processed segments

    Raises:
        FileNotFoundError: If ``video_path`` is neither an existing file nor
            an existing directory.
    """
    if os.path.isfile(video_path):
        # Process single video file
        video_info = get_video_info(video_path)
        print(video_info)
        duration = video_info["duration"]

        # Process each interval
        count = 0
        for start_time in np.arange(0, duration, interval_seconds):
            # Skip the trailing partial window.
            if start_time + interval_seconds > duration:
                break

            print("=" * 20)
            count += 1

            print(f"Loading {count}-th clip starting at {start_time} seconds...")
            base64_video, base64_frames, base64_audio = process_video_clip(
                video_path, start_time, interval_seconds, fps, audio_format="wav"
            )

            # Process frames for this interval
            if base64_frames:
                print(
                    f"Starting processing {count}-th clip starting at {start_time} seconds..."
                )
                process_segment(
                    video_graph,
                    base64_video,
                    base64_frames,
                    base64_audio,
                )

            if segment_limit is not None and count >= segment_limit:
                break

    elif os.path.isdir(video_path):
        # Process directory of numbered clips, in numeric order of their stems.
        video_files = sorted(
            (
                name
                for name in os.listdir(video_path)
                if name.lower().endswith(_VIDEO_EXTENSIONS)
            ),
            key=_clip_sort_key,
        )

        for count, video_file in enumerate(video_files, 1):
            print("=" * 20)
            full_path = os.path.join(video_path, video_file)
            print(f"Processing clip {count}: {full_path}")

            # A None interval is what the original directory branch passes for
            # whole-clip processing.
            base64_video, base64_frames, base64_audio = process_video_clip(
                full_path, 0, None, fps, audio_format="wav"
            )

            if base64_frames:
                process_segment(
                    video_graph,
                    base64_video,
                    base64_frames,
                    base64_audio,
                )

            if segment_limit is not None and count >= segment_limit:
                break

    else:
        # Previously a bad path was silently ignored, leaving an empty graph.
        raise FileNotFoundError(
            f"video_path is neither a file nor a directory: {video_path!r}"
        )

In [None]:
# Load both configuration files. Context managers close the file handles
# deterministically, and the explicit encoding avoids depending on the
# platform's locale default when decoding the JSON text.
with open("configs/processing_config.json", encoding="utf-8") as config_file:
    processing_config = json.load(config_file)
with open("configs/memory_config.json", encoding="utf-8") as config_file:
    memory_config = json.load(config_file)

# video paths can be paths to directories or paths to mp4 files
video_paths = processing_config["video_paths"]
# All graphs are written to the same directory.
save_dir = "data/video_graphs"

for video_path in video_paths:

    # Each video gets its own fresh graph built from the memory config.
    video_graph = VideoGraph(**memory_config)

    streaming_process_video(
        video_graph,
        video_path,
        processing_config["interval_seconds"],
        processing_config["fps"],
        # Optional key: a missing entry means "no limit" (the function default).
        processing_config.get("segment_limit"),
    )

    # Both configs are passed along with the graph when saving.
    save_video_graph(
        video_graph, video_path, save_dir, (processing_config, memory_config)
    )

In [2]:
# Load a previously saved graph from disk. This rebinds the notebook-global
# `video_graph`, which the cells below read.
# NOTE(review): the file name looks like it encodes the video title plus the
# config values of the run that produced it -- confirm against save_video_graph.
video_graph_path = "data/video_graphs/5-Poor-People-vs-1-Secret-Millionaire_60_5_5_10_20_0.3_0.6_0.75.pkl"
video_graph = load_video_graph(video_graph_path)
# Disabled debug helper: print the contents of every text node.
# for text_node in video_graph.text_nodes:
#     print(video_graph.nodes[text_node].metadata['contents'])
# Disabled debug helper: for every edge with weight > 1, print the contents of
# whichever endpoint is an episodic/semantic node, followed by the weight.
# for nodes, weight in video_graph.edges.items():
#     if weight > 1:
#         if video_graph.nodes[nodes[0]].type in ["episodic", "semantic"]:
#            print(video_graph.nodes[nodes[0]].metadata['contents'])
#         else:
#            print(video_graph.nodes[nodes[1]].metadata['contents'])
#         print(weight)

# Extract equivalences from the graph and show them. In the recorded output
# the result is a dict mapping 'character_N' keys to lists of integer ids.
equivalences = video_graph.extract_equivalences()
print(equivalences)

# Disabled follow-up steps: summarize the graph, save it under an "_augmented"
# file name, and visualize it.
# video_graph.summarize(logging=True)
# save_dir = "data/video_graphs"
# save_video_graph(
#     video_graph, None, save_dir, None, file_name='5-Poor-People-vs-1-Secret-Millionaire_60_5_5_10_20_0.3_0.6_0.75_augmented.pkl'
# )
# video_graph.visualize()

Loading video graph from data/video_graphs/5-Poor-People-vs-1-Secret-Millionaire_60_5_5_10_20_0.3_0.6_0.75.pkl
Cluster 0 has 1 nodes: [36]
--------------------------------------------------------------------------------
Node 36 [edge weight]: 1.0
Node 36 [content]: Equivalence: <face_3>, <voice_0>
Generating equivalences 0 times


2025-03-28 03:57:49,590 - httpx - INFO - HTTP Request: POST https://search-va.byteintl.net/gpt/openapi/online/v2/crawl/openai/deployments/gpt-4o-2024-11-20/chat/completions?api-version=2024-03-01-preview "HTTP/1.1 200 OK"


Cluster 0 has 1 nodes: [18]
--------------------------------------------------------------------------------
Node 18 [edge weight]: 1.0
Node 18 [content]: Equivalence: <face_9>, <voice_1>
Generating equivalences 0 times


2025-03-28 03:57:50,065 - httpx - INFO - HTTP Request: POST https://search-va.byteintl.net/gpt/openapi/online/v2/crawl/openai/deployments/gpt-4o-2024-11-20/chat/completions?api-version=2024-03-01-preview "HTTP/1.1 200 OK"


Cluster 0 has 2 nodes: [17, 35]
--------------------------------------------------------------------------------
Node 17 [edge weight]: 1.0
Node 17 [content]: Equivalence: <face_3>, <voice_2>
--------------------------------------------------------------------------------
Node 35 [edge weight]: 1.0
Node 35 [content]: Equivalence: <face_6>, <voice_2>
********************************************************************************
Cluster 0 has 1 nodes after filtering: [17]
--------------------------------------------------------------------------------
Node 17 [edge weight]: 1.0
Node 17 [content]: Equivalence: <face_3>, <voice_2>
Generating equivalences 0 times


2025-03-28 03:57:50,652 - httpx - INFO - HTTP Request: POST https://search-va.byteintl.net/gpt/openapi/online/v2/crawl/openai/deployments/gpt-4o-2024-11-20/chat/completions?api-version=2024-03-01-preview "HTTP/1.1 200 OK"


Cluster 0 has 1 nodes: [59]
--------------------------------------------------------------------------------
Node 59 [edge weight]: 1.0
Node 59 [content]: Equivalence: <face_9>, <voice_44>.
Generating equivalences 0 times


2025-03-28 03:57:51,532 - httpx - INFO - HTTP Request: POST https://search-va.byteintl.net/gpt/openapi/online/v2/crawl/openai/deployments/gpt-4o-2024-11-20/chat/completions?api-version=2024-03-01-preview "HTTP/1.1 200 OK"


Cluster 0 has 1 nodes: [75]
--------------------------------------------------------------------------------
Node 75 [edge weight]: 1.0
Node 75 [content]: Equivalence: <face_7>, <voice_64>
Generating equivalences 0 times


2025-03-28 03:57:52,239 - httpx - INFO - HTTP Request: POST https://search-va.byteintl.net/gpt/openapi/online/v2/crawl/openai/deployments/gpt-4o-2024-11-20/chat/completions?api-version=2024-03-01-preview "HTTP/1.1 200 OK"


Cluster 0 has 1 nodes: [92]
--------------------------------------------------------------------------------
Node 92 [edge weight]: 1.0
Node 92 [content]: Equivalence: <face_22>, <voice_66>
Generating equivalences 0 times


2025-03-28 03:57:52,759 - httpx - INFO - HTTP Request: POST https://search-va.byteintl.net/gpt/openapi/online/v2/crawl/openai/deployments/gpt-4o-2024-11-20/chat/completions?api-version=2024-03-01-preview "HTTP/1.1 200 OK"


Cluster 0 has 1 nodes: [76]
--------------------------------------------------------------------------------
Node 76 [edge weight]: 3.0
Node 76 [content]: Equivalence: <face_8>, <voice_67>
Generating equivalences 0 times


2025-03-28 03:57:53,270 - httpx - INFO - HTTP Request: POST https://search-va.byteintl.net/gpt/openapi/online/v2/crawl/openai/deployments/gpt-4o-2024-11-20/chat/completions?api-version=2024-03-01-preview "HTTP/1.1 200 OK"


Cluster 0 has 1 nodes: [91]
--------------------------------------------------------------------------------
Node 91 [edge weight]: 1.0
Node 91 [content]: Equivalence: <face_10>, <voice_82>
Generating equivalences 0 times


2025-03-28 03:57:56,503 - httpx - INFO - HTTP Request: POST https://search-va.byteintl.net/gpt/openapi/online/v2/crawl/openai/deployments/gpt-4o-2024-11-20/chat/completions?api-version=2024-03-01-preview "HTTP/1.1 200 OK"


{'character_0': [3, 0, 2], 'character_1': [9, 1, 44], 'character_2': [7, 64], 'character_3': [22, 66], 'character_4': [8, 67], 'character_5': [10, 82]}


In [3]:
# Print the stored 'contents' metadata of every text node, one node per line.
for node_id in video_graph.text_nodes:
    node_metadata = video_graph.nodes[node_id].metadata
    print(node_metadata['contents'])

['<voice_0> introduces four individuals named Denny, Herm, Aaron, and JC, who are seated at a table.']
['<voice_0> mentions five other individuals who claim to be millionaires, but only one of them is telling the truth.']
['<voice_1> suggests starting the evaluation based on appearances.']
['<face_9>, wearing a black jacket and jeans, is identified by <voice_2> as not being a millionaire.']
['<voice_2> states that <face_4>, because they were a guest of another person, had to be paid.']
["<voice_2> comments that <face_8>'s shoes look expensive."]
['Equivalence: <face_3>, <voice_2>']
['Equivalence: <face_9>, <voice_1>']
['<face_9> is perceived as not wealthy due to their attire.']
['<face_4> may have received payment for their participation.']
['<face_8> is perceived as wealthy due to their expensive footwear.']
['<face_6> sits at the table, wearing a black hoodie.']
['<face_22> sits at the table, wearing a blue Drexel University hoodie and glasses.']
['<face_3> sits at the table, wearin

In [None]:
# Visualize the graph currently bound to `video_graph`. What gets rendered is
# defined by VideoGraph.visualize, which is not shown in this notebook.
video_graph.visualize()

In [None]:
# from retrieve import retrieve_from_videograph
# from videograph import VideoGraph
# from utils.chat_api import (
#     generate_messages,
#     get_response_with_retry,
#     parallel_get_embedding,
# )
# from utils.general import validate_and_fix_python_list
# from prompts import prompt_memory_retrieval

# MAX_RETRIES = 3


# def generate_queries(question, existing_knowledge=None, query_num=1):
#     input = [
#         {
#             "type": "text",
#             "content": prompt_memory_retrieval.format(
#                 question=question,
#                 query_num=query_num,
#                 existing_knowledge=existing_knowledge,
#             ),
#         }
#     ]
#     messages = generate_messages(input)
#     model = "gpt-4o-2024-11-20"
#     queries = None
#     for i in range(MAX_RETRIES):
#         print(f"Generating queries {i} times")
#         queries = get_response_with_retry(model, messages)[0]
#         queries = validate_and_fix_python_list(queries)
#         if queries is not None:
#             break
#     if queries is None:
#         raise Exception("Failed to generate queries")
#     return queries


# def retrieve_from_videograph(videograph, question, topk=3):
#     queries = generate_queries(question)
#     print(f"Queries: {queries}")

#     model = "text-embedding-3-large"
#     query_embeddings = parallel_get_embedding(model, queries)[0]

#     related_nodes = []

#     for query_embedding in query_embeddings:
#         nodes = videograph.search_text_nodes(query_embedding)
#         related_nodes.extend(nodes)

#     related_nodes = list(set(related_nodes))
#     return related_nodes


# question = "Denny"
# retrieved_nodes = retrieve_from_videograph(video_graph, question)
# print(retrieved_nodes)