# Video Cut Tool

Building on the model's temporal grounding and reasoning capabilities, we have further expanded its tool-use capabilities in video reasoning and designed the **VideoCut** tool. During inference, the model can **replay certain clips** to clearly watch details in the video, thereby improving reasoning accuracy.

During the inference process, the model outputs the **start and end times of clips** to be rewatched (supporting multiple clips) and the desired **slowdown frame rate** (currently supporting 1-5 FPS). VideoCut resamples the video clips based on the start/end times and frame rate, enabling the model to clearly perceive details.

### 0. Set up the environment
Import necessary libraries and set up API access and client configuration for model inference.


In [None]:
pip install -r ../requirements.txt

In [2]:
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: Apache-2.0

# Please set the API key here
import os

# Publish the Ark API key and the model endpoint through the process
# environment; later cells read both values back from os.environ.
os.environ.update({
    'ARK_API_KEY': 'your_ark_api_key',
    'ARK_MODEL_ENDPOINT': "doubao-seed-1-8-251215",
})

In [4]:
import os
import re
import json
import base64
import shutil


import cv2
import numpy as np
from openai import OpenAI

from video_processing import process_video, sample_frames_from_video_bytes

# OpenAI SDK client pointed at the Ark API base URL; the key is read from the
# ARK_API_KEY environment variable (None if it has not been set).
client = OpenAI(
    api_key=os.getenv("ARK_API_KEY"),
    base_url="https://ark.cn-beijing.volces.com/api/v3",
)

### 1. VideoCut Function Definition
Define the tool that clips and resamples segments for clearer perception.

In [5]:
def videocut(
    video: str | bytes,
    timestamps: str,
    fps: int = 1,
    video_max_sequence_length: int = 32768,
    token_sets: tuple = (32, 64, 96, 128, 160, 192, 224, 256),
) -> dict:
    """
    "name": "VIDEOCUT",
    "description": "Clips a video segment based on precise timestamps and FPS parameters for clearer playback.",
    "parameters": {
        "type": "object",
        "properties": {
            "timestamps": {
                "type": "string",
                "description": "The timestamps (two float numbers indicate start and end seconds) determines where to cut the video. It is a string of the form 's1 - s2' seconds. The start_time and end_time are floating-point numbers representing the absolute start and end times of the segment in seconds."
            },
            "fps": {
                "type": "number",
                "description": "An integer number from 1 to 5, representing the sampling FPS for video clip. If not provided, the video will be sampled by 1 FPS by default."
            }
        },
        "required": [
            "timestamps"
        ],
        "additionalProperties": False
    }
    """
    if isinstance(video, str):
        with open(video, 'rb') as f:
            video_bytes = f.read()
    elif isinstance(video, bytes):
        video_bytes = video
    else:
        raise ValueError('Invalid video type. Must be str or bytes.')

    # check timestamps, multiple timestamps or single timestamps
    timestamps = timestamps.split(',')
    timestamps = [m.strip() for m in timestamps]
    output_frame_bytes = []
    output_timestamps = []

    # cut video clips from timestamps
    sampling_segments = []
    num_frames_estimated = 0
    for orig_timestamp in timestamps:
        pattern = r'\d+(?:\.\d+)?'
        timestamp = re.findall(pattern, orig_timestamp)
        timestamp = [float(m) for m in timestamp]
        sample_fps = int(fps)
        if len(timestamp) != 2:
            raise ValueError('Invalid segment format. Must be s1 - s2 seconds. {}'.format(orig_timestamp))
        start_sec, end_sec = timestamp
        sampling_segments.append((start_sec, end_sec))
        num_frames_estimated += int((end_sec - start_sec) * sample_fps)

    max_frame_token = token_sets[0]
    for token in token_sets[1:]:
        if token * num_frames_estimated <= video_max_sequence_length:
            max_frame_token = token

    for start_sec, end_sec in sampling_segments:
        sampled_frames, sampled_timestamps = sample_frames_from_video_bytes(
            video_bytes,
            start_sec,
            end_sec,
            sample_fps,
            tokens_per_image=max_frame_token,
        )
        output_frame_bytes.extend(sampled_frames)
        output_timestamps.extend(sampled_timestamps)

    return output_frame_bytes, output_timestamps

Define the tool schema for VideoCut.

In [6]:
# JSON-schema description of the single "timestamps" argument of VIDEOCUT.
_videocut_timestamps_param = {
    "type": "string",
    "description": "The timestamps (two float numbers indicate start and end seconds) determines where to cut the video. It is a string of the form 's1 - s2' seconds. The start_time and end_time are floating-point numbers representing the absolute start and end times of the segment in seconds."
}

# JSON-schema description of the optional "fps" argument of VIDEOCUT.
_videocut_fps_param = {
    "type": "number",
    "description": "An integer number from 1 to 5, representing the sampling FPS for video clip. If not provided, the video will be sampled by 1 FPS by default."
}

# Function declaration for VIDEOCUT; only "timestamps" is required.
# NOTE(review): the nested "type": "function" repeats the outer key -- confirm
# whether the endpoint actually needs it.
_videocut_function = {
    "type": "function",
    "name": "VIDEOCUT",
    "description": "Clips a video segment based on precise timestamps and FPS parameters for clearer playback.",
    "parameters": {
        "type": "object",
        "properties": {
            "timestamps": _videocut_timestamps_param,
            "fps": _videocut_fps_param,
        },
        "required": ["timestamps"],
    },
}

# Tool list handed to the chat-completions call when tool use is enabled.
tool_schemas = [{"type": "function", "function": _videocut_function}]

### 2. Preprocess Video
Sample initial frames from the full video to bootstrap understanding.

In [7]:
def preprocess_video(video_path,
                     sampling_fps=1,
                     max_frames=1280,
                     max_video_length=81920):
    """Load a video file and sample the frames used for the first model turn.

    The file is read fully into memory and handed to ``process_video``
    together with the three sampling limits. Only the first two values of its
    three-part result are returned; the third is discarded.

    Args:
        video_path: Path of the video file on disk.
        sampling_fps: Forwarded unchanged to ``process_video``.
        max_frames: Forwarded unchanged to ``process_video``.
        max_video_length: Forwarded unchanged to ``process_video``.

    Returns:
        ``(video_base64_list, timestamps)`` as produced by ``process_video``.

    Raises:
        ValueError: If ``video_path`` does not exist.
    """
    video_missing = not os.path.exists(video_path)
    if video_missing:
        raise ValueError('Video file does not exist.')

    with open(video_path, 'rb') as video_file:
        raw_video = video_file.read()

    processed = process_video(raw_video, sampling_fps, max_frames,
                              max_video_length)
    frames_base64, frame_timestamps, _ = processed
    return frames_base64, frame_timestamps

### 3. Construct Video Message
Compose a user message with sampled frames and timestamps for inference.

In [8]:
def construct_video_message(prompt, video_frames, video_timestamps):
    """Build the single user message that carries the video and the question.

    Each frame contributes two content parts: a text label with its timestamp
    rounded to one decimal, followed by the frame as a base64 JPEG data URL.
    The prompt is appended as the final text part. Frames and timestamps are
    paired with ``zip``, so any surplus items in the longer sequence are
    ignored.

    Returns:
        A one-element list holding the user message dict.
    """
    contents = [
        part
        for frame_b64, seconds in zip(video_frames, video_timestamps)
        for part in (
            {"type": "text", "text": f'[{round(seconds, 1)} second]'},
            {"type": "image_url", "image_url": f"data:image/jpeg;base64,{frame_b64}"},
        )
    ]
    contents.append({"type": "text", "text": prompt})
    return [{"role": "user", "content": contents}]

In [9]:
def api_complete(client, messages, with_tool=False):
    """Run one chat-completion turn and record the reply in the history.

    ``messages`` is modified in place: when tools are enabled and the
    conversation still starts with a user turn, a system hint about VIDEOCUT
    is inserted at the front, and the assistant reply is always appended.

    Args:
        client: OpenAI-style client exposing ``chat.completions.create``.
        messages: Conversation history as a list of message dicts.
        with_tool: Whether to offer ``tool_schemas`` to the model.

    Returns:
        ``(message, messages)``: the assistant message object of the first
        choice and the same, now extended, history list.
    """
    # `messages and ...` keeps an empty history from raising IndexError.
    if with_tool and messages and messages[0]['role'] == 'user':
        # add hint
        messages.insert(
            0, {
                "role":
                "system",
                "content":
                "You can use available tools, such as VIDEOCUT, to help you perceive the video more accurately."
            })
    response = client.chat.completions.create(
        model=os.environ.get('ARK_MODEL_ENDPOINT'),
        messages=messages,
        reasoning_effort="medium",
        tools=tool_schemas if with_tool else None,
        stream=False,
        max_completion_tokens=32768)
    print(response)

    reply = response.choices[0].message
    assistant_message = {
        "role": "assistant",
        # A reply without text must not be stored as the literal string "None".
        "content": reply.content if reply.content is not None else "",
    }
    # Keep the tool calls on the recorded assistant turn so that the
    # `tool_call_id` of a following tool message refers to something that is
    # actually present in the history.
    tool_calls = getattr(reply, "tool_calls", None)
    if tool_calls:
        assistant_message["tool_calls"] = [
            {
                "id": call.id,
                "type": getattr(call, "type", "function"),
                "function": {
                    "name": call.function.name,
                    "arguments": call.function.arguments,
                },
            }
            for call in tool_calls
        ]
    messages.append(assistant_message)
    return reply, messages


### 4. Video Understanding with VideoCut
Use a multi-turn loop to improve video analysis. After viewing the initial low-FPS frames, the model can call `VIDEOCUT` on specific segments at a higher FPS to 'rewatch' moments of interest. This is ideal for tasks requiring detailed observation that might be missed in the initial low-FPS view. By feeding these high-detail clips back to the model, you enable it to refine its understanding and provide more accurate answers to complex video questions.

In [10]:
def tool_detect_execute(response, video_path):
    """Execute the VIDEOCUT calls requested by an assistant message.

    Every tool call on the message is answered with its own ``tool`` message,
    so no requested clip is silently dropped when the model issues several
    calls in one turn. The video file is read at most once.

    Args:
        response: Assistant message object exposing ``tool_calls``.
        video_path: Path of the video the clips are cut from.

    Returns:
        A list of ``tool`` role messages (one per call), each carrying the
        resampled frames interleaved with timestamp labels, or ``None`` when
        the message contains no tool call.

    Raises:
        ValueError: If a call names a function other than ``VIDEOCUT``.
        KeyError: If a call's arguments lack ``timestamps``.
    """
    if not response.tool_calls:
        print("No tool call detected.")
        return None

    video_bytes = None  # loaded lazily, shared by all calls of this turn
    tool_messages = []
    for tool_call in response.tool_calls:
        function = tool_call.function
        function_name = function.name
        function_args = json.loads(function.arguments)
        if function_name != 'VIDEOCUT':
            raise ValueError(f'Invalid function name: {function_name}')

        timestamps = function_args['timestamps']
        fps = int(function_args.get('fps', 1))
        print("[Call VIDEOCUT] timestamps: {}, fps: {}".format(timestamps, fps))
        if video_bytes is None:
            with open(video_path, 'rb') as f:
                video_bytes = f.read()
        video_frames, video_timestamps = videocut(video_bytes, timestamps, fps)

        # Same layout as the initial user message: a timestamp label followed
        # by the frame as a base64 JPEG data URL.
        video_contents = []
        for image_bytes, timestamp in zip(video_frames, video_timestamps):
            video_contents.append({
                "type": "text",
                "text": f'[{round(timestamp, 1)} second]'
            })
            video_contents.append({
                "type": "image_url",
                "image_url": f"data:image/jpeg;base64,{image_bytes}",
            })
        tool_messages.append({
            "role": "tool",
            "tool_call_id": tool_call.id,
            "content": video_contents
        })
    return tool_messages

In [None]:
video_path = "samples/Chaplin_512kb.mp4"
# You may need to download the video from https://ia800606.us.archive.org/15/items/CharlieChaplin/Chaplin_512kb.mp4
text_prompts = "To prevent theft, where did Chaplin hide the coat?"

# Sample the whole video at a low rate for the first turn.
sampling_fps = 1
sampled_frames, timestamps = preprocess_video(video_path, sampling_fps=sampling_fps, max_video_length=49152)
messages = construct_video_message(prompt=text_prompts, video_timestamps=timestamps, video_frames=sampled_frames)
history_messages = messages

# Upper bound on model calls so that a model which keeps requesting clips
# cannot loop (and bill) forever.
max_turns = 10

response = None
reasoning_content = None
for _ in range(max_turns):
    result, history_messages = api_complete(client, history_messages, with_tool=True)
    response = result.content
    reasoning_content = result.reasoning_content
    print("<think>{}</think>".format(reasoning_content))
    tool_message = tool_detect_execute(result, video_path)
    if tool_message is None:
        # No clip requested: this reply is the final answer.
        break
    # Feed the resampled clip back so the model can look again.
    history_messages.extend(tool_message)
else:
    print("Stopped after {} turns; the last reply still requested a tool call.".format(max_turns))

print("Seed1.8:\n<think>{}</think>\n{}".format(reasoning_content, response))


[video] 1643.8 second video, sampling 768 frames, 64 tokens per frame
ChatCompletion(id='021765528188121ce5ecf588ca94db1d51bc7611e0e34a2d80012', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageFunctionToolCall(id='call_fzgpz0np8j2b2205cvenykpj', function=Function(arguments='{"timestamps":"1075.9 - 1095.2 seconds","fps":2}', name='VIDEOCUT'), type='function')], reasoning_content="Got it, let's look at the part around 1078.0 to 1093.0 seconds. Wait, actually 1080.1 to 1090.9 seconds, Chaplin hides the coat under the blanket on the bed? No, wait let's use VIDEOCUT on timestamps 1075.9 - 1095.2 seconds with fps 2 to check."))], created=1765528204, model='doubao-seed-1-8-preview-251115', object='chat.completion', service_tier='default', system_fingerprint=None, usage=CompletionUsage(completion_tokens=144, prompt_toke