In [2]:
import numpy as np

# Re-create the builtin-type aliases (np.float, np.int, ...) on the numpy
# module whenever the installed numpy does not expose them -- presumably for
# dependencies that still reference these names (confirm which ones need it).
for _alias, _builtin in (
    ("float", float),
    ("int", int),
    ("bool", bool),
    ("object", object),
    ("long", int),
):
    if not hasattr(np, _alias):
        setattr(np, _alias, _builtin)
# Do not leave the loop helpers behind in the notebook namespace.
del _alias, _builtin

from PIL import Image

# Restore resampling-filter aliases on PIL.Image when the installed Pillow
# does not provide them, mapping each one to its current equivalent.
if not hasattr(Image, "LINEAR"):
    Image.LINEAR = Image.BILINEAR
if not hasattr(Image, "CUBIC"):
    Image.CUBIC = Image.BICUBIC
if not hasattr(Image, "NEAREST"):
    # The previous right-hand side was `Image.NEAREST` itself, i.e. a read of
    # the very attribute this guard just found missing, which would raise
    # AttributeError. Take the value from the Resampling enum instead.
    Image.NEAREST = Image.Resampling.NEAREST

import torch.fx

from trackron.models import build_model

  if not hasattr(np, "bool"):
  if not hasattr(np, "object"):
  if not hasattr(np, "long"):
  if not hasattr(Image, "LINEAR"):
  if not hasattr(Image, "CUBIC"):
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from trackron.config import get_cfg
import trackron.config.model_configs as mc

In [4]:
# Build the base config and pass it through the UTT helper; the helper's
# return value is what the rest of the notebook uses as `cfg`.
cfg = mc.add_utt_config(get_cfg())


In [13]:
# Display the config tree for inspection.
# NOTE(review): this cell's execution count (13) is out of order with the
# neighbouring cells (4 and 5), so the recorded output may not reflect the
# state at this point of a top-to-bottom run -- re-run with Restart & Run All.
cfg

CfgNode({'VERSION': 2, 'MODEL': CfgNode({'META_ARCHITECTURE': 'DiMPNet', 'DEVICE': 'cuda', 'WEIGHTS': '', 'PIXEL_MEAN': [0.485, 0.456, 0.406], 'PIXEL_STD': [0.229, 0.224, 0.225], 'POSITION_EMBEDDING': 'sine', 'HIDDEN_DIM': 512, 'BACKBONE': CfgNode({'NAME': 'resnet50', 'PRETRAIN': False, 'OUTPUT_LAYERS': ['layer2', 'layer3', 'layer4'], 'CLS_LAYERS': ['layer3'], 'STRIDE': 16, 'DILATION': False, 'NORM': 'BN', 'USE_POSITION': False, 'FROZEN_STAGES': -1}), 'NUM_CLASS': 1, 'FEATURE_LAYERS': ['layer2', 'layer3', 'layer4'], 'NUM_FEATURE_LAYERS': 4, 'FEATURE_DIM': 256, 'OBJECT_SIZE': 1, 'NUM_QUERIES': 500, 'TWO_STAGE': False, 'BOX_REFINE': True, 'NORM': 'BN', 'ENCODER': CfgNode({'NUM_LAYERS': 6, 'NORM': 'relu', 'HEADS': 8, 'DROPOUT': 0.1, 'DIM_FEEDFORWARD': 1024, 'NUM_POINTS': 4}), 'DECODER': CfgNode({'NUM_LAYERS': 6, 'NORM': 'relu', 'HEADS': 8, 'DROPOUT': 0.1, 'DIM_FEEDFORWARD': 1024, 'PRE_NORM': False, 'NUM_POINTS': 4}), 'BOX_HEAD': CfgNode({'NAME': 'MLP', 'REFINE': True, 'PATCH_DIM': 484, 'I

In [5]:
# Overlay the UTT YAML config on top of the values already in `cfg`.
# NOTE(review): the path is absolute and machine-specific; consider deriving it
# from a configurable root.
# NOTE(review): allow_unsafe=True presumably enables unsafe YAML loading --
# confirm against the config class and only use it with trusted files.
cfg.merge_from_file("/src/trackron/configs/utt/utt.yaml", allow_unsafe=True)



In [26]:
# Inspect the dataset root configured for the SOT section.
# NOTE(review): execution count (26) is out of order with the surrounding
# cells; the recorded output comes from a different point in the session.
cfg.SOT.DATASET.ROOT

'./data'

In [6]:
# Freeze the config before it is handed to build_model.
# Presumably this makes `cfg` read-only from here on -- confirm against the
# config class.
cfg.freeze()

In [7]:
import cv2
from contextlib import contextmanager
from datetime import datetime, time, timedelta

@contextmanager
def open_video_capture(*args, **kwargs):
    """Context manager that opens a ``cv2.VideoCapture`` and always releases it.

    Args:
        *args: Positional arguments forwarded to the VideoCapture constructor.
        **kwargs: Keyword arguments forwarded to the VideoCapture constructor.

    Yields:
        The opened capture object.

    Raises:
        ValueError: If the capture reports that it is not opened.
    """
    capture = cv2.VideoCapture(*args, **kwargs)
    try:
        if capture.isOpened():
            yield capture
        else:
            raise ValueError("Failed to open video source")
    finally:
        # Runs on normal exit, on failure to open, and when the caller's
        # with-body raises.
        capture.release()

def valut_to_time(**kwargs) -> time:
    """Convert a numeric duration into a ``datetime.time`` object.

    Args:
        **kwargs: Keyword-only arguments accepted by the ``timedelta``
            constructor (e.g. ``milliseconds=1500``).

    Returns:
        time: The time of day obtained by adding the duration to
        ``datetime.min``.
    """
    offset = timedelta(**kwargs)
    shifted = datetime.min + offset
    return shifted.time()

def read_image_set(cap: cv2.VideoCapture) -> tuple[bool, cv2.typing.MatLike, time, int]:
    """Wrap ``cap.read()`` and also report the timestamp and frame position.

    Args:
        cap: An OpenCV VideoCapture instance.

    Returns:
        bool: Whether a frame was obtained, as reported by ``cap.read()``.
        MatLike: The frame returned by ``cap.read()`` (presumably empty/None
            when the flag is False -- confirm against OpenCV).
        datetime.time: ``CAP_PROP_POS_MSEC`` converted to a time of day, or
            the current wall-clock time when that property is negative.
        int: ``CAP_PROP_POS_FRAMES`` after the read.
    """
    ret, img = cap.read()

    milliseconds = int(cap.get(cv2.CAP_PROP_POS_MSEC))
    # Decide on the fallback BEFORE converting: valut_to_time adds the offset
    # to datetime.min, which raises OverflowError for a negative offset, so
    # checking afterwards (as before) could never reach the fallback.
    if milliseconds < 0:
        timestamp = datetime.now().time()
    else:
        timestamp = valut_to_time(milliseconds=milliseconds)

    frame_count = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
    return ret, img, timestamp, frame_count

def set_frame_pos(cap: cv2.VideoCapture, index: int) -> bool:
    """Move the capture to the given frame position.

    Args:
        cap (VideoCapture): The video capture to reposition.
        index (int): Target frame position.

    Returns:
        bool: The value returned by ``cap.set`` for ``CAP_PROP_POS_FRAMES``.
    """
    moved = cap.set(cv2.CAP_PROP_POS_FRAMES, index)
    return moved

In [8]:
video_path = "145599_640x360.mp4"
with open_video_capture(video_path) as cap:
    # First frame becomes the template, the following frame the search image.
    # Each read is checked on its own: previously `success` was overwritten by
    # the second read and never inspected, so a failed read went unnoticed
    # until a later cell touched the frame.
    success, tmpl_bgr, tmpl_ts, _ = read_image_set(cap)
    if not success:
        raise RuntimeError(f"Failed to read the template frame from {video_path!r}")
    success, search_bgr, search_ts, _ = read_image_set(cap)
    if not success:
        raise RuntimeError(f"Failed to read the search frame from {video_path!r}")

In [39]:
# Check the template frame's array shape.
# NOTE(review): execution count (39) is out of order with the surrounding cells.
tmpl_bgr.shape

(360, 640, 3)

In [9]:
def to_chw_tensor(img_bgr, mean, std):
    """Convert a BGR image into a normalised float32 tensor in (C,H,W) order.

    The image is converted BGR -> RGB, scaled by 1/255, then normalised as
    ``(img - mean) / std`` along the last (channel) axis.

    Args:
        img_bgr: Image array accepted by ``cv2.cvtColor`` with
            ``COLOR_BGR2RGB`` (assumed to be H x W x 3 -- the transpose below
            requires three axes).
        mean: Per-channel mean; any array-like, coerced to float32.
        std: Per-channel standard deviation; any array-like, coerced to float32.

    Returns:
        torch.Tensor: float32 tensor with axes reordered to (C,H,W).
    """
    # Coerce the statistics so that plain lists or float64 arrays do not
    # silently promote the result to float64.
    mean = np.asarray(mean, dtype=np.float32)
    std = np.asarray(std, dtype=np.float32)

    img = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    img = (img - mean) / std
    return torch.from_numpy(img.transpose(2, 0, 1))  # (C,H,W) float32

In [10]:
# Per-channel normalisation statistics from the config, as float32 arrays.
mean, std = (
    np.array(stat, dtype=np.float32)
    for stat in (cfg.MODEL.PIXEL_MEAN, cfg.MODEL.PIXEL_STD)
)

# Normalised (C,H,W) tensors for the template and search frames.
# NOTE(review): despite the `_bgr` suffix, to_chw_tensor converts to RGB.
chw_tmpl_bgr, chw_search_bgr = (
    to_chw_tensor(frame, mean, std) for frame in (tmpl_bgr, search_bgr)
)

In [11]:
# NOTE(review): this passes the raw BGR frames, not the normalised tensors
# (chw_tmpl_bgr / chw_search_bgr) -- confirm which one the model expects.
# NOTE(review): `batched_inputs` is not used by any cell visible here; the
# forward pass is fed `data` instead.
batched_inputs = [{
    # Typical layout for UTT/SOT: template and search images under separate keys
    "template_images": [tmpl_bgr],      # length is cfg.SOT.TEMPLATE.FRAMES
    "search_images":   [search_bgr],    # length is cfg.SOT.SEARCH.FRAMES
    # Target box(es) of the reference frame
    "ref_boxes": np.zeros((1, 4)),  # (N,4); here N=1
    # Keep height/width as well, since later post-processing may use them
    "height": search_bgr.shape[0],
    "width":  search_bgr.shape[1],
}]

In [12]:
# Dummy single-sample batch used to smoke-test the model's forward pass.
# The random images are drawn from a locally seeded generator so that the
# inputs (and therefore the recorded outputs) are the same on every run,
# without touching torch's global RNG state.
_data_rng = torch.Generator().manual_seed(0)
data = [
    {
        "template_images": torch.randn(3, 256, 256, generator=_data_rng),
        "template_boxes": torch.tensor([[50, 60, 120, 150]]),
        "template_labels": torch.tensor([1]),
        "search_images": torch.randn(3, 256, 256, generator=_data_rng),
        "search_boxes": torch.tensor([[55, 65, 125, 155]]),
        "search_labels": torch.tensor([1]),
        "matched_indices": {0: 0},
    }
]

In [14]:
# NOTE(review): `torch` is already bound by the earlier `import torch.fx`, and
# earlier cells use it; this import belongs in the top import cell.
import torch

# Build the model from the frozen config and switch it to evaluation mode.
model = build_model(cfg)
model.eval()
# Single forward pass on the dummy batch with gradient tracking disabled.
# NOTE(review): the meaning of mode="mot" is defined by the model's forward
# implementation, which is not visible here -- confirm it is the intended mode.
with torch.no_grad():
    ret = model(data, mode="mot")

  max_size = (max_size + (stride - 1)) // stride * stride
  dim_t = self.temperature**(2 * (dim_t // 2) / self.num_pos_feats)


In [15]:
# Inspect what the forward pass returned.
# NOTE(review): the recorded output contains 'DetLoss/...' entries even though
# model.eval() was called -- confirm this is expected for mode="mot".
ret

({'DetLoss/loss_ce': tensor(1.1799, device='cuda:0'),
  'DetLoss/loss_bbox': tensor(0.2066, device='cuda:0'),
  'DetLoss/loss_giou': tensor(0.3806, device='cuda:0'),
  'DetLoss/loss_ce_0': tensor(0.9082, device='cuda:0'),
  'DetLoss/loss_bbox_0': tensor(0.3347, device='cuda:0'),
  'DetLoss/loss_giou_0': tensor(0.5397, device='cuda:0'),
  'DetLoss/loss_ce_1': tensor(1.0260, device='cuda:0'),
  'DetLoss/loss_bbox_1': tensor(0.2443, device='cuda:0'),
  'DetLoss/loss_giou_1': tensor(0.4698, device='cuda:0'),
  'DetLoss/loss_ce_2': tensor(1.1054, device='cuda:0'),
  'DetLoss/loss_bbox_2': tensor(0.2034, device='cuda:0'),
  'DetLoss/loss_giou_2': tensor(0.4191, device='cuda:0'),
  'DetLoss/loss_ce_3': tensor(1.1454, device='cuda:0'),
  'DetLoss/loss_bbox_3': tensor(0.2052, device='cuda:0'),
  'DetLoss/loss_giou_3': tensor(0.3728, device='cuda:0'),
  'DetLoss/loss_ce_4': tensor(1.1632, device='cuda:0'),
  'DetLoss/loss_bbox_4': tensor(0.2101, device='cuda:0'),
  'DetLoss/loss_giou_4': tensor(