#### Подготовка окружения (импорт библиотек)

In [1]:
import json
from pathlib import Path

import cv2
import numpy as np
import torch
import torchvision.transforms as T

from scipy.spatial.distance import cosine
from sklearn.cluster import DBSCAN
from torchvision.models import resnet50, ResNet50_Weights
from tqdm.notebook import tqdm

#### Конфигурация параметров

In [3]:
# Configuration
BASE_DIR = Path().resolve()                         # Root directory of the project
TRAIN_VIDEO_DIR = BASE_DIR / "data_train_short/"    # directory with train .mp4 files named as <video_id>.mp4
TEST_VIDEO_DIR = BASE_DIR / "data_test_short/"      # directory with test  .mp4 files named as <video_id>.mp4
TRAIN_JSON = BASE_DIR / "train_labels.json"         # json mapping train video_id -> {"start", "end"}
TEST_JSON = BASE_DIR / "test_labels.json"           # json mapping test  video_id -> {"start", "end"}
FRAME_RATE = 2                                      # fps for sampling
WINDOW_SEC = 15                                     # window length in seconds
STRIDE_SEC = 1                                      # stride in seconds
EPS = 0.5                                           # DBSCAN eps for clustering
MIN_SAMPLES = 5                                     # DBSCAN min_samples

In [4]:
# Device selection: MPS > CUDA > CPU
if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

print("Model use device:", DEVICE)

Model use device: mps


#### Инициализация модели и трансформаций

In [5]:
# Select ResNet50 pretrained model
torch.set_grad_enabled(False)                       # Disable gradients globally for inference
model = resnet50(weights=ResNet50_Weights.DEFAULT)
model.fc = torch.nn.Identity()                      # Remove final classification layer
model.to(DEVICE)
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [6]:
# Image preprocessing pipeline
transform = T.Compose([
    T.ToPILImage(),
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

#### Извлечение кадров и признаков, подготовка окна

In [7]:
# Convert time string to seconds
def time_to_seconds(t_str):
    parts = list(map(int, t_str.split(':')))
    if len(parts) == 3:
        h, m, s = parts
        return h * 3600 + m * 60 + s
    elif len(parts) == 2:
        m, s = parts
        return m * 60 + s
    else:
        raise ValueError(f"Invalid time format: {t_str}")


In [8]:
# Extract frames at a given sampling rate
def extract_frames(video_path: Path, rate: int = FRAME_RATE) -> list[np.ndarray]:
    cap = cv2.VideoCapture(str(video_path))
    fps = cap.get(cv2.CAP_PROP_FPS) or rate
    step = max(int(fps / rate), 1)
    frames = []
    idx = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if idx % step == 0:
            frames.append(frame)
        idx += 1
    cap.release()
    return frames

In [9]:
# Extract feature vectors for a list of frames
def extract_features(frames: list[np.ndarray]) -> np.ndarray:
    feats = []
    for img in frames:
        tensor = transform(img).unsqueeze(0).to(DEVICE)
        feature = model(tensor).cpu().numpy().ravel()
        feats.append(feature)
    return np.vstack(feats)

In [10]:
# Generate sliding windows of feature vectors
def sliding_windows(feats: np.ndarray, window_size: int, stride: int) -> list[tuple[int, np.ndarray]]:
    n_frames = feats.shape[0]
    return [
        (start, feats[start:start + window_size].mean(axis=0))
        for start in range(0, n_frames - window_size + 1, stride)
    ]


#### Построение шаблона на обучающей выборке

In [None]:
# Build template from training data
with open(TRAIN_JSON, 'r') as f:
    train_labels = json.load(f)

train_labels = dict(list(train_labels.items())[:15])

all_feats, ids = [], []
window_size = int(WINDOW_SEC * FRAME_RATE)
stride = int(STRIDE_SEC * FRAME_RATE)

for vid in tqdm(train_labels, desc="Building template"):
    video_path = TRAIN_VIDEO_DIR / vid / f"{vid}.mp4"
    frames = extract_frames(video_path)
    feats = extract_features(frames)
    for start, vec in sliding_windows(feats, window_size, stride):
        all_feats.append(vec)
        ids.append(vid)

all_feats = np.vstack(all_feats)
labels = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES).fit_predict(all_feats)

# Select the largest cluster as template
from collections import Counter
counter = Counter(labels[labels >= 0])
best_label = counter.most_common(1)[0][0]
template = all_feats[labels == best_label].mean(axis=0)

Building template:   0%|          | 0/15 [00:00<?, ?it/s]

#### Делаем предсказания на test

In [11]:
# Detect event segments in test videos
with open(TEST_JSON, 'r') as f:
    test_labels = json.load(f)

train_labels = dict(list(train_labels.items())[:5])

preds = {}
for vid, info in tqdm(test_labels.items(), desc="Detecting events"):
    video_file = TEST_VIDEO_DIR / vid / f"{vid}.mp4"
    frames = extract_frames(video_file)
    feats = extract_features(frames)
    best_start, best_dist = 0, float('inf')
    for start, vec in sliding_windows(feats, window_size, stride):
        dist = cosine(template, vec)
        if dist < best_dist:
            best_start, best_dist = start, dist
    preds[vid] = {
        'start': best_start / FRAME_RATE,
        'end': (best_start + window_size) / FRAME_RATE,
    }

Test videos:   0%|          | 0/45 [00:00<?, ?it/s]

[h264 @ 0x355bc7780] Invalid NAL unit size (8247 > 2791).
[h264 @ 0x355bc7780] missing picture in access unit with size 2795
[h264 @ 0x355be72b0] Invalid NAL unit size (8247 > 2791).
[h264 @ 0x355be72b0] Error splitting the input into NAL units.
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x348ac3a00] stream 0, offset 0x12c01550: partial file
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x348ac3a00] stream 0, offset 0x12c036be: partial file
[h264 @ 0x35497bda0] Invalid NAL unit size (14648 > 13963).
[h264 @ 0x35497bda0] missing picture in access unit with size 13967
[h264 @ 0x367520330] Invalid NAL unit size (14648 > 13963).
[h264 @ 0x367520330] Error splitting the input into NAL units.
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x355b71730] stream 0, offset 0x12c002ad: partial file
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x355b71730] stream 0, offset 0x12c038d8: partial file
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x127ff2d20] stream 1, offset 0x12c00063: partial file
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x127ff2d20] stream 1, offset 0x12c001ac: partial file
[h264 @ 0x3

KeyboardInterrupt: 

#### Вычисление метрик качества

In [None]:
# Evaluation metrics
ios, start_err, end_err = [], [], []
for vid, info in preds.items():
    true_start = time_to_seconds(test_labels[vid]['start'])
    true_end = time_to_seconds(test_labels[vid]['end'])
    pred_start = preds[vid]['start']
    pred_end = preds[vid]['end']

    intersection = max(0, min(pred_end, true_end) - max(pred_start, true_start))
    union = (pred_end - pred_start) + (true_end - true_start) - intersection
    ios.append(intersection / union if union > 0 else 0)
    start_err.append(abs(pred_start - true_start))
    end_err.append(abs(pred_end - true_end))

In [None]:
print(f"Mean IoU:       {np.mean(ios):.3f}")
print(f"Mean start err: {np.mean(start_err):.3f}s")
print(f"Mean end err:   {np.mean(end_err):.3f}s")