Я взял data_test_short

Сначала избавимся от неправильно размеченных данных — тех, у которых начало вставки позже её конца. Я попробовал пройтись по таким сериям вручную и понял, что ошибка разметки не одинакова: например, замена start на end не исправит ситуацию в общем случае, а полный пайплайн предобработки явно выходит за рамки тестового задания.

In [67]:
import os
import json
import subprocess
import torch
import numpy as np
import cv2
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModel
import torchaudio
from tqdm import tqdm
import collections


def read_json(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        The deserialized JSON value.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


data = read_json('/kaggle/input/vk-title-test/data_test_short/labels.json')


def _time_fields(stamp):
    """Split a colon-separated time string into a tuple of integers."""
    return tuple(int(part) for part in stamp.split(':'))


# Keep only the videos whose labelled intro starts strictly before it ends.
# The fields are compared as integers: comparing the raw strings is
# lexicographic and mis-orders values that are not zero-padded
# (e.g. '0:9:30' would sort after '0:10:00').
valid = [
    video
    for video, label in data.items()
    if _time_fields(label['start']) < _time_fields(label['end'])
]

Будем рассматривать только первые две минуты — как правило, заставки подобного рода встречаются в самом начале. Данные подтверждают это наблюдение.

In [3]:
# Source dataset directory and the destination for the trimmed copies.
input_path = '/kaggle/input/vk-title-test/data_test_short'
output_path = '/kaggle/working/cut_videos'
max_duration_seconds = 120

os.makedirs(output_path, exist_ok=True)

for video_name in tqdm(valid, desc="Обрезка видео"):
    # Each video lives in a folder named after it: <name>/<name>.mp4
    file_name = f"{video_name}.mp4"
    input_video_path = os.path.join(input_path, video_name, file_name)
    output_video_path = os.path.join(output_path, file_name)

    # Keep only the first `max_duration_seconds` seconds, copying the streams
    # without re-encoding; `-y` overwrites an existing output file.
    # check=True raises CalledProcessError if ffmpeg exits with an error.
    subprocess.run(
        [
            'ffmpeg',
            '-y',
            '-i', input_video_path,
            '-t', str(max_duration_seconds),
            '-c', 'copy',
            output_video_path,
        ],
        check=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

print('Done')

Обрезка видео: 100%|██████████| 40/40 [00:28<00:00,  1.42it/s]

Done





In [68]:
# Length of the sliding window, in seconds (one frame is sampled per second).
WINDOW_SIZE = 5
# NOTE(review): not referenced by the code visible in this notebook —
# presumably the intended frame sampling rate; confirm before relying on it.
FPS_SAMPLE = 1
# Number of KMeans clusters the window embeddings are split into.
N_CLUSTERS = 2

# Run the models on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [69]:
def extract_window_embeddings(video_path):
    """Compute joint image+audio embeddings for sliding windows of a video.

    A window of ``WINDOW_SIZE`` seconds is slid over the video with a stride
    of one second.  For every window, one frame per second is encoded with
    ``clip_model`` and the frame features are averaged; the matching slice of
    the audio track (channel-averaged, resampled to 16 kHz) is encoded with
    ``audio_model`` and the first position of its ``last_hidden_state`` is
    taken.  The two vectors are concatenated into one row.

    Relies on the module-level ``clip_processor``, ``clip_model``,
    ``audio_processor`` and ``audio_model`` objects.

    Args:
        video_path: Path to a video file readable by OpenCV and torchaudio.

    Returns:
        A tuple ``(embeddings, timestamps)``: a 2-D array with one row per
        window, and the list of the corresponding window start times in
        seconds.

    Raises:
        ValueError: If the video cannot be opened, reports a non-positive
            frame rate, or yields no usable window.
    """
    cap = cv2.VideoCapture(video_path)
    # try/finally guarantees the capture handle is released on any error.
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        # `not fps > 0` also rejects NaN, which `fps <= 0` would let through.
        if not cap.isOpened() or not fps > 0:
            raise ValueError(f"Cannot read a video stream from {video_path!r}")
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = total_frames / fps

        waveform, sr = torchaudio.load(video_path)
        if sr != 16000:
            waveform = torchaudio.functional.resample(waveform, sr, 16000)
            sr = 16000
        mono_wave = waveform.mean(dim=0)

        embeddings, timestamps = [], []
        for start in range(0, int(duration) - WINDOW_SIZE + 1):
            frame_imgs = []
            for off in range(WINDOW_SIZE):
                cap.set(cv2.CAP_PROP_POS_FRAMES, int((start + off) * fps))
                ret, frame = cap.read()
                # The reported frame count can exceed what is decodable;
                # skip frames that could not be read instead of crashing.
                if not ret or frame is None:
                    continue
                frame_imgs.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            s_sample = start * sr
            e_sample = (start + WINDOW_SIZE) * sr
            audio_np = mono_wave[s_sample:e_sample].cpu().numpy()

            # A window with no decodable frame or no audio samples (audio
            # track shorter than the video) cannot be embedded; skip it.
            if not frame_imgs or audio_np.size == 0:
                continue

            inputs_img = clip_processor(images=frame_imgs, return_tensors='pt').to(DEVICE)
            with torch.no_grad():
                img_feats = clip_model.get_image_features(**inputs_img)
            # Average the per-frame features into one vector per window.
            img_emb = img_feats.mean(dim=0)

            audio_inputs = audio_processor(audio_np, sampling_rate=sr, return_tensors='pt', padding=True)
            audio_inputs = {k: v.to(DEVICE) for k, v in audio_inputs.items()}
            with torch.no_grad():
                aud_out = audio_model(**audio_inputs)
            audio_emb = aud_out.last_hidden_state[:, 0, :].squeeze(0)

            combined = torch.cat([img_emb, audio_emb], dim=0).cpu().numpy()
            embeddings.append(combined)
            timestamps.append(start)
    finally:
        cap.release()

    if not embeddings:
        raise ValueError(
            f"No usable {WINDOW_SIZE}-second window found in {video_path!r}"
        )
    return np.vstack(embeddings), timestamps

In [70]:
# Per-video results, keyed by the file name without its extension.
embeddings_dict = {}
timestamps_dict = {}

# The trimmed videos produced earlier are the input of this step.
INPUT_DIR = output_path

for fn in tqdm(os.listdir(INPUT_DIR), desc='Extract embeddings'):
    if fn.endswith('.mp4'):
        video_name = fn[:-len('.mp4')]
        embs, times = extract_window_embeddings(os.path.join(INPUT_DIR, fn))
        # Normalize each window embedding (row) before storing it.
        embeddings_dict[video_name] = normalize(embs, axis=1)
        timestamps_dict[video_name] = times

Extract embeddings: 100%|██████████| 40/40 [35:28<00:00, 53.21s/it]


In [71]:
def parse_time_str(t: str) -> int:
    """Convert an ``H:M:S`` time string into a total number of seconds.

    Args:
        t: Three colon-separated integer fields (hours, minutes, seconds).

    Returns:
        The time expressed in seconds.

    Raises:
        ValueError: If a field is not an integer or there are not exactly
            three fields.
    """
    hours, minutes, seconds = (int(field) for field in t.split(':'))
    return 3600 * hours + 60 * minutes + seconds

def compute_metrics(gt: dict, pred: dict):
    """Score predicted intervals against ground-truth intervals.

    Args:
        gt: Maps a video id to its ground-truth ``(start, end)`` pair.
        pred: Maps a video id to its predicted ``(start, end)`` pair; it must
            contain every key of ``gt``.

    Returns:
        A tuple ``(metrics, summary)``.  ``metrics`` maps each video id to a
        dict with ``'start_err'``, ``'end_err'`` (absolute boundary errors)
        and ``'iou'``; ``summary`` holds the means of those three values
        over all videos.

    Raises:
        KeyError: If a video id from ``gt`` is missing in ``pred``.
    """
    metrics = {}
    for vid, (gt_s, gt_e) in gt.items():
        pred_s, pred_e = pred[vid]

        # Overlap is clamped at zero for disjoint intervals; the denominator
        # is the span from the earliest start to the latest end.
        overlap = max(0, min(pred_e, gt_e) - max(pred_s, gt_s))
        span = max(pred_e, gt_e) - min(pred_s, gt_s)

        metrics[vid] = {
            'start_err': abs(pred_s - gt_s),
            'end_err': abs(pred_e - gt_e),
            'iou': overlap / span if span > 0 else 0,
        }

    per_video = list(metrics.values())
    summary = {
        'mean_start_err': np.mean([row['start_err'] for row in per_video]),
        'mean_end_err': np.mean([row['end_err'] for row in per_video]),
        'mean_iou': np.mean([row['iou'] for row in per_video]),
    }
    return metrics, summary

In [None]:
# Predicted (start_sec, end_sec) of the intro for every video.
results = {}
for name, embs in embeddings_dict.items():
    # Keep as many principal components as needed to explain 95% of variance.
    pca = PCA(n_components=0.95, svd_solver='full')
    embs_reduced = pca.fit_transform(embs)

    # A fixed seed makes the cluster assignment reproducible between runs.
    labels = KMeans(n_clusters=N_CLUSTERS, random_state=42).fit_predict(embs_reduced)
    # Heuristic: the least populated cluster is taken to be the intro.
    counts = np.bincount(labels)
    intro_lbl = np.argmin(counts)
    intro_idxs = np.where(labels == intro_lbl)[0]

    # NOTE: both values are window *start* times; the window length is not
    # added to the end boundary.
    start_sec = timestamps_dict[name][intro_idxs.min()]
    end_sec = timestamps_dict[name][intro_idxs.max()]
    results[name] = (start_sec, end_sec)

In [76]:
# Ground-truth (start, end) of the intro in seconds for every valid video.
gt = {
    video: (
        parse_time_str(data[video]['start']),
        parse_time_str(data[video]['end']),
    )
    for video in valid
}

Как видим, большинство серий разметились хорошо; проблемы есть с некоторыми сериалами, например «Баскетс».

In [82]:
# Evaluate the predictions and print a per-video report and the averages.
per_video, summary = compute_metrics(gt, results)

print("Per-video metrics:")
for vid in per_video:
    m = per_video[vid]
    print(f"{vid}: start_err={m['start_err']}s, end_err={m['end_err']}s, IoU={m['iou']:.3f}")

print("Summary:", summary)

Per-video metrics:
-220020068_456249220: start_err=5s, end_err=0s, IoU=0.688
-220020068_456249373: start_err=5s, end_err=0s, IoU=0.688
-220020068_456249231: start_err=5s, end_err=0s, IoU=0.688
-220020068_456255339: start_err=10s, end_err=4s, IoU=0.300
-220020068_456249284: start_err=5s, end_err=0s, IoU=0.688
-220020068_456249192: start_err=5s, end_err=1s, IoU=0.625
-220020068_456249257: start_err=5s, end_err=1s, IoU=0.625
-220020068_456249375: start_err=5s, end_err=1s, IoU=0.625
-220020068_456249206: start_err=5s, end_err=0s, IoU=0.688
-220020068_456249376: start_err=5s, end_err=1s, IoU=0.625
-220020068_456249243: start_err=5s, end_err=1s, IoU=0.625
-220020068_456249344: start_err=5s, end_err=0s, IoU=0.688
-220020068_456249716: start_err=6s, end_err=7s, IoU=0.188
-220020068_456249259: start_err=5s, end_err=1s, IoU=0.625
-220020068_456255389: start_err=0s, end_err=8s, IoU=0.500
-220020068_456249720: start_err=6s, end_err=7s, IoU=0.188
-220020068_456249272: start_err=5s, end_err=1s, IoU=