## 获取抠图后的视频

In [2]:
import cv2
import numpy as np
from pathlib import Path
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Input split to process; point the first entry at Path("./dataset/train")
# to run on the training split instead.
video_dir, mask_dir, save_dir = (
    Path("./dataset/val"),
    Path("./dataset/gt"),
    Path("./dataset/masked"),
)
# Create the output folder (and any missing parents) up front.
save_dir.mkdir(exist_ok=True, parents=True)

# All .mp4 files directly inside the input folder.
video_files = [*video_dir.glob("*.mp4")]

def process_video(video_path, mask_path, save_path):
    """Black out every pixel outside the mask and write the result as a video.

    Args:
        video_path: Path of the source video.
        mask_path: Path of a ``.npy`` array unpacked as ``(T, H, W)``; non-zero
            entries mark pixels to keep.
        save_path: Destination of the masked video (written with the ``mp4v``
            fourcc at the source frame rate).

    Returns:
        ``str(save_path)`` on success, otherwise a string starting with "❌"
        that carries the source file name and the error text. Exceptions are
        reported through the return value and are not propagated.
    """
    cap = None
    out = None
    try:
        mask = np.load(mask_path)  # (T, H, W)
        T, H, W = mask.shape

        cap = cv2.VideoCapture(str(video_path))
        if not cap.isOpened():
            raise RuntimeError(f"无法打开视频: {video_path.name}")

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Explicit raise instead of `assert`: asserts vanish under `python -O`.
        if frame_count != T:
            raise ValueError(
                f"帧数不一致: video({frame_count}) vs mask({T}) [{video_path.name}]"
            )

        fps = cap.get(cv2.CAP_PROP_FPS)
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(str(save_path), fourcc, fps, (W, H))
        if not out.isOpened():
            raise RuntimeError(f"无法创建输出视频: {save_path}")

        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
        for t in range(T):
            ok, frame = cap.read()
            if not ok:
                raise RuntimeError(f"读取第 {t} 帧失败: {video_path.name}")
            # The writer was sized from the mask, so a frame of another size
            # cannot be masked or written correctly; fail with a clear message.
            if frame.shape[:2] != (H, W):
                raise ValueError(
                    f"帧尺寸与 mask 不一致: frame{frame.shape[:2]} vs mask{(H, W)} "
                    f"[{video_path.name}]"
                )
            keep = mask[t].astype(bool)
            # Each read yields a fresh frame, so it can be masked in place.
            frame[~keep] = 0
            out.write(frame)
        return str(save_path)
    except Exception as e:
        return f"❌ {video_path.name}: {e}"
    finally:
        # Always free the decoder/encoder handles, including on failure.
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()

# Multithreaded processing.
# ThreadPoolExecutor raises ValueError for max_workers <= 0, so keep at least
# one worker even when no input video was found. The cap of 8 can be tuned to
# the number of CPUs.
max_workers = max(1, min(8, len(video_files)))

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    tasks = []
    for video_path in video_files:
        video_name = video_path.stem
        mask_path = mask_dir / f"{video_name}.npy"
        save_path = save_dir / f"{video_name}.mp4"
        # Videos without a matching mask file are reported and skipped.
        if not mask_path.exists():
            print(f"❌ 缺少 mask: {mask_path}, 跳过 {video_path.name}")
            continue
        tasks.append(executor.submit(process_video, video_path, mask_path, save_path))

    for f in tqdm(as_completed(tasks), total=len(tasks), desc="批量处理视频"):
        result = f.result()
        # process_video reports failures as strings prefixed with "❌";
        # successful results (the saved path) are not printed.
        if isinstance(result, str) and result.startswith("❌"):
            print(result)

print("✅ 全部处理完成！")


批量处理视频: 100%|██████████| 657/657 [00:24<00:00, 26.62it/s]

✅ 全部处理完成！





## 生成masked视频描述

In [2]:
# %% 推理函数
import numpy as np
import cv2, torch
from pathlib import Path
import PIL.Image as Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info

# Single source of truth for the checkpoint used by both processor and model.
model_id = "Qwen/Qwen2.5-VL-32B-Instruct"

# Inputs are moved to this device before generation.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

processor = AutoProcessor.from_pretrained(model_id)
# "auto" leaves dtype selection and device placement to the library.
load_kwargs = {"torch_dtype": "auto", "device_map": "auto"}
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id, **load_kwargs)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


Loading checkpoint shards:   0%|          | 0/18 [00:00<?, ?it/s]

In [3]:
# ---------- 辅助工具 ----------
def is_pure_black(frame: np.ndarray, thr: int = 1) -> bool:
    """Return True when a single frame is (nearly) pure black.

    Args:
        frame: Pixel array of one frame.
        thr: Exclusive upper bound on the brightest value; a larger value is
            more permissive.

    Returns:
        A plain Python ``bool`` (not ``numpy.bool_``), True when every value in
        ``frame`` is below ``thr``. A zero-size frame has no bright pixel and
        is treated as black.
    """
    if frame.size == 0:
        # np.max raises ValueError on an empty array.
        return True
    return bool(np.max(frame) < thr)

def sample_frames(video_path: Path, num_frames: int = 8):
    """Decode every frame of a video into a list of RGB PIL images.

    Args:
        video_path: Video file to decode.
        num_frames: Currently unused and kept only for call compatibility;
            no subsampling is performed and all readable frames are returned.

    Returns:
        PIL images in decode order; an empty list when no frame can be read.
    """
    cap = cv2.VideoCapture(str(video_path))
    frames = []
    try:
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            # Frames are converted from BGR to RGB channel order for PIL.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(frame_rgb))
    finally:
        # Release the capture even if conversion fails part-way through.
        cap.release()
    return frames

def describe_video(
    video_path: Path,
    temperature: float = 0.1,
    fps: float = 24.0,
    run_model: bool = False,
):
    """Label a masked video, optionally describing its content with the model.

    Previously both branches of the black-frame check returned, which made the
    model-inference code below unreachable. The default (``run_model=False``)
    keeps that existing output; pass ``run_model=True`` to run inference.

    Args:
        video_path: Masked video to label.
        temperature: Passed to ``model.generate``; only used when
            ``run_model`` is True.
        fps: Frame rate given to the video message entry; only used when
            ``run_model`` is True.
        run_model: When False, return only the coarse label. When True, a
            video with visible content is sent to the model and its text
            answer is returned.

    Returns:
        "没有异常" when every frame is pure black, "有异常" when content is
        visible and ``run_model`` is False, otherwise the decoded model answer.
    """
    frames = sample_frames(video_path)
    # Fast path: skip inference entirely when nothing is visible.
    # NOTE: a video that yields no frames also lands here, because all() of
    # an empty sequence is True.
    if all(is_pure_black(np.array(f)) for f in frames):
        return "没有异常"
    if not run_model:
        return "有异常"

    # Build the chat messages: one video entry plus the text question.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": str(video_path),
                    "fps": fps,
                },
                {
                    "type": "text",
                    "text": "非黑色区域中有什么？简洁回答目标名称（如：一个骑自行车的人，一辆汽车，一群人在打闹...）",
                },
            ],
        }
    ]

    # Render the prompt text through the chat template.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Prepare the vision and text inputs for the model.
    image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    )
    inputs = inputs.to(device)

    # Inference.
    generated_ids = model.generate(**inputs, max_new_tokens=512, temperature=temperature)

    # Drop the prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    # Decode the answer text.
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0].strip()

    return output_text


In [4]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

masked_dir = Path("./dataset/masked")
save_csv = Path("dataset/labels.csv")

# Sorted so the CSV row order is the same on every run (glob order is not
# guaranteed).
video_paths = sorted(masked_dir.glob("*.mp4"))
results = []

# Iterating the list directly lets tqdm read its length and show a real
# progress bar with total and ETA (wrapping enumerate() hides the length).
for vp in tqdm(video_paths):
    video_name = vp.name
    try:
        desc = describe_video(vp)
    except Exception as e:
        # Best effort: record the failure in the table and keep going.
        desc = f"❌{e}"

    # Append to the results.
    results.append({"video": video_name, "description": desc})

    # Rewrite the CSV after every video so an interruption loses no progress.
    df = pd.DataFrame(results)
    df.to_csv(save_csv, index=False, encoding="utf-8-sig")

# Final preview.
df = pd.DataFrame(results)
print(df.head())


3283it [02:25, 22.63it/s]

                 video description
0  01_0014-000_023.mp4        没有异常
1  01_0014-012_035.mp4        没有异常
2  01_0014-024_047.mp4        没有异常
3  01_0014-036_059.mp4        没有异常
4  01_0014-048_071.mp4        没有异常



