# 函式庫


In [None]:
%%writefile requirements.txt
# NOTE(review): both opencv-python-headless and opencv-python are listed below.
# They appear to ship the same `cv2` module, so installing both may conflict;
# confirm whether only one of them should be kept.
# NOTE(review): most entries are unpinned, so a fresh install may resolve to
# different versions than the ones this notebook was last run with.
opencc-python-reimplemented
opencv-python-headless
opencv-python
torch>=2.0.0
torchvision
transformers>=4.38.0
accelerate>=0.24.1
Pillow>=10.0.0
tqdm
ffmpeg-python
sentencepiece
sacremoses
numpy
einops
requests
pycocotools
timm
matplotlib
pandas
huggingface-hub
supervision
groundingdino
omegaconf


Overwriting requirements.txt


In [None]:
!pip install -q -r requirements.txt
print("函式庫安裝完成。")


函式庫安裝完成。


In [None]:
from pathlib import Path
# Absolute data locations under the Colab working directory.
PROJECT_ROOT = Path('/content')
DATA_DIR = PROJECT_ROOT / 'data'
VIDEO_DIR = DATA_DIR / 'videos'
FRAME_DIR = DATA_DIR / 'frames'
# These two are relative paths, so they resolve against the current working directory.
OUTPUT_DIR = Path('outputs')
MODELS_DIR = Path('models')

# Create every directory the later cells write into (no-op when already present).
required_dirs = (DATA_DIR, VIDEO_DIR, FRAME_DIR, OUTPUT_DIR, MODELS_DIR, Path('scripts'))
for required in required_dirs:
    required.mkdir(parents=True, exist_ok=True)
print('目錄準備完成。')


# 安裝並新增資料夾


In [None]:
print('資料夾初始化完成。')


In [None]:
import requests
from pathlib import Path
from tqdm import tqdm

GROUNDING_DINO_CONFIG_URL = 'https://raw.githubusercontent.com/IDEA-Research/GroundingDINO/main/groundingdino/config/GroundingDINO_SwinT_OGC.py'
GROUNDING_DINO_WEIGHTS_URL = 'https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0/groundingdino_swint_ogc.pth'

config_path = Path('models/GroundingDINO_SwinT_OGC.py')
weights_path = Path('models/groundingdino_swint_ogc.pth')

def download_file(url: str, destination: Path, timeout: float = 30.0) -> None:
    """Download ``url`` to ``destination`` unless the file already exists.

    The payload is streamed into a sibling ``<name>.part`` file and only
    renamed to ``destination`` after the transfer finished, so an interrupted
    download never leaves a truncated file that a later run would mistake for
    a complete one.

    Args:
        url: HTTP(S) address of the file.
        destination: Final path of the downloaded file.
        timeout: Seconds to wait for the connection / for each read before
            ``requests`` raises.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Exception: Any network or I/O error raised while streaming is
            propagated after the partial file has been removed.
    """
    if destination.exists():
        print(f'{destination.name} 已存在，略過下載。')
        return

    destination.parent.mkdir(parents=True, exist_ok=True)
    partial = destination.with_name(destination.name + '.part')
    try:
        # The response is used as a context manager so the connection is
        # released even when streaming fails midway.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            # A missing content-length yields None so the bar shows an open-ended count.
            total = int(response.headers.get('content-length', 0)) or None
            with partial.open('wb') as f, tqdm(
                total=total, unit='B', unit_scale=True, desc=destination.name
            ) as progress:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
                        progress.update(len(chunk))
        # Publish the file only once it is complete.
        partial.replace(destination)
    finally:
        # After a successful rename the partial file no longer exists; after a
        # failure this removes the truncated leftovers.
        if partial.exists():
            partial.unlink()
    print(f'已下載 {destination.name}')

download_file(GROUNDING_DINO_CONFIG_URL, config_path)
download_file(GROUNDING_DINO_WEIGHTS_URL, weights_path)


MediaPipe 姿態偵測模型已存在，跳過下載。


In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 影片上傳


In [None]:
from google.colab import files
import shutil
from pathlib import Path

# Open the Colab upload dialog; the result maps each uploaded file name to its content.
uploaded = files.upload()
video_paths = []
for fname in uploaded:
    # Uploaded files land in the working directory; move each into data/videos.
    target_dir = Path('data/videos')
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / fname
    shutil.move(fname, target)
    video_paths.append(str(target))
print('影片已上傳到:', video_paths)


Saving Fall backward while walking and turning1_0.mp4 to Fall backward while walking and turning1_0.mp4
Saving Fall backward while walking and turning1_1.mp4 to Fall backward while walking and turning1_1.mp4
Saving Fall backward while walking and turning1_2.mp4 to Fall backward while walking and turning1_2.mp4
Saving Fall backward while walking and turning1_3.mp4 to Fall backward while walking and turning1_3.mp4
Saving Fall backward while walking and turning1_4.mp4 to Fall backward while walking and turning1_4.mp4
 影片已放到： ['data/videos/Fall backward while walking and turning1_0.mp4', 'data/videos/Fall backward while walking and turning1_1.mp4', 'data/videos/Fall backward while walking and turning1_2.mp4', 'data/videos/Fall backward while walking and turning1_3.mp4', 'data/videos/Fall backward while walking and turning1_4.mp4']


# huggingface


In [None]:
# hugging face登錄
from huggingface_hub import login
from google.colab import userdata
login(userdata.get('HF_TOKEN'))

# extract_frames


In [None]:
%%writefile scripts/extract_frames.py
import os
from typing import List, Tuple
import cv2

def extract_frames(video_path: str, output_dir: str, target_fps: float = 1.0) -> Tuple[List[Tuple[str, float]], float]:
    """Sample frames from a video at roughly ``target_fps`` and save them as JPEGs.

    Args:
        video_path: Path of the video to read.
        output_dir: Directory that receives ``frame_<index>.jpg`` files; it is
            created when missing.
        target_fps: Desired number of saved frames per second of video.

    Returns:
        ``(saved, native_fps)`` where ``saved`` is a list of
        ``(frame_path, timestamp_seconds)`` pairs in playback order and
        ``native_fps`` is the frame rate used for the timestamps.

    Raises:
        RuntimeError: If the video cannot be opened or a frame cannot be
            written to disk.
    """
    os.makedirs(output_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    saved: List[Tuple[str, float]] = []
    try:
        if not cap.isOpened():
            raise RuntimeError(f'無法開啟影片: {video_path}')

        native_fps = cap.get(cv2.CAP_PROP_FPS)
        # ``not (x > 0)`` also catches NaN and negative values, not just 0.
        if not (native_fps > 0):
            native_fps = 30.0

        # Keep every ``step``-th frame; never less than 1 so the modulo is valid.
        step = max(int(round(native_fps / max(target_fps, 1e-3))), 1)
        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_idx % step == 0:
                frame_path = os.path.join(output_dir, f'frame_{frame_idx:06d}.jpg')
                # imwrite reports failure through its return value; a frame
                # that was not written must not be handed to later stages.
                if not cv2.imwrite(frame_path, frame):
                    raise RuntimeError(f'無法寫入影格: {frame_path}')
                saved.append((frame_path, frame_idx / native_fps))
            frame_idx += 1
    finally:
        # Release the capture on every exit path, including errors.
        cap.release()
    return saved, native_fps


Writing scripts/extract_frames.py


# detect_objects

In [None]:
%%writefile scripts/object_detector.py
from __future__ import annotations
from typing import List, Dict, Tuple, Optional

import torch
from PIL import Image

from groundingdino.util.inference import Model

DEFAULT_PROMPT = ('person, bed, pillow, blanket, nightstand, lamp, stove, refrigerator, sink, cabinet, microwave, pan, '
                  'pot, knife, fork, spoon, cup, bowl, toilet, shower, bathtub, towel, mirror, sofa, chair, table, '
                  'television, remote, plant, window, door, carpet')

_model: Optional[Model] = None

def setup_grounding_dino(config_path: str, weights_path: str, device: Optional[str] = None) -> Model:
    """Load the Grounding DINO model once and return the shared instance.

    The model is cached in the module-level ``_model``; later calls return
    that cached instance and ignore their arguments.

    Args:
        config_path: Path of the Grounding DINO config file.
        weights_path: Path of the checkpoint file.
        device: Device string; when falsy, 'cuda' is used if available, else 'cpu'.

    Returns:
        The cached ``Model`` instance.
    """
    global _model
    if _model is None:
        if not device:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        _model = Model(
            model_config_path=config_path,
            model_checkpoint_path=weights_path,
            device=device,
        )
    return _model

def detect_objects(
    image_path: str,
    text_prompt: str = DEFAULT_PROMPT,
    box_threshold: float = 0.35,
    text_threshold: float = 0.25,
    top_k: int = 15,
) -> Tuple[List[Dict], Optional[Image.Image]]:
    """Run Grounding DINO on one image and return its strongest detections.

    Args:
        image_path: Path of the image file to analyse.
        text_prompt: Caption listing the object names to look for.
        box_threshold: Minimum box confidence forwarded to Grounding DINO.
        text_threshold: Minimum phrase confidence forwarded to Grounding DINO.
        top_k: Maximum number of detections to keep (highest scores first).

    Returns:
        ``(detections, annotated)``. ``detections`` is sorted by descending
        score; each entry holds ``'label'`` (lower-cased phrase), ``'score'``
        and ``'bbox_xyxy'`` (pixel corner coordinates). ``annotated`` is an RGB
        PIL image with exactly those detections drawn, or ``None`` when
        nothing was kept.

    Raises:
        RuntimeError: If ``setup_grounding_dino()`` has not been called yet.
    """
    if _model is None:
        raise RuntimeError('請先呼叫 setup_grounding_dino() 載入模型。')

    # The ``Model`` wrapper has no load_image()/annotate() methods and its
    # predict_with_caption() returns (detections, phrases). The
    # boxes/logits/phrases workflow is provided by these module-level helpers,
    # imported locally because the file only imports ``Model`` at the top.
    from groundingdino.util.inference import annotate, load_image, predict

    image_source, image = load_image(image_path)
    boxes, logits, phrases = predict(
        model=_model.model,
        image=image,
        caption=text_prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        device=_model.device,
    )

    if boxes.size(0) == 0:
        return [], None

    # Rank once and reuse the same selection for both the returned records
    # and the drawn boxes, so the annotation matches the detections.
    keep = torch.argsort(logits, descending=True)[:top_k]
    boxes = boxes[keep]
    logits = logits[keep]
    phrases = [phrases[i] for i in keep.tolist()]
    if not phrases:
        return [], None

    # predict() yields normalised (cx, cy, w, h) boxes; convert them to pixel
    # corner coordinates so the 'bbox_xyxy' key is accurate.
    height, width = image_source.shape[:2]
    detections: List[Dict] = []
    for box, logit, phrase in zip(boxes, logits, phrases):
        cx, cy, w, h = box.tolist()
        detections.append(
            {
                'label': phrase.lower(),
                # The returned logits are already sigmoid confidences.
                'score': float(logit),
                'bbox_xyxy': [
                    (cx - w / 2) * width,
                    (cy - h / 2) * height,
                    (cx + w / 2) * width,
                    (cy + h / 2) * height,
                ],
            }
        )

    annotated_bgr = annotate(
        image_source=image_source,
        boxes=boxes,
        logits=logits,
        phrases=phrases,
    )
    # annotate() returns a BGR array; flip the channel axis before handing it to PIL.
    annotated = Image.fromarray(annotated_bgr[:, :, ::-1].copy())

    return detections, annotated

def summarize_detections(detections: List[Dict], top_k: int = 5) -> str:
    """Build a one-line ``label(score)`` summary of the first ``top_k`` detections.

    Args:
        detections: Detection dicts with ``'label'`` and ``'score'`` keys.
        top_k: Number of leading detections to include.

    Returns:
        A comma-separated summary, or a fixed notice when ``detections`` is empty.
    """
    if detections:
        parts = []
        for det in detections[:top_k]:
            parts.append('{}({:.2f})'.format(det['label'], det['score']))
        return ', '.join(parts)
    return '未偵測到重點物件'


Writing scripts/object_detector.py


# 初始化

In [None]:
from scripts.object_detector import setup_grounding_dino, DEFAULT_PROMPT
GROUNDING_DINO_CONFIG = 'models/GroundingDINO_SwinT_OGC.py'
GROUNDING_DINO_WEIGHTS = 'models/groundingdino_swint_ogc.pth'
setup_grounding_dino(GROUNDING_DINO_CONFIG, GROUNDING_DINO_WEIGHTS)
print('Grounding DINO 模型已載入。')


已將 './scripts' 加入 Python 搜尋路徑。
導入核心模組失敗: No module named 'model_loader'
請確認 'scripts/' 資料夾中存在所有必要的 .py 檔案。

模型路徑和其他常數設定完成。
BLIP2 基礎模型將從 'models/blip2_model' 載入。
LoRA 微調權重將從 'models/loha_blip2_weights.pt' 載入。
將使用的 event_timeline.csv 路徑: '/content/drive/MyDrive/Colab_Video_Frames_Output/event_timeline.csv'
PoseEstimator 初始化失敗: name 'PoseEstimator' is not defined
請檢查姿態模型檔案是否存在和完整性。
ObjectDetector 初始化失敗: name 'ObjectDetector' is not defined
請檢查 Grounding DINO 和 SAM 模型下載/載入情況，以及相關依賴。


In [None]:
CUSTOM_GROUNDING_PROMPT = DEFAULT_PROMPT
print('Grounding DINO 提示詞已設定，可依需求自行調整 CUSTOM_GROUNDING_PROMPT。')


# 物件偵測與敘述整合


In [None]:
# 舊的影像分割流程已移除，改由 Grounding DINO 產生標註。


Writing scripts/segment_objects.py


# 文字敘述生成


In [None]:
# 已移除姿態與風險偵測模組，改於敘述階段整合物件資訊。


Writing scripts/analyze_pose.py


In [None]:
%%writefile scripts/postcheck.py
# -*- coding: utf-8 -*-
"""
Post-processing helper

fix_caption() 可以用來：
  • 去重字詞 / 修正標點
  • 合併相似字幕
  • 依需求再擴充

先放最小版本──僅原樣返回，保證主程式能跑通。
"""

def fix_caption(caption: str) -> str:
    """Return ``caption`` unchanged; placeholder hook for caption post-processing."""
    # TODO: implement the real post-processing logic here later.
    return caption


Writing scripts/postcheck.py


# fall demo(暫時沒用到)


# utils

In [None]:
%%writefile scripts/utils.py
from __future__ import annotations
from pathlib import Path
from typing import List, Dict

def ensure_dir(path: Path | str) -> Path:
    p = Path(path)
    p.mkdir(parents=True, exist_ok=True)
    return p

def save_txt(lines: List[str], path: Path | str) -> None:
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('w', encoding='utf-8') as f:
        f.write('\n'.join(lines))

def format_timestamp(seconds: float) -> str:
    """Format a duration in seconds as ``HH:MM:SS.cc``.

    The value is rounded to whole centiseconds before it is split into
    fields, so a value such as 59.999 carries over into the minutes field
    instead of being rendered with a seconds field of ``60.00``. Negative
    inputs are clamped to zero.

    Args:
        seconds: Duration in seconds.

    Returns:
        The formatted timestamp string.
    """
    total_cs = int(round(max(seconds, 0.0) * 100))
    hours, remainder = divmod(total_cs, 360000)
    minutes, remainder = divmod(remainder, 6000)
    secs, centis = divmod(remainder, 100)
    return f'{hours:02d}:{minutes:02d}:{secs:02d}.{centis:02d}'

def format_detection_brief(timestamp: float, detections: List[Dict]) -> str:
    """Render one ``[timestamp] label:score, ...`` line for a frame.

    Only the first eight detections are listed; an empty list produces a
    fixed "nothing detected" notice after the timestamp.
    """
    prefix = f'[{format_timestamp(timestamp)}] '
    if not detections:
        return prefix + '未偵測到關鍵物件'
    entries = []
    for det in detections[:8]:
        entries.append(f"{det['label']}:{det['score']:.2f}")
    return prefix + ', '.join(entries)


Writing scripts/utils.py


In [None]:
# 危險物品偵測已移除，保留檔案供後續擴充時使用。


Writing scripts/danger_utils.py


# fall util


In [None]:
# 跌倒與行為風險偵測將另行實作，此處不再載入相關模組。


Writing fall_utils.py


# generate

In [None]:
%%writefile scripts/generate_caption.py
from __future__ import annotations
from typing import List, Dict, Optional

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
from opencc import OpenCC

# Use the GPU for BLIP when one is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# The BLIP processor and model are created at import time, so importing this
# module is what triggers loading the checkpoint.
PROCESSOR = BlipProcessor.from_pretrained('Salesforce/blip-image-captioning-large')
MODEL = BlipForConditionalGeneration.from_pretrained('Salesforce/blip-image-captioning-large').to(DEVICE)
# Translation pipeline applied to the English BLIP captions.
# NOTE(review): no device argument is passed here, so placement is left to the
# pipeline's default — confirm it ends up on the intended device.
TRANSLATOR = pipeline('translation', model='Helsinki-NLP/opus-mt-en-zh')
# OpenCC converter run on the translator output ('s2t' profile; presumably
# Simplified -> Traditional Chinese).
CONVERTER = OpenCC('s2t')

OBJECT_TRANSLATIONS = {
    'person': '人物',
    'bed': '床',
    'pillow': '枕頭',
    'blanket': '棉被',
    'nightstand': '床頭櫃',
    'lamp': '檯燈',
    'stove': '爐子',
    'refrigerator': '冰箱',
    'sink': '水槽',
    'cabinet': '櫥櫃',
    'microwave': '微波爐',
    'pan': '平底鍋',
    'pot': '湯鍋',
    'knife': '刀具',
    'fork': '叉子',
    'spoon': '湯匙',
    'cup': '杯子',
    'bowl': '碗',
    'toilet': '馬桶',
    'shower': '淋浴設備',
    'bathtub': '浴缸',
    'towel': '毛巾',
    'mirror': '鏡子',
    'sofa': '沙發',
    'couch': '沙發',
    'chair': '椅子',
    'table': '桌子',
    'television': '電視',
    'tv': '電視',
    'remote': '遙控器',
    'plant': '植物',
    'window': '窗戶',
    'door': '門',
    'carpet': '地毯',
    'oven': '烤箱',
    'coffee table': '茶几',
    'washing machine': '洗衣機',
    'dryer': '烘衣機',
    'shelf': '架子',
    'book': '書本',
}

ROOM_KEYWORDS = {
    '臥室': {'bed', 'pillow', 'blanket', 'nightstand'},
    '廚房': {'stove', 'oven', 'refrigerator', 'microwave', 'pan', 'pot', 'sink', 'cup', 'bowl'},
    '廁所': {'toilet', 'shower', 'bathtub', 'towel', 'sink'},
    '客廳': {'sofa', 'couch', 'television', 'tv', 'remote', 'coffee table', 'plant'},
}

@torch.no_grad()
def _generate_raw_caption(
    image_path: str,
    prompt: str = 'Provide a highly detailed description of the scene.',
    strip_prompt: bool = True,
) -> str:
    """Generate an English BLIP caption for the image at ``image_path``.

    BLIP conditional captioning continues the text prompt, so the decoded
    sequence begins with the prompt itself. With ``strip_prompt`` enabled that
    echoed prefix is removed, so the prompt text does not end up in the
    caption (and in its translation).

    Args:
        image_path: Path of the image file.
        prompt: Text prefix fed to BLIP together with the image.
        strip_prompt: Remove the echoed prompt from the start of the result.
            Pass ``False`` when the prompt is meant to be part of the caption
            (e.g. ``'a photo of'``).

    Returns:
        The decoded caption with surrounding whitespace removed.
    """
    # Open inside a context manager so the file handle is closed right after
    # the pixels have been copied into the RGB image.
    with Image.open(image_path) as handle:
        image = handle.convert('RGB')

    inputs = PROCESSOR(images=image, text=prompt, return_tensors='pt').to(DEVICE)
    output = MODEL.generate(**inputs, max_length=80, num_beams=5, no_repeat_ngram_size=2)
    caption = PROCESSOR.decode(output[0], skip_special_tokens=True).strip()

    if strip_prompt:
        # Compare case-insensitively and without trailing punctuation, since
        # the decoded text may differ from the prompt in case and spacing.
        echoed = prompt.strip().lower().rstrip('.!? ')
        if echoed and caption.lower().startswith(echoed):
            remainder = caption[len(echoed):].lstrip(' .,:;!?-')
            # Keep the original text when nothing but the prompt was generated.
            if remainder:
                caption = remainder
    return caption

def _translate_to_traditional(text: str) -> str:
    result = TRANSLATOR(text)[0]['translation_text']
    return CONVERTER.convert(result)

def infer_room_from_detections(detections: List[Dict]) -> Optional[str]:
    """Guess the room from detected labels by picking the best-matching cue set.

    Every room in ``ROOM_KEYWORDS`` is scored by how many of its cue labels
    occur among the detections, and the room with the most matches wins. This
    prevents a cue shared between rooms (for example ``'sink'``) from deciding
    the result merely because its room is listed first. Ties are resolved in
    ``ROOM_KEYWORDS`` order.

    Args:
        detections: Detection dicts with a ``'label'`` key.

    Returns:
        The name of the best-matching room, or ``None`` when no cue matches.
    """
    labels = {d['label'].strip().lower() for d in detections}
    best_room: Optional[str] = None
    best_hits = 0
    for room, cues in ROOM_KEYWORDS.items():
        hits = len(labels & cues)
        # Strict comparison keeps the earliest room on a tie.
        if hits > best_hits:
            best_room, best_hits = room, hits
    return best_room

def augment_with_knowledge_graph(caption: str, detections: List[Dict], knowledge_items: Optional[List[str]] = None) -> str:
    """Append knowledge-graph notes to a caption (integration hook, e.g. for KBERT output).

    ``detections`` is accepted but not used by the current implementation.

    Args:
        caption: Caption text to extend.
        detections: Detection dicts for the same frame.
        knowledge_items: Extra statements to append; when empty or ``None``
            the caption is returned unchanged.

    Returns:
        The caption, followed by the joined knowledge items when any are given.
    """
    if knowledge_items:
        joined = '；'.join(knowledge_items)
        return f"{caption} 知識圖譜補充：{joined}"
    return caption

def generate_detailed_caption(
    image_path: str,
    detections: List[Dict],
    room_hint: Optional[str] = None,
    knowledge_items: Optional[List[str]] = None,
) -> str:
    """Compose a multi-sentence caption for one frame.

    The result consists of the translated BLIP caption, an optional sentence
    listing up to five distinct detected objects with their scores, and an
    optional sentence naming the inferred room. The joined text is finally
    passed through ``augment_with_knowledge_graph``.

    Args:
        image_path: Path of the frame image.
        detections: Detection dicts with ``'label'`` and ``'score'`` keys.
        room_hint: Room name to mention, if any.
        knowledge_items: Extra statements forwarded to the knowledge-graph hook.

    Returns:
        The assembled caption text.
    """
    english = _generate_raw_caption(image_path)
    lead = _translate_to_traditional(english).strip()
    if not lead.endswith('。'):
        lead = f"{lead}。"
    parts = [lead]

    # Keyed by lower-cased label so repeated labels are described only once;
    # dict insertion order preserves the order of first appearance.
    described = {}
    for det in detections or []:
        key = det['label'].lower()
        if key in described:
            continue
        name = OBJECT_TRANSLATIONS.get(key, det['label'])
        described[key] = f"{name}（信心 {det['score']:.2f}）"
        if len(described) == 5:
            break
    if described:
        parts.append('畫面中的重要物件：' + '、'.join(described.values()) + '。')

    if room_hint:
        parts.append(f"依據物件線索推測場景位置：{room_hint}。")

    return augment_with_knowledge_graph(' '.join(parts), detections, knowledge_items)

__all__ = [
    'generate_detailed_caption',
    'infer_room_from_detections',
    'augment_with_knowledge_graph',
]






Writing scripts/generate_caption.py


In [None]:
from scripts.generate_caption import generate_detailed_caption, infer_room_from_detections
print('BLIP 模型已準備，使用 generate_detailed_caption 產生多句敘述。')


BLIP 模型運行設備: cuda


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


未設定微調模型路徑、路徑無效或 'peft' 庫未加載。加載原始 BLIP 基礎模型。


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/312M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


# 微調


In [None]:
%%writefile scripts/train_caption_model.py
def fine_tune_blip_captioning(*args, **kwargs):
    """Placeholder for BLIP caption fine-tuning.

    Accepts any arguments, ignores them, and only prints a notice that the
    routine is not implemented; no training is run and nothing is saved.
    """
    print("[Stub] fine_tune_blip_captioning() 被呼叫，但目前尚未實作。")


Writing scripts/train_caption_model.py


In [None]:
from scripts.train_caption_model import fine_tune_blip_captioning

# Configure the training-data locations and the model output location.
# TRAIN_IMAGE_BASE_PATH should point at the directory that holds the extracted video frames.
TRAIN_IMAGE_BASE_PATH = "/content/drive/MyDrive/Colab_Subtitles_Output/frames/my_test_video1/"
# TRAIN_CAPTION_CSV_PATH should point at the train_captions.csv file.
TRAIN_CAPTION_CSV_PATH = "/content/drive/MyDrive/Colab_Subtitles_Output/train_captions.csv"
# FINE_TUNED_MODEL_OUTPUT_DIR is where the fine-tuned model (LoRA adapter) is meant to be saved.
# NOTE(review): scripts/generate_caption.py as written in this notebook loads only the
# base 'Salesforce/blip-image-captioning-large' checkpoint and does not read this directory.
FINE_TUNED_MODEL_OUTPUT_DIR = "/content/drive/MyDrive/Colab_FineTuned_Model"

# NOTE(review): fine_tune_blip_captioning is currently a stub that only prints a notice,
# so the completion message printed at the end does not reflect an actual saved model.
print("開始執行模型微調，這可能需要一些時間（根據數據量和 num_train_epochs 設定）。")
fine_tune_blip_captioning(
    train_data_path=TRAIN_CAPTION_CSV_PATH,
    image_base_path=TRAIN_IMAGE_BASE_PATH,
    model_output_dir=FINE_TUNED_MODEL_OUTPUT_DIR,
    num_train_epochs=10,       # number of training epochs; start around 3-5 and observe the results
    per_device_train_batch_size=4, # batch size; adjust to the available GPU memory (4-8 is typical on Colab)
    learning_rate=1e-4        # learning rate; fine-tuning usually uses a small value (e.g. 5e-5 or 1e-4)
)
print("模型微調執行完成！微調後的模型已保存到 Google Drive。")

開始執行模型微調，這可能需要一些時間（根據數據量和 num_train_epochs 設定）。
[Stub] fine_tune_blip_captioning() 被呼叫，但目前尚未實作。
模型微調執行完成！微調後的模型已保存到 Google Drive。


# clip


In [None]:
%%writefile scripts/clip_eval.py
# scripts/clip_eval.py
import torch, numpy as np
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

_device = "cuda" if torch.cuda.is_available() else "cpu"
_model  = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(_device)
_proc   = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def score(image: Image.Image, caption: str) -> float:
    """Return the CLIP image/text cosine similarity multiplied by 100.

    Cosine similarity lies in [-1, 1], so the result lies in [-100, 100].
    The caption is truncated to at most 77 tokens before encoding.
    """
    batch = _proc(
        text=[caption],
        images=image,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77,
    )
    batch = batch.to(_device)

    with torch.no_grad():
        result = _model(**batch)

    similarity = torch.cosine_similarity(result.image_embeds, result.text_embeds)
    return similarity.item() * 100


Writing scripts/clip_eval.py


In [None]:
%%writefile scripts/clip_caption.py
# scripts/clip_caption.py
from typing import List, Dict
import cv2

def run_clip_caption(video_path: str) -> List[Dict]:
    """Split a video into consecutive 5-second segments with placeholder captions.

    The duration is derived from the frame count and frame rate reported for
    the file (the frame rate falls back to 30 when reported as 0). Each
    segment is a dict with ``start``, ``end`` and a fixed dummy ``text``.
    """
    capture = cv2.VideoCapture(video_path)
    fps = capture.get(cv2.CAP_PROP_FPS) or 30
    frame_total = capture.get(cv2.CAP_PROP_FRAME_COUNT) or 0
    capture.release()

    duration = frame_total / fps
    segment_length = 5.0
    segments = []
    start = 0.0
    while start < duration:
        stop = start + segment_length
        # The final segment is clipped to the end of the video.
        end = stop if stop < duration else duration
        segments.append({"start": start, "end": end, "text": "dummy caption"})
        start = stop
    return segments


Writing scripts/clip_caption.py


# main


In [None]:
%%writefile main.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Video pipeline: Grounding DINO 物件偵測 + BLIP 敘述生成，輸出文字字幕與偵測標註。"""

import argparse
from pathlib import Path
from typing import Iterable

from tqdm import tqdm

VIDEO_EXTS = {'.mp4', '.mov', '.mkv', '.avi'}

from scripts.extract_frames import extract_frames
from scripts.object_detector import setup_grounding_dino, detect_objects, summarize_detections, DEFAULT_PROMPT
from scripts.generate_caption import generate_detailed_caption, infer_room_from_detections
from scripts.utils import ensure_dir, save_txt, format_detection_brief, format_timestamp


def iter_videos(targets: Iterable[Path]) -> Iterable[Path]:
    """Yield every supported video file referenced by ``targets``.

    Each target may be a single video file or a directory that is searched
    recursively in sorted order. Only regular files whose suffix is in
    ``VIDEO_EXTS`` are yielded, and a file reachable through several targets
    (for example via both ``--video`` and ``--dir``) is yielded only once.
    Targets that do not exist or are unsupported files are reported and skipped.

    Args:
        targets: Paths of video files and/or directories.

    Yields:
        Paths of the video files to process.
    """
    seen = set()

    def _is_new_video(path: Path) -> bool:
        # Accept only real files with a supported suffix that were not yielded before.
        if path.suffix.lower() not in VIDEO_EXTS or not path.is_file():
            return False
        key = path.resolve()
        if key in seen:
            return False
        seen.add(key)
        return True

    for target in targets:
        target = Path(target)
        if target.is_dir():
            for path in sorted(target.rglob('*')):
                if _is_new_video(path):
                    yield path
        elif target.is_file():
            if target.suffix.lower() not in VIDEO_EXTS:
                print(f'不支援的影片格式，已略過：{target}')
            elif _is_new_video(target):
                yield target
        else:
            print(f'找不到路徑，已略過：{target}')


def process_video(
    video_path: Path,
    output_root: Path,
    fps_sample: float,
    text_prompt: str,
) -> Path:
    """Run frame extraction, object detection and captioning for one video.

    Outputs are written below ``output_root/<video stem>/``: sampled frames in
    ``frames/``, annotated images in ``detections/``, a caption file
    ``<stem>.txt`` and a detection summary ``<stem>_detections.txt``.

    Args:
        video_path: Video file to process.
        output_root: Root directory for all outputs.
        fps_sample: Number of frames to sample per second of video.
        text_prompt: Grounding DINO text prompt.

    Returns:
        Path of the written caption file.

    Raises:
        RuntimeError: If no frame could be extracted from the video.
    """
    source = Path(video_path)
    stem = source.stem
    video_output = ensure_dir(output_root / stem)
    frames_dir = ensure_dir(video_output / 'frames')
    detections_dir = ensure_dir(video_output / 'detections')

    frame_entries, native_fps = extract_frames(str(source), str(frames_dir), target_fps=fps_sample)
    if not frame_entries:
        raise RuntimeError(f'影片 {source} 未擷取到任何影格，請確認檔案是否為有效影片。')

    subtitle_lines = []
    detection_lines = []
    previews_left = 3  # only the first three frames print a detection preview

    for frame_path, timestamp in tqdm(frame_entries, desc=f'推論 {source.name}'):
        frame_file = Path(frame_path)
        detections, annotated = detect_objects(frame_path, text_prompt=text_prompt)
        if annotated is not None:
            annotated.save(detections_dir / f'{frame_file.stem}_det.jpg')

        caption = generate_detailed_caption(
            frame_path,
            detections,
            room_hint=infer_room_from_detections(detections),
        )
        subtitle_lines.append(f'[{format_timestamp(timestamp)}] {caption}')
        detection_lines.append(format_detection_brief(timestamp, detections))

        if previews_left > 0:
            print(f"偵測預覽 {frame_file.name}: {summarize_detections(detections)}")
            previews_left -= 1

    subs_path = video_output / f'{stem}.txt'
    det_path = video_output / f'{stem}_detections.txt'
    save_txt(subtitle_lines, subs_path)
    save_txt(detection_lines, det_path)

    print(f'原始影格 FPS：約 {native_fps:.2f}')
    print(f'字幕輸出：{subs_path}')
    print(f'偵測摘要輸出：{det_path}')
    print(f'標註影像存放於：{detections_dir}')
    return subs_path


def main() -> None:
    """Command-line entry point: caption every video given via --video / --dir.

    Parses the arguments, collects the target videos, loads Grounding DINO
    once, and runs ``process_video`` on each video. When no supported video is
    found, a notice is printed and the model is not loaded.
    """
    parser = argparse.ArgumentParser(description='Grounding DINO + BLIP 影片敘述流水線')
    parser.add_argument('--video', help='指定單一影片路徑')
    parser.add_argument('--dir', help='處理資料夾下的所有影片')
    parser.add_argument('--fps', type=float, default=1.0, help='取樣頻率(每秒擷取影格數)')
    parser.add_argument('--output-dir', default='outputs', help='輸出根目錄')
    parser.add_argument('--prompt', default=DEFAULT_PROMPT, help='Grounding DINO 文字提示')
    parser.add_argument('--config', default='models/GroundingDINO_SwinT_OGC.py', help='Grounding DINO 設定檔路徑')
    parser.add_argument('--weights', default='models/groundingdino_swint_ogc.pth', help='Grounding DINO 權重檔路徑')
    args = parser.parse_args()

    if not args.video and not args.dir:
        parser.error('請至少指定 --video 或 --dir 其中之一。')

    targets = []
    if args.video:
        targets.append(Path(args.video))
    if args.dir:
        targets.append(Path(args.dir))

    # Resolve the work list first so an empty selection is reported right away
    # instead of loading the detection model for nothing.
    videos = list(iter_videos(targets))
    if not videos:
        print('未找到任何可處理的影片，請確認 --video / --dir 指定的路徑。')
        return

    setup_grounding_dino(args.config, args.weights)
    output_root = ensure_dir(args.output_dir)

    for video in videos:
        # The newline must be written as an escape sequence; a raw line break
        # inside a single-quoted string literal is a syntax error.
        print(f'\n=== 處理影片：{video} ===')
        process_video(video, output_root, fps_sample=args.fps, text_prompt=args.prompt)


if __name__ == '__main__':
    main()




Writing main.py


# 執行流程


In [None]:
from google.colab import drive
import os, glob, shutil

# Mount Google Drive and make sure the destination folder for the text outputs exists.
drive.mount('/content/drive', force_remount=True)
OUTPUT_BASE = '/content/drive/MyDrive/video_captions'
os.makedirs(OUTPUT_BASE, exist_ok=True)

# Run the pipeline on every video under data/videos; the relative paths are
# resolved against the notebook's current working directory.
!python /content/main.py --dir data/videos --fps 1.0 --output-dir outputs

# Copy every generated .txt file to Drive. Files are copied flat by base name,
# so equally named files from different output folders would overwrite each other.
for txt_path in glob.glob('outputs/**/*.txt', recursive=True):
    dst = os.path.join(OUTPUT_BASE, os.path.basename(txt_path))
    shutil.copy(txt_path, dst)
    print('已複製到雲端:', dst)


Mounted at /content/drive
2025-08-17 07:15:04.230141: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755414904.265595    4973 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755414904.271965    4973 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755414904.288142    4973 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755414904.288177    4973 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755414904.288181    4973 computation_placer.cc:1

In [None]:
# !python /content/main.py --video data/videos/your_video.mp4 --fps 1.0 --output-dir outputs


2025-08-17 07:35:16.219843: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755416116.240114   10142 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755416116.246665   10142 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755416116.262542   10142 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755416116.262571   10142 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1755416116.262575   10142 computation_placer.cc:177] computation placer alr

# Grounding DINO 偵測視覺化


In [None]:
import glob
from pathlib import Path
from PIL import Image
from IPython.display import display

# Collect the annotated frames written by the pipeline, in sorted path order.
annotated_paths = glob.glob('outputs/*/detections/*.jpg')
annotated_paths.sort()
print('找到標註影像檔案數量:', len(annotated_paths))

# Show at most the first five annotated images inline.
preview_count = min(5, len(annotated_paths))
for index in range(preview_count):
    path = annotated_paths[index]
    print('預覽:', path)
    display(Image.open(path))


In [None]:
# 預留測試區塊。
