# imports

In [19]:
import tempfile
import os
import glob
import json
import cv2
import numpy as np
from tqdm import tqdm
import torch
import mmcv
from mmseg.apis import init_model, inference_model
from skimage.metrics import structural_similarity as ssim
import csv
import matplotlib.pyplot as plt


# global parameters

In [20]:
video_folder = "C:/Users/yutse/OneDrive/桌面/cg/dataset_video/video"  # folder containing the input videos
output_folder = "C:/Users/yutse/OneDrive/桌面/cg/image_score_test/output"  # root folder for generated outputs
output_npz = "C:/Users/yutse/OneDrive/桌面/cg/dataset_video/all_videos_data.npz"

# mmseg model config / checkpoint --- adjust these to the paths on your machine
mmseg_config = '../mmSeg_trained_models/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'
mmseg_checkpoint = '../mmSeg_trained_models/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'

# MiDaS model name (passed to torch.hub.load)
midas_model_name = "DPT_Hybrid"  # use DPT_Hybrid

# setup midas model

In [None]:
# ----------------------------------------------------
# Re-declared here so this cell can be re-run on its own after changing the model.
midas_model_name = "DPT_Hybrid"  # use DPT_Hybrid
os.makedirs(output_folder, exist_ok=True)

# ------------------- device selection -------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"使用裝置: {device}")

# ------------------- load MiDaS -------------------
print("載入 MiDaS 模型...")
try:
    midas = torch.hub.load("intel-isl/MiDaS", midas_model_name).to(device).eval()
    midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
    # Pick the preprocessing that matches the model family. The previous
    # `hasattr(..., "dpt_transform")` test was true for every model name, so
    # non-DPT models were silently given the DPT transform as well.
    if midas_model_name.startswith("DPT"):
        midas_transform = midas_transforms.dpt_transform
    elif midas_model_name == "MiDaS_small":
        midas_transform = midas_transforms.small_transform
    else:
        midas_transform = midas_transforms.default_transform
except Exception as e:
    # Chain the original exception so the real traceback is not lost.
    raise RuntimeError(f"載入 MiDaS 失敗: {e}") from e


使用裝置: cuda
載入 MiDaS 模型...


Using cache found in C:\Users\yutse/.cache\torch\hub\intel-isl_MiDaS_master
Using cache found in C:\Users\yutse/.cache\torch\hub\intel-isl_MiDaS_master


## setup seg model

In [22]:
# ------------------- load mmseg -------------------
print("載入 mmseg 模型...")
if not os.path.exists(mmseg_config) or not os.path.exists(mmseg_checkpoint):
    raise FileNotFoundError("請確認 mmseg 的 config 與 checkpoint 路徑是否正確。")
# Explicit device string for mmseg; this was previously computed but never used.
MMSEG_DEVICE = "cuda:0" if device.startswith("cuda") else "cpu"
SEG_MODEL = init_model(mmseg_config, mmseg_checkpoint, device=MMSEG_DEVICE)

# Class names and colour palette are read from the model's dataset metadata.
CLASSES = SEG_MODEL.dataset_meta.get('classes', None)
PALETTE = SEG_MODEL.dataset_meta.get('palette', None)
if CLASSES is None:
    raise RuntimeError("無法從 mmseg model 取得 classes metadata。")

# Class names counted as "human-made"; every other class is treated as "nature".
# Extend or edit this set to change the split.
HUMAN_MADE = {'road','sidewalk','building','wall','fence','pole',
            'traffic light','traffic sign','car','truck','bus','train',
            'motorcycle','bicycle'}
# ----------------------------------------------------


載入 mmseg 模型...
Loads checkpoint by local backend from path: ../mmSeg_trained_models/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth


# funcs

In [23]:
def iter_video_frames(folder_path, n_frames=1):
    """
    Iterate over frames of every ``.mp4`` file located directly in ``folder_path``.

    Videos are visited in sorted path order; the extension match is
    case-insensitive and the scan is not recursive. Videos that cannot be
    opened are reported with a warning and skipped.

    Args:
        folder_path: directory to scan.
        n_frames: sampling stride. Only frames whose 0-based index is a
            multiple of ``n_frames`` are yielded; the default of 1 yields
            every frame.

    Yields:
        (video_path, frame_index, frame) where ``frame_index`` is the true
        0-based position of ``frame`` inside its video and ``frame`` is the
        array returned by ``cv2.VideoCapture.read``.

    Raises:
        ValueError: if ``n_frames`` is smaller than 1.
    """
    if n_frames < 1:
        raise ValueError(f"n_frames must be >= 1, got {n_frames}")

    folder_path = os.path.abspath(folder_path)
    video_paths = sorted(
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.lower().endswith(".mp4")
    )

    for video_path in video_paths:
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                print(f"[Warning] 無法開啟影片: {video_path}")
                continue

            frame_idx = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                # Every frame is decoded, but only every n_frames-th one is
                # reported, so the yielded index always matches the frame.
                if frame_idx % n_frames == 0:
                    yield video_path, frame_idx, frame
                frame_idx += 1
        finally:
            # Runs even when the consumer stops iterating early.
            cap.release()

In [24]:
def to_bgr(frame, channel_order=None):
    """
    Convert an image to a BGR ``uint8`` array of shape (H, W, 3).

    Accepted inputs:
      - 2-D grayscale arrays (H, W)
      - 3-channel arrays (BGR or RGB)
      - 4-channel arrays (BGRA or RGBA); the alpha channel is dropped
      - non-``uint8`` data: values are multiplied by 255 when the maximum is
        <= 1.0, then clipped to [0, 255] and cast to ``uint8``

    Args:
        frame: array-like image.
        channel_order: ``"bgr"``, ``"rgb"`` (case-insensitive) or ``None``.
            When given, it states the channel order of a 3/4-channel input
            and no guessing takes place. When ``None`` (default, the original
            behaviour) the order is guessed: the input is treated as RGB(A)
            when the mean of channel 2 exceeds the mean of channel 0. That
            guess is wrong for e.g. a red-dominant BGR image, so pass the
            order explicitly whenever it is known.

    Returns:
        A new ``uint8`` array in BGR order.

    Raises:
        ValueError: if ``frame`` is None, ``channel_order`` is not one of the
            accepted values, or the array shape is not (H, W), (H, W, 3) or
            (H, W, 4).
    """
    if frame is None:
        raise ValueError("to_bgr() 收到 None")

    if channel_order is not None:
        channel_order = str(channel_order).lower()
        if channel_order not in ("bgr", "rgb"):
            raise ValueError(
                f"channel_order must be 'bgr', 'rgb' or None, got {channel_order!r}"
            )

    arr = np.array(frame)

    # Bring non-uint8 data into the 0..255 uint8 range.
    if arr.dtype != np.uint8:
        arr = np.clip(arr * 255 if arr.max() <= 1.0 else arr, 0, 255).astype(np.uint8)

    # Grayscale -> BGR
    if arr.ndim == 2:
        return cv2.cvtColor(arr, cv2.COLOR_GRAY2BGR)

    if arr.ndim != 3 or arr.shape[2] not in (3, 4):
        raise ValueError(f"無法處理的影像形狀: {arr.shape}")

    if channel_order is None:
        # Legacy heuristic; it cannot be fully reliable (see docstring).
        is_rgb = np.mean(arr[..., 2]) > np.mean(arr[..., 0])
    else:
        is_rgb = channel_order == "rgb"

    if arr.shape[2] == 3:
        return cv2.cvtColor(arr, cv2.COLOR_RGB2BGR) if is_rgb else arr.copy()

    # Four channels: drop alpha while converting.
    return cv2.cvtColor(arr, cv2.COLOR_RGBA2BGR if is_rgb else cv2.COLOR_BGRA2BGR)

## depth funcs

In [25]:
# ---------- depth helper ----------
def compute_depth_map(rgb_frame, midas_model, transform_fn, device, out_size=None):
    """
    Run a depth model on one frame and return the prediction as a 2-D map.

    Args:
        rgb_frame: H x W x 3 image array; documented by the original author as
            uint8 RGB. Only ``rgb_frame.shape[:2]`` is used directly here, the
            pixel data is handed to ``transform_fn``.
        midas_model: callable applied to the transformed tensor.
        transform_fn: callable turning ``rgb_frame`` into the model input
            tensor (must return an object with ``.to(device)``).
        device: device passed to ``.to``.
        out_size: optional ``(H_out, W_out)``. When given and different from
            the frame size, the map is resized with bicubic interpolation.

    Returns:
        ``float32`` 2-D array holding the raw (un-normalised) prediction,
        resampled to the frame size or to ``out_size``.
    """
    frame_hw = tuple(rgb_frame.shape[:2])
    input_tensor = transform_fn(rgb_frame).to(device)
    with torch.no_grad():
        pred = midas_model(input_tensor)
        # Strip leading singleton dimensions so `pred` is (H', W').
        if pred.ndim == 4:
            pred = pred.squeeze(0).squeeze(0)
        elif pred.ndim == 3:
            pred = pred.squeeze(0)
        resized = torch.nn.functional.interpolate(
            pred.unsqueeze(0).unsqueeze(0),
            size=frame_hw, mode="bicubic", align_corners=False
        )
        # Index the two leading dimensions explicitly. A blanket squeeze()
        # would also drop H or W when either equals 1 and return a 1-D array.
        pred_resized = resized[0, 0].cpu().numpy()
    if out_size is not None:
        H_out, W_out = out_size
        if (pred_resized.shape[0], pred_resized.shape[1]) != (H_out, W_out):
            pred_resized = cv2.resize(pred_resized, (W_out, H_out), interpolation=cv2.INTER_CUBIC)
    return pred_resized.astype(np.float32)


## seg funcs

In [26]:
def compare_images(*images, titles=None):
    """
    Show any number of images side by side in a single row.

    Args:
        *images: images to display. A 3-D array whose last dimension is 3 is
            treated as BGR and converted to RGB before display; anything else
            is handed to ``imshow`` unchanged.
        titles: optional sequence of titles, one per image. When omitted, the
            first three images are titled '原圖', '語意分割', '後處理'. Images
            without a title (more than three images, or a ``titles`` sequence
            shorter than ``images``) are titled '圖像<position>'.

    Returns:
        None. Does nothing when called without images.
    """
    n = len(images)
    if n == 0:
        return

    default_titles = ['原圖', '語意分割', '後處理']
    labels = list(titles) if titles is not None else default_titles[:n]
    # Pad missing titles with the 1-based position of the image.
    labels += [f'圖像{i+1}' for i in range(len(labels), n)]

    fig, axes = plt.subplots(1, n, figsize=(5*n, 5), squeeze=False)
    for ax, img, label in zip(axes[0], images, labels):
        # Require ndim == 3 so a 2-D image that happens to be 3 pixels wide
        # is not mistaken for a BGR image.
        if isinstance(img, np.ndarray) and img.ndim == 3 and img.shape[-1] == 3:
            ax.imshow(mmcv.bgr2rgb(img))
        else:
            ax.imshow(img)
        ax.set_title(label)
        ax.axis('off')

    fig.tight_layout()
    plt.show()

In [27]:
def semantic_to_instance_masks(seg_map: np.ndarray, target_class: int, min_area=50):
    """
    Split one semantic class into per-instance masks via connected components.

    Args:
        seg_map: H x W integer label map.
        target_class: class id whose pixels are split into instances.
        min_area: components with fewer non-zero pixels than this are dropped.

    Returns:
        (instance_masks, instance_id_map) where ``instance_masks`` is a list
        of H x W ``uint8`` masks (1 inside the instance) and
        ``instance_id_map`` is an ``int32`` map holding the 1-based id of the
        kept instance covering each pixel (0 elsewhere).
    """
    foreground = np.where(seg_map == target_class, 255, 0).astype(np.uint8)

    # Component 0 is the background, so candidates start at 1.
    n_components, component_map = cv2.connectedComponents(foreground)
    candidates = (
        (component_map == component_id).astype(np.uint8)
        for component_id in range(1, n_components)
    )
    kept = [m for m in candidates if cv2.countNonZero(m) >= min_area]

    # Kept instances are renumbered consecutively from 1.
    id_map = np.zeros_like(seg_map, dtype=np.int32)
    for new_id, m in enumerate(kept, start=1):
        id_map[m == 1] = new_id

    return kept, id_map

In [28]:
def compute_seg_frame(frame: np.ndarray, model=None):
    """
    Run mmseg inference on one frame (no visualisation) and split the result
    into human-made vs. nature pixels.

    The frame is handed to ``inference_model`` as an in-memory array. The
    earlier implementation wrote it to a temporary JPEG first, which was
    slower and fed lossy-compressed pixels to the model.

    Args:
        frame: H x W x 3 ``uint8`` image (BGR, per the original docstring).
        model: segmentation model; defaults to the global ``SEG_MODEL``.

    Returns:
        (frame, seg_map, seg_classified, (human_ratio, nature_ratio)):
          - frame: the input, returned unchanged
          - seg_map: ``int64`` H x W class-index map
          - seg_classified: ``int64`` H x W map, 1 where the class name is in
            ``HUMAN_MADE`` and 0 otherwise (including labels outside
            ``CLASSES``)
          - human_ratio / nature_ratio: fraction of all pixels in each group;
            pixels whose label is outside ``CLASSES`` count towards neither

    Raises:
        ValueError: if no model is available or the inference result does not
            contain a segmentation map.
    """
    if model is None:
        try:
            model = SEG_MODEL
        except NameError:
            raise ValueError("未提供 model，且全域 SEG_MODEL 未定義。") from None

    result = inference_model(model, frame)
    sd = result[0] if isinstance(result, (list, tuple)) else result

    # Locate the prediction inside the result object.
    if hasattr(sd, "pred_sem_seg"):
        seg_map = sd.pred_sem_seg
    elif isinstance(sd, dict) and "pred_sem_seg" in sd:
        seg_map = sd["pred_sem_seg"]
    elif isinstance(sd, np.ndarray):
        seg_map = sd
    else:
        raise ValueError("無法從 mmseg 結果取得 pred_sem_seg。")

    # Unwrap container objects exposing `.data`. Arrays and tensors are left
    # alone: `ndarray.data` is a raw memoryview, not the label map.
    if not isinstance(seg_map, (np.ndarray, torch.Tensor)) and hasattr(seg_map, "data"):
        seg_map = seg_map.data
    if isinstance(seg_map, torch.Tensor):
        seg_map = seg_map.squeeze().cpu().numpy()
    seg_map = np.asarray(seg_map)

    # A 3-D map is taken to be (C, H, W) scores and reduced to class indices.
    if seg_map.ndim == 3:
        seg_map = seg_map.argmax(axis=0)

    seg_map = seg_map.astype(np.int64)

    # label_type[i] == 1 when class i is human-made, else 0.
    label_type = np.array([1 if cls in HUMAN_MADE else 0 for cls in CLASSES],
                          dtype=np.int64)

    # Only labels that index into CLASSES can be looked up; anything else
    # (for example an ignore index) stays 0 and is excluded from the ratios.
    known = (seg_map >= 0) & (seg_map < label_type.size)
    seg_classified = np.zeros_like(seg_map, dtype=np.int64)
    seg_classified[known] = label_type[seg_map[known]]

    total_pixels = max(seg_map.size, 1)  # guard against an empty map
    human_pixels = int(np.count_nonzero(seg_classified == 1))
    nature_pixels = int(np.count_nonzero(known)) - human_pixels

    human_ratio = human_pixels / total_pixels
    nature_ratio = nature_pixels / total_pixels

    return frame, seg_map, seg_classified, (human_ratio, nature_ratio)


## img change rate

In [29]:
# Module-level state: the frame seen by the previous compute_img_change() call
# (None until the first call, or after a caller resets it).
_prev_frame = None

def compute_img_change(frame: np.ndarray):
    """
    Measure how much ``frame`` differs from the frame passed on the previous call.

    The previous frame is kept in the module-level ``_prev_frame``; every call
    replaces it with a copy of ``frame``.

    Args:
        frame: H x W x 3 ``uint8`` image in BGR order.

    Returns:
        dict with the keys
            'L1'      : mean absolute per-pixel difference
            '1-SSIM'  : one minus the SSIM of the two grayscale frames
            'FlowMag' : mean magnitude of the Farneback optical flow
        All three values are 0 when there is nothing to compare against: on
        the first call, or when the stored frame has a different shape than
        ``frame`` (e.g. a new video with another resolution). In that case
        ``frame`` simply becomes the new reference.
    """
    global _prev_frame
    if _prev_frame is None or _prev_frame.shape != frame.shape:
        _prev_frame = frame.copy()
        return {"L1": 0, "1-SSIM": 0, "FlowMag": 0}

    # ----- L1 difference -----
    l1_val = np.mean(np.abs(_prev_frame.astype(np.float32) - frame.astype(np.float32)))

    # ----- 1 - SSIM -----
    prev_gray = cv2.cvtColor(_prev_frame, cv2.COLOR_BGR2GRAY)
    curr_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    one_minus_ssim = 1.0 - ssim(prev_gray, curr_gray)

    # ----- optical-flow magnitude -----
    # The 8-bit grayscale images are passed directly, which is the input
    # format the OpenCV documentation specifies for Farneback.
    flow = cv2.calcOpticalFlowFarneback(prev_gray, curr_gray,
                                        None,
                                        pyr_scale=0.5, levels=3, winsize=15,
                                        iterations=3, poly_n=5, poly_sigma=1.2, flags=0)
    mag, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
    flow_mag_mean = mag.mean()

    # The current frame becomes the reference for the next call.
    _prev_frame = frame.copy()

    return {"L1": l1_val, "1-SSIM": one_minus_ssim, "FlowMag": flow_mag_mean}


## viz funcs

In [30]:
def viz_hw(arr, title=None):
    """
    Display a 2-D array as a grayscale image.

    Args:
        arr: array of shape (H, W).
        title: figure title; "Visualization" is used when empty or None.

    Raises:
        ValueError: if ``arr`` is not 2-D.
    """
    if arr.ndim != 2:
        raise ValueError("Input 必須是 H×W 的 2D numpy array")

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.imshow(arr, cmap="gray")
    ax.set_title(title or "Visualization")
    ax.axis("off")
    fig.tight_layout()
    plt.show()

In [37]:

def viz_seg_results(img_rgb, seg_map, seg_classified=None, classes=None, show_compare=True, processed_color_map=None):
    """
    Colourise a semantic-segmentation result (no inference is run here).

    Args:
        img_rgb: original H x W x 3 ``uint8`` image. It is forwarded unchanged
            to ``compare_images``, which treats 3-channel arrays as BGR, so
            despite the parameter name a BGR frame displays correctly.
        seg_map: H x W integer class-index map (as returned by
            ``compute_seg_frame``).
        seg_classified: accepted for backward compatibility; it does not
            influence the rendered images.
        classes: class names indexed by the values in ``seg_map``. Defaults to
            the global ``CLASSES`` looked up at call time.
        show_compare: when True, display original / raw segmentation /
            human-vs-nature images side by side.
        processed_color_map: optional override for the human-vs-nature
            colours: a dict ``{class_index: (r, g, b)}`` or a list-like of
            RGB triplets. Classes without an entry use the raw palette colour.

    Returns:
        (seg_rgb, seg_processed_rgb): two H x W x 3 ``uint8`` RGB images, the
        raw class colouring and the processed colouring (by default red for
        classes in ``HUMAN_MADE`` and green for everything else).
    """
    if classes is None:
        # Resolved at call time so re-running the model cell is picked up.
        classes = CLASSES
    seg_map = np.asarray(seg_map)

    # Default RGB palette for the raw segmentation.
    palette = np.array([
        [128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156],
        [190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0],
        [107, 142, 35], [152, 251, 152], [70, 130, 180], [220, 20, 60],
        [255, 0, 0], [0, 0, 142], [0, 0, 70], [0, 60, 100],
        [0, 80, 100], [0, 0, 230], [119, 11, 32]
    ], dtype=np.uint8)

    n_classes = len(classes)
    if palette.shape[0] < n_classes:
        # More classes than colours: cycle through the palette.
        reps = int(np.ceil(n_classes / palette.shape[0]))
        palette = np.tile(palette, (reps, 1))[:n_classes]

    # Raw colouring: look up each pixel's class colour.
    seg_rgb = palette[seg_map]  # HxWx3

    # Processed colouring (human-made = red, nature = green) unless overridden.
    processed_colors = np.zeros_like(palette)
    if processed_color_map is None:
        for i, cls in enumerate(classes):
            processed_colors[i] = [255, 0, 0] if cls in HUMAN_MADE else [0, 255, 0]
    elif isinstance(processed_color_map, dict):
        for i in range(n_classes):
            processed_colors[i] = np.array(processed_color_map.get(i, palette[i]), dtype=np.uint8)
    else:
        custom = np.array(processed_color_map, dtype=np.uint8)
        n_custom = min(custom.shape[0], n_classes)
        processed_colors[:n_custom] = custom[:n_custom]
        processed_colors[n_custom:n_classes] = palette[n_custom:n_classes]

    seg_processed_rgb = processed_colors[seg_map]

    if show_compare:
        # compare_images and mmcv.imshow both interpret 3-channel arrays as
        # BGR, so the RGB renders are flipped for display only. Without this,
        # red and blue are swapped on screen.
        seg_bgr = np.ascontiguousarray(seg_rgb[..., ::-1])
        seg_processed_bgr = np.ascontiguousarray(seg_processed_rgb[..., ::-1])
        try:
            compare_images(img_rgb, seg_bgr, seg_processed_bgr, titles=['Original', 'Segmentation', 'Post-processed'])
        except Exception as exc:
            # Display is best-effort: report the problem and try a simple fallback.
            print(f"[Warning] compare_images failed ({exc}); falling back to mmcv.imshow")
            try:
                mmcv.imshow(img_rgb, win_name='Original')
                mmcv.imshow(seg_bgr, win_name='Segmentation')
                mmcv.imshow(seg_processed_bgr, win_name='Post-processed')
            except Exception as fallback_exc:
                print(f"[Warning] could not display segmentation results: {fallback_exc}")

    return seg_rgb, seg_processed_rgb

## print funcs

In [38]:
def print_stats(arr, name="array", print_log =False):
    """
    Summarise a 2-D numpy array.

    Args:
        arr: numpy array of shape (H, W).
        name: label printed in the header when ``print_log`` is True.
        print_log: when True, print the statistics as well as returning them.

    Returns:
        Tuple ``(max, q3, mean, q1, min)`` computed over all elements, where
        q1 / q3 are the 25th / 75th percentiles.

    Raises:
        ValueError: if ``arr`` is not a numpy array or is not 2-D.
    """
    if not isinstance(arr, np.ndarray):
        raise ValueError("arr 必須是 numpy array")
    if arr.ndim != 2:
        raise ValueError("arr 必須是 H×W 的 2D array")

    flat = arr.astype(float).ravel()
    q1, q3 = (np.percentile(flat, pct) for pct in (25, 75))

    if print_log:
        print(f"[{name}]")

    summary = (flat.max(), q3, flat.mean(), q1, flat.min())

    if print_log:
        # Labels are padded so the printed colons line up.
        for label, value in zip(("max ", "q3  ", "mean", "q1  ", "min "), summary):
            print(f"  {label}: {value:.6f}")

    return summary

# main func

In [39]:

# -------------------- 影片 metadata --------------------
def save_video_metadata(vid_path, npz_path=None):
    """
    Read basic metadata of one video and optionally store it in an npz file.

    Args:
        vid_path: path of the video file.
        npz_path: optional output path. When given, the metadata is written
            with ``np.savez`` under a single key equal to the video's base
            name (without extension). The value is a Python dict, so it is
            stored as an object array and must be read back with
            ``np.load(..., allow_pickle=True)``.

    Returns:
        ``{name: {"frame_count": int, "frame_size": (height, width),
        "file_name": str}}``, or None when the video cannot be opened.
    """
    cap = cv2.VideoCapture(vid_path)
    try:
        if not cap.isOpened():
            print(f"[Warning] Cannot open video: {vid_path}")
            return None

        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    finally:
        cap.release()

    file_name = os.path.basename(vid_path)
    name = os.path.splitext(file_name)[0]
    metadata = {
        name: {
            "frame_count": frame_count,
            "frame_size": (height, width),
            "file_name": file_name
        }
    }

    if npz_path:
        # No `allow_pickle` keyword here: on NumPy versions whose savez() has
        # no such parameter it would be stored as an extra array named
        # "allow_pickle" next to the metadata.
        np.savez(npz_path, **metadata)
        print(f"[INFO] Saved metadata to {npz_path}")

    return metadata

# -------------------- 處理單影片，每 n 幀計算 --------------------
def process_and_save_video(video_path, output_folder, n_frames=30):
    """
    Extract per-frame features from one video and save them as
    ``<output_folder>/<video name>.npz``.

    Every ``n_frames``-th frame (index 0, n_frames, 2*n_frames, ...) is
    processed into a 10-element ``float32`` vector:

        [depth max, depth q3, depth mean, depth q1, depth min,
         human_ratio, nature_ratio, L1, 1-SSIM, FlowMag]

    The vectors are stacked into an array of shape (num_sampled_frames, 10)
    stored under the key ``data``. Nothing is written when no frame could be
    read.

    Args:
        video_path: path of the video file.
        output_folder: directory for the npz file (created if missing).
        n_frames: sampling stride, must be >= 1.

    Raises:
        ValueError: if ``n_frames`` is smaller than 1.
    """
    global _prev_frame

    if n_frames < 1:
        raise ValueError(f"n_frames must be >= 1, got {n_frames}")

    video_name = os.path.splitext(os.path.basename(video_path))[0]
    output_npz = os.path.join(output_folder, f"{video_name}.npz")

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"[Warning] 無法開啟影片: {video_path}")
        cap.release()
        return

    # Forget the last frame of whatever was processed before, so the first
    # sampled frame of this video is not compared against another video.
    _prev_frame = None

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_idx = 0
    video_data = []

    try:
        # Some containers report no usable frame count; let tqdm run open-ended then.
        with tqdm(total=total_frames if total_frames > 0 else None,
                  desc=f"Processing {video_name}") as pbar:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_idx % n_frames == 0:
                    # Depth: OpenCV decodes to BGR, while compute_depth_map
                    # documents an RGB input, so convert first.
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    depth_result = compute_depth_map(frame_rgb, midas, midas_transform, device)
                    frame_vector = list(print_stats(depth_result))

                    # Segmentation (takes the BGR frame)
                    _, _, _, (human_ratio, nature_ratio) = compute_seg_frame(frame)
                    frame_vector.append(human_ratio)
                    frame_vector.append(nature_ratio)

                    # Change relative to the previously sampled frame
                    metrics = compute_img_change(frame)
                    if metrics is not None:
                        frame_vector.append(metrics["L1"])
                        frame_vector.append(metrics["1-SSIM"])
                        frame_vector.append(metrics["FlowMag"])
                    else:
                        frame_vector.extend([0.0, 0.0, 0.0])

                    video_data.append(np.array(frame_vector, dtype=np.float32))

                frame_idx += 1
                pbar.update(1)
    finally:
        # Release the capture even if one of the models raises.
        cap.release()

    # Stack into a 2-D array and write it out.
    if video_data:
        os.makedirs(output_folder, exist_ok=True)
        np.savez_compressed(output_npz, data=np.stack(video_data, axis=0))
        print(f"[INFO] Saved npz for video: {output_npz}")

# -------------------- 批次處理資料夾 --------------------
def process_folder_videos(folder_path, n_frames=30, output_folder=None):
    """
    Process every ``.mp4`` file directly inside ``folder_path``.

    For each video (in sorted order, extension matched case-insensitively)
    two files are written: ``<name>_meta.npz`` with the video metadata and
    ``<name>.npz`` with the per-frame features.

    Args:
        folder_path: directory containing the videos.
        n_frames: sampling stride forwarded to ``process_and_save_video``.
        output_folder: directory receiving the npz files. Defaults to
            ``folder_path`` (the original behaviour); a different directory
            is created if it does not exist.
    """
    if output_folder is None:
        output_folder = folder_path
    else:
        os.makedirs(output_folder, exist_ok=True)

    video_list = sorted(
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.lower().endswith(".mp4")
    )

    for video_path in video_list:
        stem = os.path.splitext(os.path.basename(video_path))[0]
        # Metadata first, then the per-frame features.
        save_video_metadata(video_path, npz_path=os.path.join(output_folder, f"{stem}_meta.npz"))
        process_and_save_video(video_path, output_folder=output_folder, n_frames=n_frames)


# main

# 執行所有video

In [41]:
# process_folder_videos(video_folder, n_frames=15)


# 查看特定 frame 的深度圖與語意分割圖

In [None]:
# Inspect a single frame: show its depth map and segmentation, and build its feature vector.
video_path = "C:/Users/yutse/OneDrive/桌面/cg/dataset_video/video/LFMG_04.MP4"
n_frames = 15
WANTED_FRAME_IDX = 777
video_name = os.path.splitext(os.path.basename(video_path))[0]
output_npz = os.path.join(output_folder, f"{video_name}.npz")

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise RuntimeError(f"無法開啟影片: {video_path}")

total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
frame_idx = 0
video_data = []

# Only one frame is analysed, so start the change metric from a clean state
# instead of comparing against whatever an earlier cell left behind.
_prev_frame = None

try:
    with tqdm(total=total_frames, desc=f"Processing {video_name}") as pbar:
        # Frames are decoded sequentially until the wanted index is reached.
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_idx == WANTED_FRAME_IDX:
                frame_vector = []

                # Depth: OpenCV decodes to BGR, compute_depth_map documents RGB input.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                depth_result = compute_depth_map(frame_rgb, midas, midas_transform, device)
                viz_hw(depth_result, title="Depth Map")
                frame_vector.extend(print_stats(depth_result))

                # Segmentation (takes the BGR frame)
                img_rgb, seg_map, seg_classified, (human_ratio, nature_ratio) = compute_seg_frame(frame)
                viz_seg_results(img_rgb, seg_map, seg_classified, show_compare=True)
                frame_vector.append(human_ratio)
                frame_vector.append(nature_ratio)

                # Image change
                metrics = compute_img_change(frame)
                if metrics is not None:
                    frame_vector.append(metrics["L1"])
                    frame_vector.append(metrics["1-SSIM"])
                    frame_vector.append(metrics["FlowMag"])
                else:
                    frame_vector.extend([0.0, 0.0, 0.0])

                frame_array = np.array(frame_vector, dtype=np.float32)
                video_data.append(frame_array)
                break
            frame_idx += 1
            pbar.update(1)
finally:
    # Always release the capture, even if a model call raises.
    cap.release()

if not video_data:
    print(f"[Warning] frame {WANTED_FRAME_IDX} was not found; only {frame_idx} frame(s) could be read")