In [1]:
import torch
from depth_anything_v2.dpt_finetune import DepthAnythingV2

# Build the model.
# Prefer the GPU; fall back to the CPU (and warn) when CUDA is unavailable.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
    print("⚠️ 警告：CUDA 不可用，模型將在 CPU 上運行。")

# Instantiate with the 'vitl' encoder and move the module to DEVICE
# (nn.Module.to returns the module itself, so chaining is safe).
model = DepthAnythingV2(encoder='vitl').to(DEVICE)

# Print the full module tree framed by separator rules.
separator = "=" * 80
print(separator)
print("模型架構總覽")
print(separator)
print(model)
print("\n" + separator)

模型架構總覽
DepthAnythingV2(
  (pretrained): DinoVisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
      (norm): Identity()
    )
    (blocks): ModuleList(
      (0-23): 24 x NestedTensorBlock(
        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (attn): MemEffAttention(
          (qkv): Linear(in_features=1024, out_features=3072, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1024, out_features=1024, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ls1): LayerScale()
        (drop_path1): Identity()
        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (drop): Dropout(p=0.0, inpl

In [2]:
# Inspect each depth head: report its class name and parameter count.
separator = "=" * 80
print("\n" + separator, "4 個獨立的 Depth Heads", separator, sep="\n")

for i, head in enumerate(model.depth_heads):
    # Count every parameter element registered on this head.
    num_params = sum(param.numel() for param in head.parameters())
    print(
        f"\n--- Head {i} ---",
        f"類型: {type(head).__name__}",
        f"參數量: {num_params:,}",
        sep="\n",
    )

print("\n" + separator)


4 個獨立的 Depth Heads

--- Head 0 ---
類型: DPTHead
參數量: 30,947,009

--- Head 1 ---
類型: DPTHead
參數量: 30,947,009

--- Head 2 ---
類型: DPTHead
參數量: 30,947,009

--- Head 3 ---
類型: DPTHead
參數量: 30,947,009



In [3]:
# Parameter statistics: encoder vs. each depth head vs. the whole model.
separator = "=" * 80
print("\n" + separator)
print("各模組參數統計")
print(separator)

# Encoder = the `pretrained` sub-module of the model.
encoder_params = sum(p.numel() for p in model.pretrained.parameters())
print(f"Encoder (DINOv2): {encoder_params:,} 參數")

# Collect one parameter count per head, then report and total them.
per_head_params = [
    sum(p.numel() for p in head.parameters())
    for head in model.depth_heads
]
for i, head_params in enumerate(per_head_params):
    print(f"Depth Head {i}: {head_params:,} 參數")
total_head_params = sum(per_head_params)

# Whole-model total, used as the denominator for the percentage shares.
total_params = sum(p.numel() for p in model.parameters())
print(f"\n總參數量: {total_params:,}")
print(f"Encoder 比例: {encoder_params/total_params*100:.1f}%")
print(f"所有 Heads 比例: {total_head_params/total_params*100:.1f}%")
print(separator)


各模組參數統計
Encoder (DINOv2): 304,368,640 參數
Depth Head 0: 30,947,009 參數
Depth Head 1: 30,947,009 參數
Depth Head 2: 30,947,009 參數
Depth Head 3: 30,947,009 參數

總參數量: 428,156,676
Encoder 比例: 71.1%
所有 Heads 比例: 28.9%


In [4]:
# Show the detailed layer layout of a single depth head (Head 0 as the example).
separator = "=" * 80
print("\n" + separator, "單一 Depth Head 的詳細架構 (以 Head 0 為例)", separator, sep="\n")
print(model.depth_heads[0])
print(separator)


單一 Depth Head 的詳細架構 (以 Head 0 為例)
DPTHead(
  (projects): ModuleList(
    (0): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
    (1): Conv2d(1024, 512, kernel_size=(1, 1), stride=(1, 1))
    (2-3): 2 x Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1))
  )
  (resize_layers): ModuleList(
    (0): ConvTranspose2d(256, 256, kernel_size=(4, 4), stride=(4, 4))
    (1): ConvTranspose2d(512, 512, kernel_size=(2, 2), stride=(2, 2))
    (2): Identity()
    (3): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  )
  (scratch): Module(
    (layer1_rn): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (layer2_rn): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (layer3_rn): Conv2d(1024, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (layer4_rn): Conv2d(1024, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (refinenet1): FeatureFusionBlock(
      (out

In [7]:
# Check the output shapes produced by a forward pass.
print("\n" + "=" * 80)
print("測試 Forward Pass")
print("=" * 80)

# Build a dummy input on whatever device the model's weights live on.
# A hard-coded .cuda() raises on a CPU-only machine, even though the model
# itself falls back to the CPU when CUDA is unavailable.
# 518 is a multiple of the 14-pixel patch size used by the patch embedding.
model_device = next(model.parameters()).device
dummy_input = torch.randn(1, 3, 518, 518).to(model_device)
print(f"輸入形狀: {dummy_input.shape}")

# Forward pass without gradient tracking; this is a shape check only.
with torch.no_grad():
    outputs = model(dummy_input)

# Report the container type, how many outputs came back, and each one's shape.
print(f"\n輸出類型: {type(outputs)}")
print(f"輸出數量: {len(outputs)} 個 depth maps")
for i, output in enumerate(outputs):
    print(f"Head {i} 輸出形狀: {output.shape}")

print("=" * 80)


測試 Forward Pass
輸入形狀: torch.Size([1, 3, 518, 518])
<class 'list'>

輸出類型: <class 'list'>
輸出數量: 4 個 depth maps
Head 0 輸出形狀: torch.Size([1, 518, 518])
Head 1 輸出形狀: torch.Size([1, 518, 518])
Head 2 輸出形狀: torch.Size([1, 518, 518])
Head 3 輸出形狀: torch.Size([1, 518, 518])
