In [None]:
import torch
import torch.nn.functional as F
from safetensors.numpy import save_file, load_file
from omegaconf import OmegaConf
from transformers import AutoConfig
import cv2
from PIL import Image
import numpy as np
import json
import os
from diffusers import StableDiffusionPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipelineLegacy, StableDiffusionInpaintPipeline, DDIMScheduler, AutoencoderKL
from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel, DDIMScheduler
from diffusers import DDIMScheduler, DDPMScheduler, DPMSolverMultistepScheduler
#
from models.pipeline_mimicbrush import MimicBrushPipeline
from models.ReferenceNet import ReferenceNet
from models.depth_guider import DepthGuider
from mimicbrush import MimicBrush_RefNet
from dataset.data_utils import *
# DDIM noise scheduler instance; it is later handed to
# MimicBrushPipeline.from_pretrained through the `scheduler=` argument.
noise_scheduler = DDIMScheduler(
    # Beta (noise) schedule and the number of training timesteps it spans.
    beta_schedule="scaled_linear",
    beta_start=0.00085,
    beta_end=0.012,
    num_train_timesteps=1000,
    # Sampling-time options.
    steps_offset=1,
    set_alpha_to_one=False,
    clip_sample=False,
)


In [None]:
# Load the inference configuration; every model / checkpoint location used in
# this cell is read from its `model_path` section.
val_configs = OmegaConf.load('./configs/inference.yaml')

# === import Depth Anything ===
# The depthanything directory has to be on sys.path before the imports below.
# The membership check keeps a re-run of this cell from appending the same
# entry again.
import sys
if "./depthanything" not in sys.path:
    sys.path.append("./depthanything")
from torchvision.transforms import Compose
from depthanything.fast_import import depth_anything_model
from depthanything.depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet

# Image preprocessing chain: resize (518x518 target, aspect ratio kept,
# sides a multiple of 14, 'lower_bound' method, cubic interpolation), then
# per-channel mean/std normalisation, then PrepareForNet.
# NOTE(review): presumably this is the input transform for the depth model —
# confirm against where `transform` is consumed.
transform = Compose([
    Resize(
        width=518,
        height=518,
        resize_target=False,
        keep_aspect_ratio=True,
        ensure_multiple_of=14,
        resize_method='lower_bound',
        image_interpolation_method=cv2.INTER_CUBIC,
    ),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

# Restore the depth model weights. map_location="cpu" makes the checkpoint
# deserialize onto the CPU no matter which device it was saved from, so the
# load does not fail (or allocate memory) on a GPU this machine may not have;
# load_state_dict then copies the values into the model's own parameters.
# NOTE(review): torch.load unpickles the file — only use trusted checkpoints.
depth_anything_model.load_state_dict(
    torch.load(val_configs.model_path.depth_model, map_location="cpu")
)


# === load the checkpoint ===
# Locations of the individual model components, as named in the config.
base_model_path = val_configs.model_path.pretrained_imitativer_path
vae_model_path = val_configs.model_path.pretrained_vae_name_or_path
image_encoder_path = val_configs.model_path.image_encoder_path
ref_model_path = val_configs.model_path.pretrained_reference_path
mimicbrush_ckpt = val_configs.model_path.mimicbrush_ckpt_path


In [None]:

# Load the VAE and the UNet and cast both to half precision.
vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)
# NOTE(review): in_channels=13 together with ignore_mismatched_sizes=True
# presumably lets the UNet's input layer differ from the stored checkpoint —
# confirm against the training setup.
unet = UNet2DConditionModel.from_pretrained(
    base_model_path,
    subfolder="unet",
    in_channels=13,
    low_cpu_mem_usage=False,
    ignore_mismatched_sizes=True,
).to(dtype=torch.float16)

# Assemble the pipeline around the scheduler, VAE and UNet created above;
# the feature extractor and safety checker slots are explicitly left empty.
pipe = MimicBrushPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    scheduler=noise_scheduler,
    vae=vae,
    unet=unet,
    feature_extractor=None,
    safety_checker=None,
)

# Target GPU: the second device when at least two are visible (the original
# hard-coded choice), otherwise the first one so the notebook also runs on a
# single-GPU machine instead of failing in set_device.
DEVICE = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
torch.cuda.set_device(DEVICE)

# Remaining building blocks, then the wrapper that ties everything together.
depth_guider = DepthGuider()
referencenet = ReferenceNet.from_pretrained(ref_model_path, subfolder="unet").to(dtype=torch.float16)
mimicbrush_model = MimicBrush_RefNet(pipe, image_encoder_path, mimicbrush_ckpt,  depth_anything_model, depth_guider, referencenet, DEVICE)

In [None]:
# Scratch cell: a literal list of attribute names; evaluating it only displays
# the list. NOTE(review): presumably the pasted result of dir(mimicbrush_model)
# (names such as 'pipe', 'image_encoder', 'depth_guider' and 'referencenet' are
# read from that object in a later cell) — confirm, or drop the cell.
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'clip_image_processor', 'depth_estimator', 'depth_guider', 'device', 'generate', 'get_image_embeds', 'image_encoder', 'image_encoder_path', 'image_processor', 'image_proj_model', 'init_proj', 'load_checkpoint', 'model_ckpt', 'pipe', 'referencenet']

In [None]:
# Scratch cell: a literal list of attribute names without any double-underscore
# entries; evaluating it only displays the list.
# NOTE(review): presumably the public attributes of mimicbrush_model, kept for
# reference — confirm, or drop the cell.
['clip_image_processor', 'depth_estimator', 'depth_guider', 'device', 'generate', 
 'get_image_embeds', 'image_encoder', 'image_encoder_path', 'image_processor', 
 'image_proj_model', 'init_proj', 'load_checkpoint', 'model_ckpt', 'pipe', 'referencenet']

In [None]:
# Display the `components` of the wrapped pipeline; the parameter-counting cell
# iterates over the same attribute to look up each component by name.
mimicbrush_model.pipe.components

In [None]:
import torch
# from transformers import AutoModel
def report_parameter_counts(owner, component_names):
    """Print per-component and total parameter counts, and return the total.

    Args:
        owner: Object whose attributes are the components to inspect.
        component_names: Iterable of attribute names to look up on ``owner``.

    Returns:
        int: Sum of ``numel()`` over the parameters of every component that
        exposes a ``parameters`` attribute.
    """
    total = 0
    for name in component_names:
        component = getattr(owner, name)
        # Entries without a `parameters` attribute are left out of the count.
        if not hasattr(component, "parameters"):
            continue
        count = sum(p.numel() for p in component.parameters())
        print(f"{name}: {count:,} 参数 ({count/1e9:.3f}B)")
        total += count

    # Summary: raw total, then the same figure in millions and in billions.
    print(f"\n模型总参数量: {total:,}")
    print(f"以百万为单位: {total/1e6:.2f}M")
    print(f"以十亿为单位: {total/1e9:.2f}B")
    return total


# Pass 1: every component registered on the diffusion pipeline.
model = mimicbrush_model.pipe
total_params = report_parameter_counts(model, model.components)

# Pass 2: the extra modules held directly on the MimicBrush wrapper.
model = mimicbrush_model
_components = ['image_encoder', 'depth_guider', 'referencenet']
total_params = report_parameter_counts(model, _components)

vae: 83,653,863 参数 (0.084B)
text_encoder: 123,060,480 参数 (0.123B)
unet: 859,546,884 参数 (0.860B)

模型总参数量: 1,066,261,227
以百万为单位: 1066.26M
以十亿为单位: 1.07B
image_encoder: 632,076,800 参数 (0.632B)
depth_guider: 150,260 参数 (0.000B)
referencenet: 857,066,560 参数 (0.857B)

模型总参数量: 1,489,293,620
以百万为单位: 1489.29M
以十亿为单位: 1.49B
