In [None]:
import torch
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

# NOTE: both the transformer and the rest of the pipeline are loaded from the
# "refs/pr/18" revision of the Hub repository rather than from its default branch.
model_id = "tencent/HunyuanVideo"
hub_revision = "refs/pr/18"

# The transformer is loaded separately in bfloat16; every other pipeline
# component is loaded in float16.
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    revision=hub_revision,
    torch_dtype=torch.bfloat16,
)
pipe = HunyuanVideoPipeline.from_pretrained(
    model_id,
    transformer=transformer,
    revision=hub_revision,
    torch_dtype=torch.float16,
)

# Turn on VAE tiling, then move the whole pipeline onto the GPU.
pipe.vae.enable_tiling()
pipe.to("cuda")

generation_kwargs = {
    "prompt": "A cat walks on the grass, realistic",
    "height": 320,
    "width": 512,
    "num_frames": 61,
    "num_inference_steps": 30,
}
result = pipe(**generation_kwargs)
output = result.frames[0]  # frames of the first (and only) generated video

# Write the generated frames to disk as a 15 fps MP4.
export_to_video(output, "output.mp4", fps=15)

In [2]:
import diffusers
# Report which diffusers release the kernel is actually using.
# NOTE(review): the recorded output of this cell is 0.31.0 - confirm that this
# release ships the HunyuanVideoPipeline / HunyuanVideoTransformer3DModel classes
# imported in the first cell; a newer diffusers release may be required.
print(diffusers.__version__)


0.31.0


In [1]:
import os
import torch
import imageio
import numpy as np
from torchvision import transforms
from typing import Tuple, List, Dict, Type
import logging

logging.getLogger('imageio_ffmpeg').setLevel(logging.ERROR)

# Configuration Class
class Config:
    """Settings for a video-autoencoder reconstruction run.

    Attributes:
        model_class: Class exposing ``from_pretrained`` used to build the model.
        model_path: Hub id or local directory handed to ``from_pretrained``.
        device: Device string the model and inputs are moved to.
        dtype: Resolved ``torch.dtype`` used for the weights and inputs.
        batch_size: Default number of videos per batch (>= 1).
        custom_batch_size: Per-dataset batch-size overrides, keyed by a
            substring that is matched against the video paths.
        source_base: Directory holding the ground-truth videos.
        output_base: Directory the reconstructions are written to.

    Raises:
        ValueError: If ``dtype`` is not a recognised name or if any batch size
            is smaller than 1.
    """

    # Accepted spellings for ``dtype``. Anything else is rejected instead of
    # being silently interpreted as bfloat16.
    _DTYPE_ALIASES = {
        "float16": torch.float16,
        "fp16": torch.float16,
        "half": torch.float16,
        "bfloat16": torch.bfloat16,
        "bf16": torch.bfloat16,
        "float32": torch.float32,
        "fp32": torch.float32,
        "float": torch.float32,
    }

    def __init__(
        self,
        model_class: Type,
        model_path: str,
        device: str = "cuda",
        dtype: str = "float16",
        batch_size: int = 1,
        custom_batch_size: Dict[str, int] = None,
        source_base: str = "../resources/videos/",
        output_base: str = "./output_videos/"
    ):
        # A ready-made torch.dtype is accepted as-is; strings go through the alias table.
        if isinstance(dtype, torch.dtype):
            resolved_dtype = dtype
        else:
            dtype_key = str(dtype).strip().lower()
            if dtype_key not in self._DTYPE_ALIASES:
                raise ValueError(
                    f"Unsupported dtype {dtype!r}; expected one of "
                    f"{sorted(self._DTYPE_ALIASES)} or a torch.dtype."
                )
            resolved_dtype = self._DTYPE_ALIASES[dtype_key]

        # A zero step makes range() fail later with an unrelated message, and a
        # negative step makes the batch loop silently process nothing.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        overrides = custom_batch_size or {}
        for dataset_name, size in overrides.items():
            if size < 1:
                raise ValueError(
                    f"custom_batch_size[{dataset_name!r}] must be >= 1, got {size}"
                )

        self.model_class = model_class
        self.model_path = model_path
        self.device = device
        self.dtype = resolved_dtype
        self.batch_size = batch_size
        self.custom_batch_size = overrides
        self.source_base = source_base
        self.output_base = output_base

# GeneralAutoEncoderKL Class
class GeneralAutoEncoderKL:
    """Reconstruct videos by round-tripping them through a KL video autoencoder.

    The model is built with ``config.model_class.from_pretrained(config.model_path)``
    and moved to ``config.device`` in ``config.dtype``. ``reconstruct_videos`` is the
    entry point: read -> encode -> decode -> write.

    Value ranges: ``preprocess_videos`` returns, and ``save_videos`` expects, tensors
    on a [0, 1] scale. ``encode`` / ``decode`` are thin wrappers around the model and
    do not rescale anything; ``reconstruct_videos`` converts to and from the
    [-1, 1] scale used by diffusers VAEs around the model calls.

    Memory: slicing and tiling are off unless the config carries truthy
    ``enable_slicing`` / ``enable_tiling`` attributes. With both off, a whole clip is
    pushed through the model in one pass.
    """

    def __init__(self, config: "Config"):
        """Load the model described by ``config`` and create the output directory."""
        self.config = config
        self.device = torch.device(self.config.device)
        self.dtype = self.config.dtype

        os.makedirs(self.config.output_base, exist_ok=True)

        self.model = self.config.model_class.from_pretrained(
            self.config.model_path,
            torch_dtype=self.dtype
        ).to(self.device)

        print(f"Model loaded successfully from {self.config.model_path}.")

        # Default stays "disabled", exactly as before. The flags are optional
        # attributes so that a config object without them keeps working.
        if getattr(self.config, "enable_slicing", False):
            self.model.enable_slicing()
        else:
            self.model.disable_slicing()
        if getattr(self.config, "enable_tiling", False):
            self.model.enable_tiling()
        else:
            self.model.disable_tiling()

        # HWC uint8 frame -> CHW float tensor scaled to [0, 1].
        self.transform = transforms.ToTensor()

    @staticmethod
    def _to_model_range(frames: torch.Tensor) -> torch.Tensor:
        """Map a [0, 1] tensor to [-1, 1], keeping its dtype."""
        # The arithmetic runs in float32 so half-precision inputs are rounded only once.
        return (frames.float() * 2.0 - 1.0).to(frames.dtype)

    @staticmethod
    def _from_model_range(frames: torch.Tensor) -> torch.Tensor:
        """Map a [-1, 1] tensor back to the [0, 1] scale as float32 (not clamped)."""
        return frames.float() / 2.0 + 0.5

    def preprocess_videos(self, video_paths: List[str]) -> Tuple[torch.Tensor, List[float], List[int], List[Tuple[int, int]]]:
        """Read videos into one batch tensor.

        Args:
            video_paths: Videos to load. All of them must have the same frame
                count and resolution, otherwise stacking fails.

        Returns:
            ``(batch, fps_list, num_frames_list, resolutions)`` where ``batch`` has
            shape (B, C, T, H, W), values in [0, 1], and lives on ``self.device`` in
            ``self.dtype``; ``resolutions`` holds (H, W) per video.

        Raises:
            ValueError: If a video contains no frames.
        """
        batch_frames = []
        fps_list, num_frames_list, resolutions = [], [], []

        for video_path in video_paths:
            # The context manager closes the ffmpeg reader even if decoding fails.
            with imageio.get_reader(video_path, "ffmpeg") as video_reader:
                fps = video_reader.get_meta_data().get('fps', 30)
                frames = [self.transform(frame) for frame in video_reader]

            if not frames:
                raise ValueError(f"No frames found in video: {video_path}")

            fps_list.append(fps)
            num_frames_list.append(len(frames))
            resolutions.append((frames[0].shape[1], frames[0].shape[2]))

            # (T, C, H, W) -> (C, T, H, W); kept on the CPU until the batch is complete.
            batch_frames.append(torch.stack(frames).permute(1, 0, 2, 3))

        # A single cast-and-transfer avoids staging a float32 copy of every video on the GPU.
        batch_tensor = torch.stack(batch_frames).to(device=self.device, dtype=self.dtype)
        return batch_tensor, fps_list, num_frames_list, resolutions

    def encode(self, frames_tensor: torch.Tensor) -> torch.Tensor:
        """Encode a batch and return one sample drawn from the model's latent distribution.

        The input is passed to the model unchanged.
        """
        # NOTE(review): ``.sample()`` presumably draws random noise, so results are not
        # reproducible unless the torch RNG is seeded - confirm whether ``.mode()`` is wanted.
        with torch.no_grad():
            encoded_frames = self.model.encode(frames_tensor)[0].sample()
        return encoded_frames

    def decode(self, encoded_tensor: torch.Tensor) -> torch.Tensor:
        """Decode latents and return the model's raw ``.sample`` output."""
        with torch.no_grad():
            decoded_frames = self.model.decode(encoded_tensor).sample
        return decoded_frames

    def save_videos(self, tensor: torch.Tensor, output_paths: List[str], fps_list: List[float],
                    num_frames_list: List[int], resolutions: List[Tuple[int, int]]):
        """Write each video of a (B, C, T, H, W) batch on the [0, 1] scale to an H.264 file.

        Values outside [0, 1] are clipped. The frame count and resolution of every
        output are checked against the values recorded for the input.

        Raises:
            AssertionError: If the frame count or resolution differs from the input's.
        """
        tensor = tensor.to(dtype=torch.float32)
        for i, (fps, num_frames, resolution, output_path) in enumerate(zip(fps_list, num_frames_list, resolutions, output_paths)):
            # (C, T, H, W) -> (T, H, W, C)
            frames = tensor[i].permute(1, 2, 3, 0).cpu().numpy()
            # Round to the nearest level; a bare uint8 cast would truncate and darken the video.
            frames = np.rint(np.clip(frames, 0, 1) * 255).astype(np.uint8)

            num_output_frames = frames.shape[0]
            assert num_output_frames == num_frames, (
                f"Frame count mismatch: input {num_frames} vs output {num_output_frames}")

            output_resolution = frames.shape[1], frames.shape[2]
            assert output_resolution == resolution, (
                f"Resolution mismatch: input {resolution} vs output {output_resolution}")

            # The context manager finalises the file even if appending a frame fails.
            with imageio.get_writer(output_path, fps=fps, codec='libx264') as writer:
                for frame in frames:
                    writer.append_data(frame)

            print(f"Saved video to {output_path} with {num_output_frames} frames at {output_resolution} resolution and {fps} fps.")

    def reconstruct_videos(self, video_paths: List[str], output_paths: List[str]):
        """Encode, decode and save ``video_paths`` batch by batch.

        The batch size is ``config.batch_size`` unless a key of
        ``config.custom_batch_size`` occurs as a substring of any input path, in
        which case the first matching override is used.

        Args:
            video_paths: Input video files.
            output_paths: Destination files, one per input, in the same order.

        Raises:
            ValueError: If the two lists differ in length.
        """
        if len(video_paths) != len(output_paths):
            raise ValueError(
                f"Got {len(video_paths)} input videos but {len(output_paths)} output paths.")

        batch_size = self.config.batch_size
        for dataset_name, custom_size in self.config.custom_batch_size.items():
            if any(dataset_name in path for path in video_paths):
                batch_size = custom_size
                break

        for i in range(0, len(video_paths), batch_size):
            batch_video_paths = video_paths[i:i + batch_size]
            batch_output_paths = output_paths[i:i + batch_size]

            frames_tensor, fps_list, num_frames_list, resolutions = self.preprocess_videos(batch_video_paths)
            # diffusers VAEs work on [-1, 1]; feeding ToTensor's [0, 1] output directly
            # is off-distribution and skews the reconstruction.
            encoded = self.encode(self._to_model_range(frames_tensor))
            decoded = self._from_model_range(self.decode(encoded))
            self.save_videos(decoded, batch_output_paths, fps_list, num_frames_list, resolutions)


In [2]:
# * Testing code on H100 server
import os
import time
from datetime import datetime, timedelta

# Set Hugging Face home directory
# NOTE(review): presumably this has to run before `diffusers` is imported (the
# import sits further down in this cell) for the setting to take effect - confirm.
os.environ["HF_HOME"] = "/jfs/jinjie/huggingface"
def format_timedelta(td):
    """Format a timedelta as an ``HH:MM:SS`` string.

    Fractions of a second are dropped (truncated toward zero). Hours are not
    wrapped into days, so durations of 100 hours or more use more than two
    hour digits. Negative durations are rendered as ``-HH:MM:SS``.

    Args:
        td: The ``datetime.timedelta`` to format.

    Returns:
        The formatted duration string.
    """
    total_seconds = int(td.total_seconds())
    # divmod on a negative number borrows from the hours (-1 s -> "-1:59:59"),
    # so format the magnitude and prepend the sign instead.
    sign = "-" if total_seconds < 0 else ""
    hours, remainder = divmod(abs(total_seconds), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{sign}{hours:02d}:{minutes:02d}:{seconds:02d}"

from diffusers import AutoencoderKLHunyuanVideo


# Processing Script
if __name__ == "__main__":
    # Reconstruct every dataset folder under `source_base` with the HunyuanVideo VAE
    # and mirror the folder layout under `output_base`.
    # To benchmark a different VAE, swap `model_class` / `model_path` (and point
    # `output_base` at a separate folder).

    data_root = "/jfs/jinjie"

    config = Config(
        model_class=AutoencoderKLHunyuanVideo,
        model_path=f"{data_root}/huggingface/hub/models--tencent--HunyuanVideo/snapshots/2a15b5574ee77888e51ae6f593b2ceed8ce813e5/vae",
        device="cuda:5",
        dtype="bfloat16",
        batch_size=1,
        custom_batch_size={'imagenet_val': 1, 'textocr': 1, 'bridgedata_v2': 1, 'panda_70m': 1, 'real10k': 1},  # * variable resolution
        source_base=f"{data_root}/data/vae_eval_bench/processed_gt_v3",
        output_base=f"{data_root}/data/vae_eval_bench/model_recon/hunyuan_v2",
    )

    autoencoder = GeneralAutoEncoderKL(config)

    # os.listdir returns entries in arbitrary order; sorting makes the processing
    # order (and therefore the logs) reproducible between runs.
    for dataset in sorted(os.listdir(config.source_base)):
        dataset_source_path = os.path.join(config.source_base, dataset)
        dataset_output_path = os.path.join(config.output_base, dataset)

        # Skip stray files sitting next to the dataset folders.
        if not os.path.isdir(dataset_source_path):
            continue

        print(f"Processing dataset: {dataset}")
        # perf_counter is monotonic, so the measurement is immune to wall-clock adjustments.
        start_time = time.perf_counter()

        os.makedirs(dataset_output_path, exist_ok=True)

        video_files = sorted(f for f in os.listdir(dataset_source_path) if f.endswith('.mp4'))
        video_paths = [os.path.join(dataset_source_path, f) for f in video_files]
        output_paths = [os.path.join(dataset_output_path, f) for f in video_files]

        autoencoder.reconstruct_videos(video_paths, output_paths)

        time_taken = timedelta(seconds=time.perf_counter() - start_time)
        formatted_time = format_timedelta(time_taken)

        print(f"Finished processing {dataset}. Time taken: {formatted_time}")

    print("All datasets have been processed successfully.")


Model loaded successfully from /jfs/jinjie/huggingface/hub/models--tencent--HunyuanVideo/snapshots/2a15b5574ee77888e51ae6f593b2ceed8ce813e5/vae.
Processing dataset: BDD100K


OutOfMemoryError: CUDA out of memory. Tried to allocate 100.86 GiB. GPU 5 has a total capacity of 79.11 GiB of which 61.75 GiB is free. Including non-PyTorch memory, this process has 17.36 GiB memory in use. Of the allocated memory 16.42 GiB is allocated by PyTorch, and 279.87 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

: 