In [None]:
import torch
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

# Text-to-video smoke test: load the HunyuanVideo pipeline, generate one short
# clip from a prompt and write it to "output.mp4".
#
# NOTE(review): the original note at this spot was cut off after
# "we need to use". Both loaders below pin the Hub revision 'refs/pr/18' --
# presumably that is what the note was about; confirm with the author.
model_id = "tencent/HunyuanVideo"
hub_revision = "refs/pr/18"

# The transformer is loaded on its own in bfloat16 and then injected into the
# pipeline, whose remaining components are loaded in float16.
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    model_id,
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
    revision=hub_revision,
)
pipe = HunyuanVideoPipeline.from_pretrained(
    model_id,
    transformer=transformer,
    revision=hub_revision,
    torch_dtype=torch.float16,
)

pipe.vae.enable_tiling()
pipe.to("cuda")

# Generation settings gathered in one place so they are easy to tweak.
generation_kwargs = dict(
    prompt="A cat walks on the grass, realistic",
    height=320,
    width=512,
    num_frames=61,
    num_inference_steps=30,
)
result = pipe(**generation_kwargs)
output = result.frames[0]  # frames of the first entry in the returned batch

# Original author's note: the video frames are in [0, 255].
export_to_video(output, "output.mp4", fps=15)

In [2]:
import diffusers

# Print the installed diffusers release so the environment is recorded next
# to the results.
# NOTE(review): the recorded output of this cell is 0.31.0, while other cells
# import HunyuanVideoPipeline / AutoencoderKLHunyuanVideo -- confirm that this
# release actually ships those classes.
installed_version = diffusers.__version__
print(installed_version)


0.31.0


In [None]:


# Import necessary libraries
import os
import torch
import imageio
import numpy as np
# from diffusers import AutoencoderKLHunyuan
from diffusers import AutoencoderKLHunyuanVideo
from torchvision import transforms
from typing import Tuple
import logging

# Quieten the 'imageio_ffmpeg' logger: only records at ERROR level or above
# are let through, which hides its warnings.
ffmpeg_logger = logging.getLogger('imageio_ffmpeg')
ffmpeg_logger.setLevel(logging.ERROR)

# Configuration Class
class Config:
    """
    Configuration class to manage parameters for Hunyuan operations.

    Attributes:
    - model_path (str): Path to the Hunyuan model.
    - device (str): Computation device string (e.g. 'cuda', 'cuda:0', 'cpu').
    - dtype (torch.dtype): Resolved torch dtype used for computation.
    - source_base (str): Base folder containing the source datasets.
    - output_base (str): Base folder where output datasets are saved.
    """

    # Accepted spellings for the `dtype` argument and the torch dtype each
    # one selects. Lookup is case-insensitive.
    _DTYPE_ALIASES = {
        "float16": torch.float16,
        "fp16": torch.float16,
        "half": torch.float16,
        "bfloat16": torch.bfloat16,
        "bf16": torch.bfloat16,
        "float32": torch.float32,
        "fp32": torch.float32,
    }

    def __init__(
        self,
        model_path: str,
        device: str = "cuda",
        dtype: str = "float16",
        source_base: str = "../resources/videos/",
        output_base: str = "./output_videos/"
    ):
        """
        Initializes the configuration with default or specified parameters.

        Parameters:
        - model_path (str): Path to the Hunyuan model.
        - device (str): Computation device ('cuda' or 'cpu').
        - dtype (str): Data type for computation: 'float16', 'bfloat16' or
          'float32' (short forms 'fp16', 'half', 'bf16', 'fp32' are accepted
          too). A `torch.dtype` object is also accepted and used as-is.
        - source_base (str): Path to the base folder containing source datasets.
        - output_base (str): Path to the base folder where output datasets will be saved.

        Raises:
        - ValueError: If `dtype` is not one of the recognised names.
        """
        self.model_path = model_path
        self.device = device
        self.dtype = self._resolve_dtype(dtype)
        self.source_base = source_base
        self.output_base = output_base

    @classmethod
    def _resolve_dtype(cls, dtype):
        """Maps a dtype name (or an existing torch.dtype) to a torch.dtype."""
        if isinstance(dtype, torch.dtype):
            return dtype
        key = str(dtype).strip().lower()
        if key not in cls._DTYPE_ALIASES:
            # Fail loudly: previously every unrecognised name silently
            # selected bfloat16, which hid typos and 'float32' requests.
            raise ValueError(
                f"Unsupported dtype {dtype!r}; expected one of {sorted(cls._DTYPE_ALIASES)}"
            )
        return cls._DTYPE_ALIASES[key]

# Hunyuan Class
class Hunyuan:
    """
    Encodes and decodes videos with the HunyuanVideo VAE
    (`AutoencoderKLHunyuanVideo`) and writes the reconstructions to disk.

    Attributes:
    - config (Config): Configuration the instance was built from.
    - device (torch.device): Device the model and the input tensors live on.
    - dtype (torch.dtype): Dtype the model is loaded in and inputs are cast to.
    - model: The loaded VAE, with slicing and tiling enabled.
    - transform: Per-frame `transforms.ToTensor()` conversion.
    """

    # Lower-case file extensions (including the dot) that can be reconstructed.
    SUPPORTED_VIDEO_EXTENSIONS = ('.mp4',)

    def __init__(self, config: "Config"):
        """
        Initializes the Hunyuan model based on the provided configuration.

        Creates `config.output_base` if it does not exist, loads the VAE from
        `config.model_path` in `config.dtype` and moves it to `config.device`.

        Parameters:
        - config (Config): Configuration object containing necessary parameters.
        """
        self.config = config
        self.device = torch.device(self.config.device)
        self.dtype = self.config.dtype

        # Ensure output base directory exists
        os.makedirs(self.config.output_base, exist_ok=True)

        # Load the pre-trained model
        self.model = AutoencoderKLHunyuanVideo.from_pretrained(
            self.config.model_path,
            torch_dtype=self.dtype
        ).to(self.device)
        print(f"Model loaded successfully from {self.config.model_path}.")

        # Enable optimizations
        self.model.enable_slicing()
        self.model.enable_tiling()

        # Define transformation (applied to every decoded frame)
        self.transform = transforms.ToTensor()

    def preprocess_video(self, video_path: str) -> Tuple[torch.Tensor, float, int, Tuple[int, int]]:
        """
        Reads a video file and converts it into a tensor, extracting metadata.

        Parameters:
        - video_path (str): Path to the video file.

        Returns:
        - Tuple[torch.Tensor, float, int, Tuple[int, int]]:
            - Preprocessed video tensor of shape (B, C, T, H, W) with B == 1,
              on `self.device` and cast to `self.dtype`.
            - Frames per second (fps) of the input video.
            - Number of frames in the input video.
            - Resolution (height, width) of the input video.

        Raises:
        - ValueError: If the video contains no frames.
        """
        video_reader = imageio.get_reader(video_path, "ffmpeg")
        try:
            meta_data = video_reader.get_meta_data()
            fps = meta_data.get('fps', 30)  # Default to 30 if fps not found
            frames = [self.transform(frame) for frame in video_reader]
        finally:
            # Release the reader even when decoding a frame fails.
            video_reader.close()

        if not frames:
            raise ValueError(f"No frames found in video: {video_path}")

        num_frames = len(frames)
        resolution = frames[0].shape[1], frames[0].shape[2]  # (Height, Width)

        # NOTE(review): frames reach the VAE in the value range produced by
        # ToTensor (presumably [0, 1]) and save_video clips to [0, 1] as well.
        # Diffusers pipelines usually feed their VAEs inputs in [-1, 1] --
        # confirm which range this checkpoint expects before trusting
        # reconstruction metrics.
        # stack -> (T, C, H, W); permute -> (C, T, H, W); unsqueeze -> (1, C, T, H, W)
        frames_tensor = torch.stack(frames).to(self.device).permute(1, 0, 2, 3).unsqueeze(0).to(self.dtype)
        return frames_tensor, fps, num_frames, resolution

    def encode(self, frames_tensor: torch.Tensor) -> torch.Tensor:
        """
        Encodes video frames into latent representations.

        Parameters:
        - frames_tensor (torch.Tensor): Video frames tensor.

        Returns:
        - torch.Tensor: Encoded latent tensor.
        """
        with torch.no_grad():
            # Element 0 of the encode output is sampled to obtain the latents.
            # NOTE(review): sampling is presumably stochastic and no seed is
            # set here; for repeatable evaluation consider a deterministic
            # choice (e.g. the distribution's mode) -- TODO confirm.
            encoded_frames = self.model.encode(frames_tensor)[0].sample()
        return encoded_frames

    def decode(self, encoded_tensor: torch.Tensor) -> torch.Tensor:
        """
        Decodes latent representations back into video frames.

        Parameters:
        - encoded_tensor (torch.Tensor): Encoded latent tensor.

        Returns:
        - torch.Tensor: Decoded video frames tensor.
        """
        with torch.no_grad():
            decoded_frames = self.model.decode(encoded_tensor).sample
        return decoded_frames

    @staticmethod
    def _frames_to_uint8(tensor: torch.Tensor) -> np.ndarray:
        """
        Converts the first item of a (B, C, T, H, W) tensor into uint8 frames.

        Values are clipped to [0, 1], scaled by 255 and rounded to the nearest
        integer. Rounding (instead of truncating) keeps values such as 0.999
        at 255 rather than 254.

        Returns:
        - np.ndarray: Array of shape (T, H, W, C) and dtype uint8.
        """
        frames = tensor[0].detach().to(dtype=torch.float32).permute(1, 2, 3, 0).cpu().numpy()
        return np.rint(np.clip(frames, 0, 1) * 255).astype(np.uint8)

    def save_video(self, tensor: torch.Tensor, output_path: str, fps: float, original_num_frames: int, resolution: Tuple[int, int]):
        """
        Saves the decoded video frames to a video file and checks frame consistency.

        Parameters:
        - tensor (torch.Tensor): Decoded video frames tensor of shape (B, C, T, H, W);
          only the first batch item is written.
        - output_path (str): Path to save the output video.
        - fps (float): Frames per second for the output video.
        - original_num_frames (int): Number of frames in the input video.
        - resolution (Tuple[int, int]): Resolution (height, width) of the input video.

        Raises:
        - AssertionError: If the frame count or the resolution of the decoded
          video differs from the input's.
        """
        frames = self._frames_to_uint8(tensor)  # (T, H, W, C)

        num_output_frames = frames.shape[0]
        assert num_output_frames == original_num_frames, (
            f"Frame count mismatch: input {original_num_frames} vs output {num_output_frames}"
        )

        # Check if resolution matches
        output_resolution = frames.shape[1], frames.shape[2]
        assert output_resolution == resolution, (
            f"Resolution mismatch: input {resolution} vs output {output_resolution}"
        )

        writer = imageio.get_writer(output_path, fps=fps, codec='libx264')
        try:
            for frame in frames:
                writer.append_data(frame)
        finally:
            # Always finalise the file, even if appending a frame fails.
            writer.close()
        print(f"Saved decoded video to {output_path} with {num_output_frames} frames at {output_resolution} resolution and {fps} fps.")

    def reconstruct_video(self, video_path: str, output_path: str, idx: int = 0):
        """
        Encodes and decodes a single video, then saves the reconstructed video.
        Supports only .mp4 videos; other files are reported and skipped.

        Parameters:
        - video_path (str): Path to the source video.
        - output_path (str): Path to save the reconstructed video.
        - idx (int): Position of the video within a batch. Used only for
          logging: a progress line is printed when `idx % 50 == 0`.
        """
        file_extension = os.path.splitext(video_path)[1].lower()

        if file_extension in self.SUPPORTED_VIDEO_EXTENSIONS:
            frames_tensor, fps, num_frames, resolution = self.preprocess_video(video_path)
            if idx % 50 == 0:
                print(f"Processing {idx} Video: {video_path} | FPS: {fps} | Frames: {num_frames} | Resolution: {resolution}")
            encoded = self.encode(frames_tensor)
            decoded = self.decode(encoded)
            self.save_video(decoded, output_path, fps, num_frames, resolution)
        else:
            print(f"Unsupported file format: {file_extension}. Skipping file: {video_path}")

    def reconstruct_folder_videos(self, source_folder: str, output_folder: str):
        """
        Processes all videos in the source folder by encoding and decoding them,
        then saves the reconstructed videos to the output folder.

        Parameters:
        - source_folder (str): Path to the folder containing source videos.
        - output_folder (str): Path to the folder where reconstructed videos
          will be saved. Created if it does not exist.
        """
        # Sort the videos lexicographically (works correctly for zero-padded filenames)
        videos = sorted(
            f for f in os.listdir(source_folder)
            if os.path.splitext(f)[1].lower() in self.SUPPORTED_VIDEO_EXTENSIONS
        )

        if not videos:
            print(f"No supported videos found in source folder: {source_folder}")
            return

        # Do not rely on the caller having created the destination folder.
        os.makedirs(output_folder, exist_ok=True)

        for idx, video in enumerate(videos, 1):
            source_video_path = os.path.join(source_folder, video)
            output_video_path = os.path.join(output_folder, video)
            self.reconstruct_video(source_video_path, output_video_path, idx)

        print("Finished reconstructing all videos in the folder.")


In [None]:

# Processing Script Outside the Hunyuan Class
# Walks every dataset folder under the source base and reconstructs the videos
# it contains into a folder of the same name under the output base.

# Initialize configuration.
# The locations below are machine-specific. Each one can be overridden with an
# environment variable so the notebook runs elsewhere without editing this
# cell; the defaults are the values the notebook was originally run with.
config = Config(
    model_path=os.environ.get(
        "HUNYUAN_VAE_PATH",
        "/home/maij/.cache/huggingface/hub/models--tencent--HunyuanVideo/snapshots/2a15b5574ee77888e51ae6f593b2ceed8ce813e5/vae",
    ),
    device=os.environ.get("HUNYUAN_DEVICE", "cuda:2"),  # or "cpu"
    # TODO : we should report bfloat16 result, on v100 test we use fp16 for now
    dtype="float16",  # or "bfloat16"
    source_base=os.environ.get(
        "HUNYUAN_SOURCE_BASE",
        "/home/maij/fall_2024/sora3r/Open-Sora/data/vae_eval_bench/processed_gt_v3",
    ),
    output_base=os.environ.get(
        "HUNYUAN_OUTPUT_BASE",
        "/home/maij/fall_2024/sora3r/Open-Sora/data/vae_eval_bench/model_recon/hunyuan",
    ),
)

# Initialize Hunyuan
hunyuan_vae = Hunyuan(config)
cog_video = hunyuan_vae  # old (misleading) name kept for cells that still use it

# Iterate through each dataset folder in the source base directory.
# Sorted so the processing order is the same on every run; os.listdir alone
# returns entries in arbitrary order.
for dataset in sorted(os.listdir(config.source_base)):
    dataset_source_path = os.path.join(config.source_base, dataset)
    dataset_output_path = os.path.join(config.output_base, dataset)

    # Check if it's a directory
    if not os.path.isdir(dataset_source_path):
        continue  # Skip if not a directory

    print(f"Processing dataset: {dataset}")

    # Ensure the output dataset directory exists
    os.makedirs(dataset_output_path, exist_ok=True)

    # Reconstruct all videos in the current dataset folder
    hunyuan_vae.reconstruct_folder_videos(dataset_source_path, dataset_output_path)

print("All datasets have been processed successfully.")
