# Run Musubi Tuner (HunyuanVideo LoRA training) in Runpod
# NoteBook Created by Voiid (https://github.com/official-imvoiid)

📌 Recommended Setup:
- GPU: A6000 Ada  
- Storage: 70GB total  
  - 50GB on persistent  
  - 20GB on temporary


In [None]:
# CELL 1: Initial Setup

import os
import subprocess
import sys
from pathlib import Path

def run_command(cmd, check=True):
    """Execute *cmd* through the shell, echoing the command and its output.

    Args:
        cmd: Command line string handed to the shell.
        check: When true (the default), a non-zero exit status raises.

    Returns:
        The ``subprocess.CompletedProcess`` with stdout/stderr captured as text.

    Raises:
        Exception: If ``check`` is true and the command exits non-zero.
    """
    print(f"Running: {cmd}")
    completed = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    failed = completed.returncode != 0

    if completed.stdout:
        print(completed.stdout)

    if failed:
        # stderr is only surfaced for failing commands
        if completed.stderr:
            print(f"Error: {completed.stderr}")
        if check:
            raise Exception(f"Command failed: {cmd}")

    return completed

# Create the workspace and switch into it so the relative paths below resolve there
workspace = Path("/workspace")
workspace.mkdir(parents=True, exist_ok=True)
os.chdir(workspace)

# Clone musubi-tuner only when it is not already checked out, so re-running
# this cell is safe. run_command raises when the clone fails, which stops the
# cell before it can create folders in (and report success for) a broken checkout.
repo_dir = workspace / "musubi-tuner"
if (repo_dir / ".git").exists():
    print(f"Repository already present at {repo_dir}, skipping clone")
else:
    run_command("git clone https://github.com/kohya-ss/musubi-tuner.git")

# Folders used by the later cells for the dataset and for caches / outputs
for folder_name in ("inputfolder", "outputfolder"):
    (repo_dir / folder_name).mkdir(parents=True, exist_ok=True)

print("✅ Initial Setup complete!")


In [None]:
# CELL 2: Install Dependencies
# Every step shells out through IPython's `!` syntax; the steps run top to bottom.

# Upgrade pip itself before installing anything else
!python -m pip install --upgrade pip

# Install PyTorch with CUDA support
# (wheels are taken from the PyTorch cu124 index given on the command line)
print("📦 Installing PyTorch with CUDA support...")
!pip install torch torchvision --index-url https://download.pytorch.org/whl/cu124

# Install optional dependencies
# Most packages are pinned to exact versions; pillow and sageattention only
# set a minimum, and ascii-magic / matplotlib / tensorboard are unpinned.
print("📦 Installing optional dependencies...")
!pip install accelerate==1.6.0 av==14.0.1 bitsandbytes==0.45.4 diffusers==0.32.1 einops==0.7.0 huggingface-hub==0.30.0 opencv-python==4.10.0.84 "pillow>=10.2.0" safetensors==0.4.5 "sageattention>=1.0.6" toml==0.10.2 tqdm==4.67.1 transformers==4.46.3 voluptuous==0.15.2 ftfy==6.3.1 easydict==1.13 ascii-magic matplotlib tensorboard

print("✅ Installation complete!")

In [None]:
import os
# Install the cloned musubi-tuner checkout itself in editable mode;
# `pip install -e .` must run from the repository root
os.chdir('/workspace/musubi-tuner')
print("📦 Installing requirements...")
!pip install -e .
print("✅ Installation complete!")

In [None]:
# CELL 3: Configure Accelerate for 48GB VRAM
from pathlib import Path

# Single-machine, single-process settings with bf16 mixed precision
accelerate_config = """compute_environment: LOCAL_MACHINE
distributed_type: 'NO'
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 1
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
"""

# The settings are stored as default_config.yaml under
# ~/.cache/huggingface/accelerate, creating that folder when it is missing
config_dir = Path.home() / ".cache/huggingface/accelerate"
config_dir.mkdir(parents=True, exist_ok=True)
(config_dir / "default_config.yaml").write_text(accelerate_config)

print("⚙️ Accelerate configured for 48GB VRAM single GPU training")

In [None]:
# CELL 4: Download Models to ckpt folder for 48GB VRAM

from huggingface_hub import hf_hub_download
from pathlib import Path
import os
import subprocess
import shutil

# Root directory that contains the musubi-tuner checkout; every model path in
# this cell is built from it (adjust this to your actual workspace location)
workspace = Path("/workspace")  # or Path.cwd() if you want current directory

# Define run_command function
def run_command(cmd, check=False):
    """Execute a shell command and return the completed process.

    This cell redefines the notebook's earlier ``run_command``; the ``check``
    parameter is accepted here as well so calls written against either
    definition keep working after this cell has run.

    Args:
        cmd: Command line string handed to the shell.
        check: When true, raise if the command exits non-zero. Defaults to
            False so the download fallbacks can inspect ``returncode``
            themselves instead of handling an exception.

    Returns:
        The ``subprocess.CompletedProcess`` with stdout/stderr captured as text.

    Raises:
        Exception: If ``check`` is true and the command exits non-zero.
    """
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"Command failed: {cmd}")
        print(f"Error: {result.stderr}")
        if check:
            raise Exception(f"Command failed: {cmd}")
    return result

# Checkpoint layout, mirroring the HunyuanVideo directory structure
ckpt_dir = workspace / "musubi-tuner/ckpt"
hunyuan_dir = ckpt_dir / "hunyuan-video-t2v-720p"

# One folder per model component
transformers_dir = hunyuan_dir / "transformers"
vae_dir = hunyuan_dir / "vae"
text_encoder_dir = ckpt_dir / "text_encoder"
text_encoder_2_dir = ckpt_dir / "text_encoder_2"

# Make sure every folder exists before anything is downloaded into it
for model_dir in (transformers_dir, vae_dir, text_encoder_dir, text_encoder_2_dir):
    model_dir.mkdir(parents=True, exist_ok=True)

# Target file for each model
dit_path = transformers_dir / "mp_rank_00_model_states.pt"
vae_path = vae_dir / "pytorch_model.pt"
llm_path = text_encoder_dir / "model.safetensors"  # LLaMA text encoder
clip_path = text_encoder_2_dir / "model.safetensors"  # CLIP text encoder

print("📥 Downloading HunyuanVideo models for 48GB VRAM setup...")

# Download DiT model (main transformer)
# Skipped when the target file is already present.
if not dit_path.exists():
    print("Downloading DiT transformer model (this may take a while)...")
    try:
        # The repo-relative filename is recreated under ckpt_dir, which is
        # exactly where dit_path points.
        hf_hub_download(
            repo_id="tencent/HunyuanVideo",
            filename="hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt",
            local_dir=str(ckpt_dir),
            local_dir_use_symlinks=False
        )
        print("✅ DiT model downloaded successfully")
    except Exception as e:
        print(f"HF download failed, trying wget: {e}")
        result = run_command(f"wget -O {dit_path} https://huggingface.co/tencent/HunyuanVideo/resolve/main/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt")
        if result.returncode == 0:
            print("✅ DiT model downloaded via wget")
        else:
            # wget -O creates the output file even when the transfer fails;
            # remove the stub so later exists() checks do not mistake it for
            # a finished download.
            dit_path.unlink(missing_ok=True)
            print("❌ All DiT download methods failed")

# Download VAE model
# Skipped when the target file is already present.
if not vae_path.exists():
    print("Downloading VAE model...")
    try:
        # The repo-relative filename is recreated under ckpt_dir, which is
        # exactly where vae_path points.
        hf_hub_download(
            repo_id="tencent/HunyuanVideo",
            filename="hunyuan-video-t2v-720p/vae/pytorch_model.pt",
            local_dir=str(ckpt_dir),
            local_dir_use_symlinks=False
        )
        print("✅ VAE model downloaded successfully")
    except Exception as e:
        print(f"HF download failed, trying wget: {e}")
        result = run_command(f"wget -O {vae_path} https://huggingface.co/tencent/HunyuanVideo/resolve/main/hunyuan-video-t2v-720p/vae/pytorch_model.pt")
        if result.returncode == 0:
            print("✅ VAE model downloaded via wget")
        else:
            # wget -O creates the output file even when the transfer fails;
            # remove the stub so later exists() checks do not mistake it for
            # a finished download.
            vae_path.unlink(missing_ok=True)
            print("❌ All VAE download methods failed")

# Download LLaMA Text Encoder (text_encoder)
# Guarded by its own existence check so this step neither depends on whether
# the VAE had to be downloaded nor re-fetches a file that is already in place.
if not llm_path.exists():
    print("Downloading LLaMA text encoder...")
    try:
        # Try the Comfy-Org repackaged version first as it's more reliable
        print("Trying Comfy-Org repackaged version...")
        downloaded_file = hf_hub_download(
            repo_id="Comfy-Org/HunyuanVideo_repackaged",
            filename="split_files/text_encoders/llava_llama3_fp16.safetensors",
            local_dir=None,  # Don't preserve directory structure
            local_dir_use_symlinks=False
        )
        # Use shutil.copy2 instead of os.rename to handle cross-device links
        shutil.copy2(downloaded_file, llm_path)
        print("✅ LLaMA text encoder downloaded from Comfy-Org repo")
    except Exception as e:
        print(f"Comfy-Org download failed: {e}")
        # Try alternative LLaMA model
        try:
            print("Trying alternative LLaMA model...")
            downloaded_file = hf_hub_download(
                repo_id="Comfy-Org/HunyuanVideo_repackaged",
                filename="split_files/text_encoders/llama3_8b_instruct_fp16.safetensors",
                local_dir=None,
                local_dir_use_symlinks=False
            )
            shutil.copy2(downloaded_file, llm_path)
            print("✅ Alternative LLaMA text encoder downloaded")
        except Exception as e2:
            print(f"Alternative LLaMA download also failed: {e2}")
            # Try direct wget as last resort
            try:
                print("Trying direct wget download...")
                result = run_command(f"wget -O {llm_path} https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged/resolve/main/split_files/text_encoders/llava_llama3_fp16.safetensors")
                if result.returncode == 0:
                    print("✅ LLaMA text encoder downloaded via wget")
                else:
                    # A failed wget -O (or an interrupted copy above) can
                    # leave a partial file behind; remove it so it is not
                    # reported as a finished download.
                    llm_path.unlink(missing_ok=True)
                    print("❌ All LLaMA download methods failed")
            except Exception as e3:
                llm_path.unlink(missing_ok=True)
                print(f"Wget download failed: {e3}")

# Download CLIP Text Encoder (text_encoder_2)
# Skipped when the target file is already present.
if not clip_path.exists():
    print("Downloading CLIP text encoder...")
    try:
        # Try the Comfy-Org repackaged version first
        print("Trying Comfy-Org repackaged version...")
        downloaded_file = hf_hub_download(
            repo_id="Comfy-Org/HunyuanVideo_repackaged",
            filename="split_files/text_encoders/clip_l.safetensors",
            local_dir=None,  # Don't preserve directory structure
            local_dir_use_symlinks=False
        )
        # Use shutil.copy2 instead of os.rename to handle cross-device links
        shutil.copy2(downloaded_file, clip_path)
        print("✅ CLIP text encoder downloaded from Comfy-Org repo")
    except Exception as e:
        print(f"Comfy-Org CLIP download failed: {e}")
        # Try direct wget as fallback
        try:
            print("Trying direct wget download...")
            result = run_command(f"wget -O {clip_path} https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged/resolve/main/split_files/text_encoders/clip_l.safetensors")
            if result.returncode == 0:
                print("✅ CLIP text encoder downloaded via wget")
            else:
                # A failed wget -O (or an interrupted copy above) can leave a
                # partial file behind; remove it so it is not reported as a
                # finished download.
                clip_path.unlink(missing_ok=True)
                print("❌ All CLIP download methods failed")
        except Exception as e2:
            clip_path.unlink(missing_ok=True)
            print(f"Wget download failed: {e2}")

# Verify downloads
print("\n🔍 Verifying downloads...")

# A model counts as downloaded only when its file exists and is non-empty:
# a failed `wget -O` leaves a zero-byte file that a bare exists() would accept.
model_files = {
    "DiT Transformer": dit_path,
    "VAE": vae_path,
    "LLaMA Text Encoder": llm_path,
    "CLIP Text Encoder": clip_path,
}
models_status = {
    model_name: model_file.is_file() and model_file.stat().st_size > 0
    for model_name, model_file in model_files.items()
}
all_downloaded = all(models_status.values())

for model_name, exists in models_status.items():
    status = "✅" if exists else "❌"
    print(f"   {status} {model_name}")

if all_downloaded:
    print("\n🎉 All models downloaded successfully!")
else:
    print("\n⚠️  Some models failed to download. Check the errors above.")
    print("\nAlternative options:")
    print("1. Try running the script again (some downloads may be temporary failures)")
    print("2. Manually download missing files from:")
    print("   - https://huggingface.co/Comfy-Org/HunyuanVideo_repackaged")
    print("   - https://huggingface.co/tencent/HunyuanVideo")

print("\n📁 Final directory structure:")
print(f"   {ckpt_dir}/")
print("   ├── hunyuan-video-t2v-720p/")
print("   │   ├── transformers/")
print("   │   │   └── mp_rank_00_model_states.pt")
print("   │   └── vae/")
print("   │       └── pytorch_model.pt")
print("   ├── text_encoder/")
print("   │   └── model.safetensors")
print("   └── text_encoder_2/")
print("       └── model.safetensors")

# Additional helpful information
print("\n📊 Estimated total download size: ~26GB")
print("💾 Make sure you have sufficient disk space available")
# Only invite the user to continue when every model is in place; training and
# inference need all four files.
if all_downloaded:
    print("🚀 You can now proceed with HunyuanVideo training/inference")

In [None]:
# CELL 5: Create Dataset Configuration Template

# TOML template: shared defaults under [general], then a single video dataset
# that reads from inputfolder and caches into outputfolder
dataset_config = """# Dataset configuration for HunyuanVideo training
# Place your videos and captions in /workspace/musubi-tuner/inputfolder/

[general]
resolution = [768, 768]
caption_extension = ".txt"
batch_size = 1
enable_bucket = true
bucket_no_upscale = false

[[datasets]]
video_directory = "/workspace/musubi-tuner/inputfolder"
caption_extension = ".txt"
num_repeats = 1
frame_extraction = "head"
cache_directory = "/workspace/musubi-tuner/outputfolder"
"""

# Save the template next to the training scripts as dataset.toml
dataset_config_path = workspace / "musubi-tuner/dataset.toml"
dataset_config_path.write_text(dataset_config)

print(f"✅ Dataset configuration created at: {dataset_config_path}")


In [None]:
# CELL 6: Create Training Scripts with Direct Correct Paths 

# Cache latents script
# Each template below is the text of a bash script. It changes into the
# repository first, so the relative dataset.toml / ckpt/ paths resolve there.
# The doubled backslashes become a single shell line-continuation backslash.
cache_latents_script = f"""#!/bin/bash
cd /workspace/musubi-tuner
python src/musubi_tuner/cache_latents.py \\
    --dataset_config dataset.toml \\
    --vae ckpt/hunyuan-video-t2v-720p/vae/pytorch_model.pt \\
    --vae_chunk_size 64 \\
    --vae_tiling
"""

# Cache text encoder outputs script
# Passes both text encoder files (text_encoder and text_encoder_2).
cache_text_script = f"""#!/bin/bash
cd /workspace/musubi-tuner
python src/musubi_tuner/cache_text_encoder_outputs.py \\
    --dataset_config dataset.toml \\
    --text_encoder1 ckpt/text_encoder/model.safetensors \\
    --text_encoder2 ckpt/text_encoder_2/model.safetensors \\
    --batch_size 32
"""

# Training script optimized for 42GB VRAM
# NOTE(review): the rest of the notebook refers to 48GB VRAM — confirm which
# figure is intended.
# Launches hv_train_network.py through accelerate with a LoRA network module
# and saves results to outputfolder under the name Hunyuan-lora.
train_script = f"""#!/bin/bash
cd /workspace/musubi-tuner
accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 src/musubi_tuner/hv_train_network.py \\
    --dit ckpt/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt \\
    --dataset_config dataset.toml \\
    --sdpa \\
    --mixed_precision bf16 \\
    --fp8_base \\
    --optimizer_type adamw8bit \\
    --learning_rate 1e-4 \\
    --lr_scheduler cosine_with_restarts \\
    --lr_warmup_steps 100 \\
    --gradient_checkpointing \\
    --max_data_loader_n_workers 4 \\
    --persistent_data_loader_workers \\
    --network_module networks.lora \\
    --network_dim 64 \\
    --network_alpha 32 \\
    --timestep_sampling shift \\
    --discrete_flow_shift 7.0 \\
    --max_train_epochs 500 \\
    --save_every_n_epochs 50 \\
    --gradient_accumulation_steps 2 \\
    --max_grad_norm 1.0 \\
    --seed 42 \\
    --output_dir outputfolder \\
    --output_name Hunyuan-lora \\
    --logging_dir logs
"""

# Write each script into the repository root and mark it executable
repo_root = workspace / "musubi-tuner"
script_files = {
    "cache_latents.sh": cache_latents_script,
    "cache_text_encoders.sh": cache_text_script,
    "train.sh": train_script,
}

for script_name, script_content in script_files.items():
    script_path = repo_root / script_name
    script_path.write_text(script_content)
    run_command(f"chmod +x {script_path}")
    print(f"✅ Created executable script: {script_name}")

# Summary of the model paths the generated scripts point at
print("\n📁 Using direct paths based on your actual directory structure:")
for summary_line in (
    "├── VAE: ckpt/hunyuan-video-t2v-720p/vae/pytorch_model.pt",
    "├── DIT: ckpt/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt",
    "├── Text Encoder 1: ckpt/text_encoder/model.safetensors",
    "└── Text Encoder 2: ckpt/text_encoder_2/model.safetensors",
):
    print(summary_line)

In [None]:
# start cache latents
import os
os.chdir('/workspace/musubi-tuner')
print("📦 Installing requirements...")
!./cache_latents.sh

In [None]:
# start cache textencoder
import os
os.chdir('/workspace/musubi-tuner')
print("📦 Installing requirements...")
!./cache_text_encoders.sh

In [None]:
# start training
import os
os.chdir('/workspace/musubi-tuner')
print("📦 Installing requirements...")
!./train.sh

In [None]:
# Use Lora To Generate Videos

# for 5 second video (--video_length 121)

  # --video_length Frames (length of your video) 
  # formula 4 x n + 1
  # ig for 30 sec video 30 x 24 FPS +1 = 721 Frames
  # ig for 5 sec ideo 5 x 24 FPS +1 = 121 Frames

  # Please upload All your lora models at /workspace/musubi-tuner/ckpt/text_encoder_2/

!python hv_generate_video.py \
  --fp8 \
  --video_size 544 960 \
  --video_length 121 \
  --infer_steps 30 \
  --prompt "Your Prompt here" \
  --save_path /workspace/musubi-tuner/output \
  --output_type both \
  --dit /workspace/musubi-tuner/ckpt/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt \
  --vae /workspace/musubi-tuner/ckpt/hunyuan-video-t2v-720p/vae/pytorch_model.pt \
  --text_encoder1 /workspace/musubi-tuner/ckpt/text_encoder/model.safetensors \
  --text_encoder2 /workspace/musubi-tuner/ckpt/text_encoder_2/model.safetensors \
  --lora_weight /workspace/musubi-tuner/outputfolder/Name_of_your_Lora_here.safetensors \
  --lora_multiplier 1.0 \
  --attn_mode sdpa \
  --split_attn \
  --vae_chunk_size 32 \
  --vae_spatial_tile_sample_min_size 128 \
  --seed 1234