<a href="https://colab.research.google.com/github/mark8888-android/projectvizion/blob/main/LTXV_0_9_7_13B_Distilled_I2V.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **IMAGE TO VIDEO WITH LTXV 0.9.7 13B DISTILLED**

- You can use the free T4 GPU to generate a 5-second video (120 frames) in about 10 minutes without upscaling.

- For upscaled videos, the T4 GPU can handle up to 1 second (24 frames, i.e. frames = 25).

- For longer or faster video generation with upscale, consider using higher-tier GPUs.

- All videos are generated at 24 FPS.

- To generate a video with n frames, set the frames value to n + 1. For example, to create a 5-second video (120 frames), set frames = 121.

- The output video will match the resolution of the uploaded image.

- Enabling the upscale_video option will generate a second, higher-quality version of the video at twice the original resolution.

In [None]:
# @title Prepare Environment
# !pip install --upgrade --quiet torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Install a fixed torch/torchvision version pair before the other packages.
!pip install torch==2.6.0 torchvision==0.21.0

%cd /content
# Selects which T5-XXL text-encoder file the download step later in this cell
# fetches: True -> t5xxl_fp16, False -> t5xxl_fp8_e4m3fn_scaled.
Use_t5xxl_fp16 = False

# Python packages installed up front (xformers is pinned to a fixed version).
!pip install -q torchsde einops diffusers accelerate xformers==0.0.29.post2
!pip install av
# ComfyUI itself (tag ComfyUI_v0.3.34) plus the GGUF and LTXVideo custom nodes,
# cloned into ComfyUI/custom_nodes.
!git clone --branch ComfyUI_v0.3.34 https://github.com/Isi-dev/ComfyUI
%cd /content/ComfyUI/custom_nodes
!git clone --branch forHidream https://github.com/Isi-dev/ComfyUI_GGUF.git
!git clone https://github.com/Isi-dev/ComfyUI_LTXVideo
# Each custom node ships its own requirements.txt; install from inside its folder.
%cd /content/ComfyUI/custom_nodes/ComfyUI_GGUF
!pip install -r requirements.txt
%cd /content/ComfyUI/custom_nodes/ComfyUI_LTXVideo
!pip install -r requirements.txt
# Leave the working directory at the ComfyUI root for the rest of the notebook.
%cd /content/ComfyUI
# aria2 provides the aria2c downloader used for the model files below.
# NOTE(review): ffmpeg is presumably needed for MP4 encoding - confirm.
!apt -y install -qq aria2 ffmpeg
from IPython.display import clear_output
# Hide the installation logs printed so far.
clear_output()



import torch
import numpy as np
from PIL import Image
import gc
import sys
import random
import os
import imageio
from google.colab import files
from IPython.display import display, HTML, Image as IPImage
sys.path.insert(0, '/content/ComfyUI')

from comfy import model_management

from nodes import (
    CheckpointLoaderSimple,
    CLIPLoader,
    CLIPTextEncode,
    VAELoader,
    VAEDecode,
    VAEDecodeTiled,
    LoadImage,
    ImageScale,
    SaveImage
)


from custom_nodes.ComfyUI_GGUF.nodes import (
    UnetLoaderGGUF
)

from comfy_extras.nodes_custom_sampler import (
    KSamplerSelect,
    SamplerCustom,
    RandomNoise
)

from comfy_extras.nodes_lt import (
    LTXVPreprocess,
    LTXVImgToVideo,
    LTXVScheduler,
    LTXVConditioning
)

from custom_nodes.ComfyUI_LTXVideo.stg import STGGuiderAdvancedNode

from custom_nodes.ComfyUI_LTXVideo.easy_samplers import LTXVBaseSampler

from custom_nodes.ComfyUI_LTXVideo.latent_upsampler import (
    LTXVLatentUpsamplerModelLoader,
    LTXVLatentUpsampler
)

from custom_nodes.ComfyUI_LTXVideo.latent_adain import LTXVAdainLatent

from custom_nodes.ComfyUI_LTXVideo.tiled_sampler import LTXVTiledSampler

from custom_nodes.ComfyUI_LTXVideo.film_grain import LTXVFilmGrain


# Model downloads. Every aria2c call uses -c (resume a partial download) and
# writes into the matching ComfyUI models/ sub-directory given by -d.

# Diffusion model (GGUF file) -> models/diffusion_models
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/wsbagnsv1/ltxv-13b-0.9.7-distilled-GGUF/resolve/main/ltxv-13b-0.9.7-distilled-Q6_K.gguf -d /content/ComfyUI/models/diffusion_models -o ltxv-13b-0.9.7-distilled-Q6_K.gguf
# Text encoder -> models/text_encoders; only one of the two variants is fetched,
# chosen by the Use_t5xxl_fp16 flag set earlier in this cell.
if Use_t5xxl_fp16:
    !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp16.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp16.safetensors
else:
    !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Isi99999/LTX-Video/resolve/main/t5xxl_fp8_e4m3fn_scaled.safetensors -d /content/ComfyUI/models/text_encoders -o t5xxl_fp8_e4m3fn_scaled.safetensors

# VAE weights -> models/vae
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/wsbagnsv1/ltxv-13b-0.9.7-dev-GGUF/resolve/main/ltxv-13b-0.9.7-vae-BF16.safetensors -d /content/ComfyUI/models/vae -o ltxv-13b-0.9.7-vae-BF16.safetensors

# Spatial latent upscaler -> models/upscale_models (loaded by generate_video
# only when upscale_video is enabled).
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Lightricks/LTX-Video/resolve/main/ltxv-spatial-upscaler-0.9.7.safetensors -d /content/ComfyUI/models/upscale_models -o ltxv-spatial-upscaler-0.9.7.safetensors

# Temporal upscaler download is disabled; nothing in this notebook loads it.
# !aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/Lightricks/LTX-Video/resolve/main/ltxv-temporal-upscaler-0.9.7.safetensors -d /content/ComfyUI/models/upscale_models -o ltxv-temporal-upscaler-0.9.7.safetensors

# Hide the download logs.
clear_output()

def clear_gpu_memory():
    """Drop global tensor references and release cached GPU memory.

    Every module-level name bound to a tensor, or to an object whose ``data``
    attribute is a tensor, is removed from the global namespace. The garbage
    collector then runs, and when CUDA is available the cached allocator
    blocks are released afterwards.
    """
    namespace = globals()
    # Iterate over a snapshot: entries are removed from the dict while scanning.
    for name, value in list(namespace.items()):
        if torch.is_tensor(value) or (
            hasattr(value, "data") and torch.is_tensor(value.data)
        ):
            # Remove the global binding itself. ``del`` on the loop variable
            # would only unbind the local alias and release nothing.
            del namespace[name]
    # The loop variable may still alias the last object inspected.
    value = None

    gc.collect()
    if torch.cuda.is_available():
        # Empty the cache only after the references are gone, otherwise the
        # blocks they occupy cannot be released.
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


def upload_image():
    """Handle image upload in Colab and store in /content/ComfyUI/input/.

    Opens the Colab upload dialog and places the first uploaded file in the
    ComfyUI input directory. Any additional files selected in the same dialog
    are left where Colab saved them.

    Returns:
        The destination path of the stored file, or None when nothing was
        uploaded.
    """
    from google.colab import files
    import os
    import shutil

    input_dir = '/content/ComfyUI/input'
    os.makedirs(input_dir, exist_ok=True)

    uploaded = files.upload()
    if not uploaded:
        return None

    # Only a single conditioning image is used, so take the first entry.
    filename = next(iter(uploaded))
    dest_path = f'{input_dir}/{filename}'

    # files.upload() saves into the current working directory, so resolve the
    # source against it instead of assuming the cwd is /content/ComfyUI.
    src_path = os.path.abspath(filename)
    if os.path.exists(src_path):
        shutil.move(src_path, dest_path)
    else:
        # Fall back to the in-memory bytes returned by files.upload().
        with open(dest_path, 'wb') as out_file:
            out_file.write(uploaded[filename])

    print(f"Image saved to: {dest_path}")
    return dest_path

def string_to_float(string):
    """Parse a comma-separated string of numbers.

    Returns a one-element tuple whose only item is the list of parsed floats.
    """
    values = []
    for token in string.split(','):
        values.append(float(token.strip()))
    return (values,)

def float_to_sigmas(float_list):
    """Convert a list of floats into a float32 tensor.

    Returns a one-element tuple whose only item is the tensor.
    """
    sigmas = torch.tensor(float_list, dtype=torch.float32)
    return (sigmas,)

def image_width_height(image):
    """Return ``(width, height)`` of a 3-D or 4-D image array.

    The height and width are read from the two axes that precede the last
    axis; a 4-D input has one extra leading axis that is ignored.

    Raises:
        ValueError: if the input is neither 3-D nor 4-D.
    """
    if image.ndim not in (3, 4):
        raise ValueError(f"Unsupported image shape: {image.shape}")
    height, width = image.shape[-3:-1]
    return width, height

def generate_video(
    image_path: str = None,
    positive_prompt: str = "A red fox moving gracefully",
    negative_prompt: str = "low quality, worst quality",
    width: int = 768,
    height: int = 512,
    seed: int = 0,
    steps: int = 30,
    cfg_scale: float = 2.05,
    sampler_name: str = "euler",
    length: int = 24,  # Number of frames
    fps: int = 24,
    upscale_video: bool = False
):
    """Generate a video from one image and display it in the notebook.

    Pipeline: encode both prompts, load the conditioning image, sample video
    latents, decode them, resize the frames back to the input image's
    resolution, write ``/content/output.mp4`` and display it. When
    ``upscale_video`` is true the latents are additionally upsampled, refined
    with a second sampling pass, decoded in tiles, resized to twice the input
    image's resolution and written to ``/content/upscaled.mp4``.

    Args:
        image_path: Path of the conditioning image. When None, the Colab
            upload dialog is opened via ``upload_image()``.
        positive_prompt: Text describing the desired video.
        negative_prompt: Text describing what to avoid.
        width: Generation width; must be divisible by 32. When both ``width``
            and ``height`` are 0 the size is chosen from the image's aspect
            ratio (768x512 landscape, 512x512 square, 512x768 portrait).
        height: Generation height; same rules as ``width``.
        seed: Seed passed to the noise generator.
        steps: Currently unused; accepted so existing callers keep working.
        cfg_scale: Currently unused; accepted so existing callers keep working.
        sampler_name: Name of the ComfyUI sampler to select.
        length: Number of frames handed to the base sampler.
        fps: Frame rate used when writing the MP4 files.
        upscale_video: Whether to also produce the upscaled video.

    Raises:
        ValueError: if ``width`` or ``height`` is not divisible by 32, or if
            no image path is available after the upload prompt.
    """
    # Validate with real exceptions (asserts vanish under ``-O``) and before
    # any model is loaded so bad input fails fast. 0 passes and means "auto".
    if width % 32 != 0:
        raise ValueError("Width must be divisible by 32")
    if height % 32 != 0:
        raise ValueError("Height must be divisible by 32")

    with torch.inference_mode():

        unet_loader = UnetLoaderGGUF()
        vae_loader = VAELoader()
        clip_loader = CLIPLoader()
        clip_encode_positive = CLIPTextEncode()
        clip_encode_negative = CLIPTextEncode()
        load_image = LoadImage()
        image_scaler = ImageScale()
        sampler_select = KSamplerSelect()
        random_noise = RandomNoise()
        conditioning = LTXVConditioning()
        vae_decode = VAEDecode()
        stg_guider_advanced = STGGuiderAdvancedNode()
        ltxv_base_sampler = LTXVBaseSampler()
        vae_decode_tiled = VAEDecodeTiled()
        upscale_model_loader = LTXVLatentUpsamplerModelLoader()
        latent_upsampler = LTXVLatentUpsampler()
        adain_latent = LTXVAdainLatent()
        tiled_sampler = LTXVTiledSampler()

        print("Loading Text_Encoder...")
        # Load whichever encoder file the setup cell downloaded; the flag
        # defaults to the fp8 variant when it is not defined.
        if globals().get("Use_t5xxl_fp16", False):
            text_encoder_name = "t5xxl_fp16.safetensors"
        else:
            text_encoder_name = "t5xxl_fp8_e4m3fn_scaled.safetensors"
        clip = clip_loader.load_clip(text_encoder_name, "ltxv", "default")[0]

        positive = clip_encode_positive.encode(clip, positive_prompt)[0]
        negative = clip_encode_negative.encode(clip, negative_prompt)[0]

        # The text encoder is no longer needed once both prompts are encoded.
        del clip
        torch.cuda.empty_cache()
        gc.collect()

        if image_path is None:
            print("Please upload an image file:")
            image_path = upload_image()
        if image_path is None:
            # Stop here instead of handing None to the image loader.
            raise ValueError("No image uploaded!")
        loaded_image = load_image.load_image(image_path)[0]

        # Original image size; the decoded frames are resized back to it.
        width_int, height_int = image_width_height(loaded_image)

        # width == height == 0 means "pick a size from the aspect ratio".
        if width == 0 and height == 0:
            if width_int > height_int:
                width = 768
                height = 512
            elif width_int == height_int:
                width = 512
                height = 512
            else:
                width = 512
                height = 768

        print("Loading UNet model...")
        model = unet_loader.load_unet("ltxv-13b-0.9.7-distilled-Q6_K.gguf")[0]

        # NOTE(review): 25.0 is presumably the conditioning frame rate - confirm
        # against LTXVConditioning.
        conditionedPositive, conditionedNegative = conditioning.append(positive, negative, 25.0)

        guider = stg_guider_advanced.get_guider(
            model,
            conditionedPositive,
            conditionedNegative,
            0.997,  # skip_steps_sigma_threshold
            True,    # cfg_star_rescale
            "1.0, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180",  # sigmas
            "1,1,1,1,1,1",  # cfg_values
            "0,0,0,0,0,0",  # stg_scale_values
            "1, 1, 1, 1, 1, 1",  # stg_rescale_values
            "[25], [35], [35], [42], [42], [42]"  # stg_layers_indices
        )[0]

        print("Loading VAE...")
        vae = vae_loader.load_vae("ltxv-13b-0.9.7-vae-BF16.safetensors")[0]

        selected_sampler = sampler_select.get_sampler(sampler_name)[0]

        # Fixed sigma schedule for the base sampling pass.
        sigmas = float_to_sigmas(
            string_to_float("1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250, 0.4219, 0.0")[0]
        )[0]

        noise = random_noise.get_noise(seed)[0]

        # Resize the conditioning image to the generation resolution.
        loaded_image = image_scaler.upscale(
            loaded_image,
            "lanczos",
            width,
            height,
            "disabled"
        )[0]

        try:

            print("Generating video...")

            sampled = ltxv_base_sampler.sample(
                model,
                vae,
                width,
                height,
                length,
                guider,
                selected_sampler,
                sigmas,
                noise,
                optional_cond_images=loaded_image,
                optional_cond_indices="0",
                strength=0.8,
                crop="disabled",
                crf=30,
                blur=1
            )[0]

            # Release the diffusion model before decoding.
            del model
            del guider
            del noise
            torch.cuda.empty_cache()
            gc.collect()

            try:
                print("Decodimg Latents...")
                decoded = vae_decode.decode(vae, sampled)[0].detach()
                if not upscale_video:
                    # The VAE and latents are only needed again for upscaling.
                    del vae
                    del sampled
                    torch.cuda.empty_cache()
                    gc.collect()

            except Exception as e:
                print(f"Error during decoding: {str(e)}")
                raise

            # Bring the frames back to the uploaded image's resolution.
            decoded = image_scaler.upscale(
                decoded,
                "lanczos",
                width_int,
                height_int,
                "disabled"
            )[0]

            output_path = "/content/output.mp4"
            frames_np = (decoded.cpu().numpy() * 255).astype(np.uint8)

            del decoded
            torch.cuda.empty_cache()
            gc.collect()

            with imageio.get_writer(output_path, fps=fps) as writer:
                for frame in frames_np:
                    writer.append_data(frame)

            print(f"\nBase Video generation complete! Displaying Video...")
            display_video(output_path)

            if upscale_video:
                # The diffusion model was released above; load it again for
                # the refinement pass.
                model = unet_loader.load_unet("ltxv-13b-0.9.7-distilled-Q6_K.gguf")[0]
                upscale_model = upscale_model_loader.load_model(
                    "ltxv-spatial-upscaler-0.9.7.safetensors", True, False
                )[0]

                tiled_guider = stg_guider_advanced.get_guider(
                    model,
                    conditionedPositive,
                    conditionedNegative,
                    0.997,  # skip_steps_sigma_threshold
                    True,    # cfg_star_rescale
                    "1",     # sigmas
                    "1",     # cfg_values
                    "0",     # stg_scale_values
                    "1",     # stg_rescale_values
                    "[42]"   # stg_layers_indices
                )[0]

                # Shorter sigma schedule for the refinement pass.
                tiled_sigmas = float_to_sigmas(
                    string_to_float("0.85, 0.7250, 0.6, 0.4219, 0.0")[0]
                )[0]

                upscaled_latents = latent_upsampler.upsample_latent(
                    sampled, upscale_model, vae
                )[0]

                # Normalise the upsampled latents against the base latents.
                adjusted_latents = adain_latent.batch_normalize(
                    upscaled_latents, sampled, 0.25
                )[0]

                del sampled
                del upscale_model
                del upscaled_latents
                torch.cuda.empty_cache()
                gc.collect()

                tiled_noise = random_noise.get_noise(seed)[0]

                loaded_image = image_scaler.upscale(
                    loaded_image,
                    "lanczos",
                    width,
                    height,
                    "disabled"
                )[0]

                print("Generating high-res video...")

                tiled_output, _ = tiled_sampler.sample(
                    model=model,
                    vae=vae,
                    noise=tiled_noise,
                    sampler=selected_sampler,
                    sigmas=tiled_sigmas,
                    guider=tiled_guider,
                    latents=adjusted_latents,
                    optional_cond_images=loaded_image,
                    horizontal_tiles=1,
                    vertical_tiles=1,
                    overlap=1,
                    latents_cond_strength=0.15,
                    boost_latent_similarity=False,
                    crop="disabled",
                    optional_cond_indices="0",
                    images_cond_strengths="0.9"
                )

                del model
                del tiled_guider
                del tiled_noise
                torch.cuda.empty_cache()
                gc.collect()

                upscaled_latents = tiled_output["samples"]

                latent_input = {
                    "samples": upscaled_latents  # Should be shape [1,4,num_frames,H,W]
                }

                print("Decoding tiles...")

                decoded_frames = vae_decode_tiled.decode(
                    vae, latent_input, width, 64, 64, 8
                )[0]

                # Final size is twice the uploaded image's resolution.
                decoded_frames = image_scaler.upscale(
                    decoded_frames,
                    "lanczos",
                    width_int*2,
                    height_int*2,
                    "disabled"
                )[0]

                output_pathU = "/content/upscaled.mp4"
                frames_npu = (decoded_frames.cpu().numpy() * 255).astype(np.uint8)

                del vae
                del decoded_frames
                torch.cuda.empty_cache()
                gc.collect()

                with imageio.get_writer(output_pathU, fps=fps) as writer:
                    for frame in frames_npu:
                        writer.append_data(frame)

                print(f"\nHigh-res Video generation complete! Displaying Video...")
                display_video(output_pathU)

        except Exception as e:
            print(f"Error during video generation: {str(e)}")
            raise
        finally:
            clear_gpu_memory()


def display_video(video_path):
    """Display video in Colab notebook with proper HTML5 player.

    The whole file is read into memory and embedded in the page as a base64
    ``data:`` URL inside a ``<video>`` element.

    Args:
        video_path: Path of the MP4 file to show.
    """
    # Import ``display`` locally as well, so the function does not rely on a
    # module-level import being present.
    from IPython.display import HTML, display
    from base64 import b64encode

    # The context manager closes the file handle once the bytes are read.
    with open(video_path, 'rb') as video_file:
        mp4 = video_file.read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

    display(HTML(f"""
    <video width=512 controls autoplay loop>
        <source src="{data_url}" type="video/mp4">
    </video>
    """))

# Last statement of the setup cell: reached only after the installs, downloads
# and helper definitions above have run.
print("✅ Environment Setup Complete!")

In [None]:
# @title Upload Image

# Path of the uploaded image inside the ComfyUI input directory, or None when
# nothing was uploaded. The "Run Image to Video" cell reads this variable.
file_uploaded = upload_image()
display_upload = False # @param {type:"boolean"}
if file_uploaded is None:
    # upload_image() returns None when the dialog is cancelled; report it here
    # instead of failing on ``None.lower()`` below.
    print("No image uploaded!")
elif display_upload:
    if file_uploaded.lower().endswith(('.png', '.jpg', '.jpeg')):
        display(IPImage(filename=file_uploaded))
    else:
        print("The image format cannot be displayed.")

In [None]:
# @title Run Image to Video
positive_prompt = "The lady sits up and waves to the camera." # @param {"type":"string"}
negative_prompt = "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly" # @param {"type":"string"}
# width = 512 # @param {"type":"number"}
# height = 768 # @param {"type":"number"}
# width = height = 0 makes generate_video choose 768x512, 512x512 or 512x768
# from the uploaded image's aspect ratio.
width = 0
height = 0
# A seed of 0 is replaced by a random seed below.
seed = 0 # @param {"type":"integer"}
# steps and cfg_scale are passed to generate_video, which accepts them but
# does not read them in its active code path.
steps = 20
# steps = 20 # @param {"type":"integer", "min":1, "max":100}
cfg_scale = 2.5
sampler_name="euler_ancestral" # @param ["uni_pc", "uni_pc_bh2", "ddim","euler", "euler_cfg_pp", "euler_ancestral", "euler_ancestral_cfg_pp", "heun", "heunpp2","dpm_2", "dpm_2_ancestral","lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_2s_ancestral_cfg_pp", "dpmpp_sde", "dpmpp_sde_gpu","dpmpp_2m", "dpmpp_2m_cfg_pp", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm","ipndm", "ipndm_v", "deis", "res_multistep", "res_multistep_cfg_pp", "res_multistep_ancestral", "res_multistep_ancestral_cfg_pp","gradient_estimation", "er_sde", "seeds_2", "seeds_3"]
# Set frames to n + 1 for an n-frame video (121 -> 120 frames, 5 s at 24 FPS).
# NOTE(review): the form annotation's "max":120 is below the default of 121 -
# confirm the intended upper bound.
frames = 121 # @param {"type":"integer", "min":1, "max":120}
upscale_video = False # @param {type:"boolean"}

import random
# Draw a random 32-bit seed when the form value is 0, and print it so the run
# can be repeated.
seed = seed if seed != 0 else random.randint(0, 2**32 - 1)
print(f"Using seed: {seed}")

# @title Run Video Generation
# file_uploaded comes from the "Upload Image" cell; fps is not passed, so
# generate_video uses its default of 24.
generate_video(
        image_path=file_uploaded,
        positive_prompt=positive_prompt,
        negative_prompt=negative_prompt,
        width=width,
        height=height,
        seed=seed,
        steps=steps,
        cfg_scale=cfg_scale,
        sampler_name=sampler_name,
        length=frames,
        upscale_video=upscale_video
)
# Extra cleanup after the run; generate_video also calls this in its own
# finally block once sampling has started.
clear_gpu_memory()