### Run SVD text-to-video generation

In [None]:
import torch

from pipelines.pipeline_stable_video_diffusion_text import StableVideoDiffusionPipeline
from diffusers import UNetSpatioTemporalConditionModel
from diffusers.utils import load_image, export_to_video
from transformers import CLIPTextModel, CLIPTokenizer
from xtend import EmbeddingProjection

# The tokenizer and the text encoder are both loaded from the same CLIP repo.
clip_repo = 'laion/CLIP-ViT-H-14-laion2B-s32B-b79K'
# Checkpoint directory; the UNet weights are read from its "unet" subfolder.
checkpoint_dir = "/18940970966/diffusion_re/NEW_CODE/SVD_Xtend/outputs/checkpoint-11000/"
# Every model component below is loaded in half precision.
half_precision = torch.float16

tokenizer = CLIPTokenizer.from_pretrained(clip_repo)
text_encoder = CLIPTextModel.from_pretrained(clip_repo, torch_dtype=half_precision)
unet = UNetSpatioTemporalConditionModel.from_pretrained(
    checkpoint_dir,
    subfolder="unet",
    torch_dtype=half_precision,
    low_cpu_mem_usage=True,
)

# Build the pipeline from the base SVD repo (local cache only, fp16 variant),
# swapping in the UNet, text encoder and tokenizer loaded above.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    unet=unet,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    torch_dtype=half_precision,
    variant="fp16",
    local_files_only=True,
)
# Kept as the cell's last expression so the notebook still echoes the pipeline.
pipe.to("cuda")

In [None]:
# Construct a new EmbeddingProjection(1024, 1024), move it to CUDA and attach it
# to the UNet as `embedding_projection`.
# NOTE(review): no dtype is passed here while the UNet is loaded as float16 —
# confirm the pipeline handles the projection's dtype as intended.
projection = EmbeddingProjection(1024, 1024)
pipe.unet.embedding_projection = projection.cuda()

In [None]:
from safetensors import safe_open

unet_weights_file = "/18940970966/diffusion_re/NEW_CODE/SVD_Xtend/outputs/checkpoint-11000/unet/diffusion_pytorch_model.safetensors"

# Maps each projection weight name (with the 'embedding_projection.' part
# stripped) to its tensor, read from the UNet safetensors file onto device 0.
tensors = {}
with safe_open(unet_weights_file, framework="pt", device=0) as weights:
    # Only entries whose key mentions 'embedding_projection' are kept; all
    # other UNet weights in the file are skipped.
    tensors.update(
        (name.replace('embedding_projection.', ''), weights.get_tensor(name))
        for name in weights.keys()
        if 'embedding_projection' in name
    )

In [None]:
# Show the collected weight names as the cell output, to check which
# 'embedding_projection' entries were found in the checkpoint file.
tensors.keys()

In [None]:
# Load the collected tensors into the projection module attached to the UNet.
# The call stays the last expression so the notebook displays its return value.
projection_module = pipe.unet.embedding_projection
projection_module.load_state_dict(tensors)

In [None]:
# Conditioning frame and output settings, named once so the resize, the
# pipeline call and the video export stay in agreement.
conditioning_frame_path = '/18940970966/diffusion_re/NEW_CODE/SVD_Xtend/bdd100k/images/track/train/00a0f008-a315437f/00a0f008-a315437f-0000002.jpg'
frame_width, frame_height = 1024, 576
video_fps = 7

# Resize the conditioning image to the same width/height passed to the pipeline.
image = load_image(conditioning_frame_path).resize((frame_width, frame_height))

# Fixed seed; the generator returned by torch.manual_seed is handed to the pipeline.
generator = torch.manual_seed(123)
pipeline_output = pipe(
    image,
    prompt='a car driving on the road',
    negative_prompt='',
    height=frame_height,
    width=frame_width,
    num_frames=16,
    max_guidance_scale=10,
    min_guidance_scale=7,
    decode_chunk_size=8,
    generator=generator,
    motion_bucket_id=0,
    fps=video_fps,
    noise_aug_strength=0.02,
    num_inference_steps=50,
)
# Only the first entry of the returned frames is exported.
frames = pipeline_output.frames[0]

# Write the frames to disk at the same fps that was passed to the pipeline.
export_to_video(frames, "generated.mp4", fps=video_fps)