In [3]:
import os
from os import path as osp
from typing import Any, Dict
import PIL
import requests
import torch
from io import BytesIO

import cv2
from PIL import Image
import numpy as np
import open_clip
import torch
import torch.cuda.amp as amp
from einops import rearrange

from autoencoder import AutoencoderKL
from diffusion import (
    GaussianDiffusion, beta_schedule)
from unet_sd import UNetSD



import tempfile
from typing import Any, Dict, Optional

import cv2
import torch
from einops import rearrange
import numpy as np
import json
def config_to_dict(config):
    cfg = config['model']['model_cfg']
    model_args = config['model']['model_args']
    cfg['temporal_attention'] = True if cfg['temporal_attention'] == 'True' else False
    return cfg, model_args


__all__ = ['TextToVideoSynthesis']

#set torch seed 
torch.manual_seed(0)
class TextToVideoSynthesis():

    def __init__(self, model_dir, *args, **kwargs):

        self.device = torch.device('cuda') if torch.cuda.is_available() \
            else torch.device('cpu')
        with open(os.path.join(model_dir, 'configuration.json'), 'r') as f:
            config = json.load(f)
        cfg, self.model_args = config_to_dict(config)
        cfg['temporal_attention'] = True if cfg[
            'temporal_attention'] == 'True' else False

        # Initialize unet
        self.sd_model = UNetSD(
            in_dim=cfg['unet_in_dim'],
            dim=cfg['unet_dim'],
            y_dim=cfg['unet_y_dim'],
            context_dim=cfg['unet_context_dim'],
            out_dim=cfg['unet_out_dim'],
            dim_mult=cfg['unet_dim_mult'],
            num_heads=cfg['unet_num_heads'],
            head_dim=cfg['unet_head_dim'],
            num_res_blocks=cfg['unet_res_blocks'],
            attn_scales=cfg['unet_attn_scales'],
            dropout=cfg['unet_dropout'],
            temporal_attention=cfg['temporal_attention'])

        with amp.autocast(enabled=True):
            self.sd_model.load_state_dict(torch.load(
            osp.join(model_dir, self.model_args['ckpt_unet'])))

        self.sd_model.eval()
        self.sd_model.to(self.device)
        self.sd_model.half()

        # Initialize diffusion
        betas = beta_schedule(
            'linear_sd',
            cfg['num_timesteps'],
            init_beta=0.00085,
            last_beta=0.0120)
        self.diffusion = GaussianDiffusion(
            betas=betas,
            mean_type=cfg['mean_type'],
            var_type=cfg['var_type'],
            loss_type=cfg['loss_type'],
            rescale_timesteps=False)

        # Initialize autoencoder
        ddconfig = {
            'double_z': True,
            'z_channels': 4,
            'resolution': 256,
            'in_channels': 3,
            'out_ch': 3,
            'ch': 128,
            'ch_mult': [1, 2, 4, 4],
            'num_res_blocks': 2,
            'attn_resolutions': [],
            'dropout': 0.0
        }
        self.autoencoder = AutoencoderKL(
            ddconfig, 4,
            osp.join(model_dir, self.model_args['ckpt_autoencoder']))
        if self.model_args['tiny_gpu'] == 1:
            self.autoencoder.to('cpu')
        else:
            self.autoencoder.to(self.device)
        self.autoencoder.eval()
        self.autoencoder.half()

        # Initialize Open clip
        self.clip_encoder = FrozenOpenCLIPEmbedder(
            version=osp.join(model_dir,
                             self.model_args['ckpt_clip']),
            layer='penultimate')
        if self.model_args['tiny_gpu'] == 1:
            self.clip_encoder.to('cpu')
        else:
            self.clip_encoder.to(self.device)
        self.clip_encoder.eval()
        self.clip_encoder.half()

    def forward(self, input: Dict[str, Any]):
        r"""
        The entry function of text to image synthesis task.
        1. Using diffusion model to generate the video's latent representation.
        2. Using vqgan model (autoencoder) to decode the video's latent representation to visual space.

        Args:
            input (`Dict[Str, Any]`):
                The input of the task
        Returns:
            A generated video (as pytorch tensor).
        """
        y = input['text_emb']
        zero_y = input['text_emb_zero']
        context = torch.cat([zero_y, y], dim=0).to(self.device)
        # synthesis
        with torch.no_grad():
            num_sample = 1  # here let b = 1
            max_frames = 48
            latent_h, latent_w = 32, 64
            with amp.autocast(enabled=True):
                if input['latent_input'] is None:
                    init_latent = torch.randn(num_sample, 4, max_frames, latent_h,
                                        latent_w).to(self.device)# 1 was max_frames
                    gs=50
                    steps = 50
                else:   
                    init_latent = input['latent_input']
                    gs = input['guidance']
                    steps = input['steps']
                    
                x0 = self.diffusion.ddim_sample_loop(
                    noise=init_latent,  # shape: b c f h w
                    model=self.sd_model,
                    model_kwargs=[{
                        'y':
                        context[1].unsqueeze(0).repeat(num_sample, 1, 1)
                    }, {
                        'y':
                        context[0].unsqueeze(0).repeat(num_sample, 1, 1)
                    }],
                    guide_scale=gs,
                    ddim_timesteps=steps,
                    eta=0.0)

                scale_factor = 0.18215
                video_data = 1. / scale_factor * x0
                bs_vd = video_data.shape[0]
                video_data = rearrange(video_data, 'b c f h w -> (b f) c h w')
                self.autoencoder.to(self.device)

                video = torch.zeros((video_data.shape[0], 3, latent_h*8, latent_w*8))
                for i in range(0, video_data.shape[0]):
                    video[i] = self.autoencoder.decode(video_data[i].unsqueeze(0)).detach().cpu().squeeze()
                if self.model_args['tiny_gpu'] == 1:
                    self.autoencoder.to('cpu')
                video_data = rearrange(
                    video, '(b f) c h w -> b c f h w', b=bs_vd)
        return video_data.type(torch.float32), x0


class FrozenOpenCLIPEmbedder(torch.nn.Module):
    """
    Uses the OpenCLIP transformer encoder for text
    """
    LAYERS = ['last', 'penultimate']

    def __init__(self,
                 arch='ViT-H-14',
                 version='open_clip_pytorch_model.bin',
                 device='cuda',
                 max_length=77,
                 freeze=True,
                 layer='last'):
        super().__init__()
        assert layer in self.LAYERS
        model, _, _ = open_clip.create_model_and_transforms(
            arch, device=torch.device('cpu'), pretrained=version)
        del model.visual
        self.model = model

        self.device = device
        self.max_length = max_length
        if freeze:
            self.freeze()
        self.layer = layer
        if self.layer == 'last':
            self.layer_idx = 0
        elif self.layer == 'penultimate':
            self.layer_idx = 1
        else:
            raise NotImplementedError()

    def freeze(self):
        self.model = self.model.eval()
        for param in self.parameters():
            param.requires_grad = False

    def forward(self, text):
        tokens = open_clip.tokenize(text)
        z = self.encode_with_transformer(tokens.to(self.device))
        return z

    def encode_with_transformer(self, text):
        x = self.model.token_embedding(text)  # [batch_size, n_ctx, d_model]
        x = x + self.model.positional_embedding
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.model.ln_final(x)
        return x

    def text_transformer_forward(self, x: torch.Tensor, attn_mask=None):
        for i, r in enumerate(self.model.transformer.resblocks):
            if i == len(self.model.transformer.resblocks) - self.layer_idx:
                break
            x = r(x, attn_mask=attn_mask)
        return x

    def encode(self, text):
        return self(text)

class TextToVideoSynthesisPipeline():

    def __init__(self, model: str, **kwargs):
        self.model = TextToVideoSynthesis(model, **kwargs)

    def preprocess(self, input, **preprocess_params) -> Dict[str, Any]:
        self.model.clip_encoder.to(self.model.device)
        text_emb = self.model.clip_encoder(input['text'])
        text_emb_zero = self.model.clip_encoder('')
        if self.model.config.model.model_args.tiny_gpu == 1:
            self.model.clip_encoder.to('cpu')
        return {'text_emb': text_emb, 'text_emb_zero': text_emb_zero}

    def forward(self, input: Dict[str, Any],
                **forward_params) -> Dict[str, Any]:
        video, latent = self.model.forward(input)
        return {'video': video}, latent

    def postprocess(self, inputs: Dict[str, Any],
                    **post_params) -> Dict[str, Any]:
        video = tensor2vid(inputs['video'])
        output_video_path = post_params.get('output_video', None)
        if output_video_path is None:
            output_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        h, w, c = video[0].shape
        video_writer = cv2.VideoWriter(
            output_video_path, fourcc, fps=8, frameSize=(w, h))
        for i in range(len(video)):
            img = cv2.cvtColor(video[i], cv2.COLOR_RGB2BGR)
            video_writer.write(img)
        return output_video_path

def tensor2vid(video, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]):
    mean = torch.tensor(
        mean, device=video.device).reshape(1, -1, 1, 1, 1)  # ncfhw
    std = torch.tensor(
        std, device=video.device).reshape(1, -1, 1, 1, 1)  # ncfhw
    video = video.mul_(std).add_(mean)  # unnormalize back to [0,1]
    video.clamp_(0, 1)
    images = rearrange(video, 'i c f h w -> f h (i w) c')
    images = images.unbind(dim=0)
    images = [(image.numpy() * 255).astype('uint8')
              for image in images]  # f h w c
    return images


pipeline = TextToVideoSynthesisPipeline('text-to-video-synthesis')


KeyError: 'ckpt_unet'

In [2]:
inp = pipeline.preprocess({'text': 'A cow running a marathon'})
inp['latent_input'] = None
vid, latent_inp = pipeline.forward(inp)
out = pipeline.postprocess(vid, output_video='test.mp4')

In [6]:
inp = {}
inp['text_emb'], inp['text_emb_zero'] = pipeline.preprocess({'text': 'A cow running a marathon in hell'}).values()
inp['latent_input'] = latent_inp*0.1 + 0.9*torch.randn(1, 4, 48, 32,64).to('cuda')#ok
inp['guidance'] = 50
inp['steps'] = 50
vid, _ = pipeline.forward(inp)
out = pipeline.postprocess(vid, output_video='dog.mp4')

In [22]:
import PIL
import requests
import torch
#open  vtv/chicken.mp4

import cv2
import numpy as np
from PIL import Image
frames = []
cap = cv2.VideoCapture('vtv/chicken.mp4')
#open at 8 fps
cap.set(cv2.CAP_PROP_FPS, 8)
while(cap.isOpened()):
    ret, frame = cap.read()
    #convert to rgb    

    if ret == True:
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(Image.fromarray(frame).crop((0, 0, 512, 256)))
    else:
        break

f = frames[::3][:48]
import cv2
from PIL import Image
import numpy as np
encoded_frames = []

for frame in f:
    frame = torch.tensor(np.array(f[0])).unsqueeze(0).transpose(1,3).transpose(2,3)
    o = model.autoencoder.to('cuda').encode(frame.cuda().half())
    o = o.sample().unsqueeze(2).cpu().detach()
    #normalize o based on mean and std of the latent space
    o = (o - o.mean()) / o.std()#the latents are good once normalized
    encoded_frames.append(o)

latent_inp = torch.cat(encoded_frames, dim=2).cuda().half()

model.autoencoder.to('cpu')
del o
torch.cuda.empty_cache()


In [24]:
inp = {}
inp['text_emb'], inp['text_emb_zero'] = pipeline.preprocess({'text': 'A person eating a chicken'}).values()
inp['latent_input'] = latent_inp*0.01 + 0.99*torch.randn(1, 4, 48, 32,64).to('cuda')#ok
inp['guidance'] = 10
inp['steps'] = 50
vid, _ = pipeline.forward(inp)
out = pipeline.postprocess(vid, output_video='dog.mp4')

In [8]:
inp = pipeline.preprocess({'text': 'A cow running a marathon'})
inp['latent_input'] = None
vid, latent_inp = pipeline.forward(inp)
out = pipeline.postprocess(vid, output_video='test.mp4')

In [32]:
inp = pipeline.preprocess({'text': 'A cow running a marathon'})

In [35]:
mask = np.array(Image.open("mask.PNG").resize((64, 32)))
init_latent = torch.randn(1, 4, 48, 32,64)
init_latent[:, :, :, mask > 0] *= -1

In [2]:
import PIL
import requests
import torch
from io import BytesIO

import cv2
from PIL import Image
import numpy as np
from diffusers import StableDiffusionInpaintPipeline

vid = cv2.VideoCapture('test.mp4')
pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-inpainting", torch_dtype=torch.float16
)
pipe.safety_checker = None
pipe = pipe.to("cuda")
ret, frame = vid.read()
i = 0
while ret:
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame = Image.fromarray(frame)

    mask = np.array(Image.open("mask_hard.PNG"))
    mask = Image.fromarray(mask//2)


    init_image = frame.resize((512, 512))
    mask_image = mask.resize((512, 512))

    prompt = 'A cow running a marathon'
    image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
    image = image.resize((512, 256))
    #save image to file
    image.save(f'cowupscale/{i}.png')

    ret, frame = vid.read()
    i += 1

Fetching 13 files: 100%|██████████| 13/13 [00:00<00:00, 151882.87it/s]
100%|██████████| 50/50 [00:05<00:00,  9.70it/s]
100%|██████████| 50/50 [00:05<00:00,  9.68it/s]
100%|██████████| 50/50 [00:05<00:00,  9.65it/s]
100%|██████████| 50/50 [00:05<00:00,  9.62it/s]
 20%|██        | 10/50 [00:01<00:05,  7.88it/s]


KeyboardInterrupt: 

In [1]:
# mask = np.array(Image.open("mask.PNG").resize((64, 32)))
# init_latent[:, :, :, mask > 0] *= 0.5

NameError: name 'model' is not defined

In [None]:
import PIL
import requests
import torch
from io import BytesIO

import cv2
from PIL import Image
import numpy as np
mask = torch.tensor(np.array(Image.open("mask.PNG")))
m = mask.unsqueeze(0).transpose(1,3).transpose(2,3)
aut = model.autoencoder.to('cuda')
o = aut.encode(m.cuda().half())
o.sample()

In [6]:
import PIL
import requests
import torch
from io import BytesIO

from diffusers import StableDiffusionInpaintPipeline


def download_image(url):
     response = requests.get(url)
     return PIL.Image.open(BytesIO(response.content)).convert("RGB")


img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"

init_image = download_image(img_url).resize((512, 256))
mask_image = download_image(mask_url).resize((512, 256))

pipe = StableDiffusionInpaintPipeline.from_pretrained(
     "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
 )
pipe = pipe.to("cuda")

prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0]

vae/diffusion_pytorch_model.safetensors not found
Fetching 16 files: 100%|██████████| 16/16 [00:00<00:00, 233016.89it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

torch.Size([2, 4, 64, 64])





RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 64 but got size 32 for tensor number 2 in the list.

In [1]:
from diffusers import StableDiffusionInpaintPipeline
import cv2
from PIL import Image
import numpy as np
pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting",
)


vid = cv2.VideoCapture('test.mp4')
for i in range(15):
    ret, frame = vid.read()
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
fra
#open mask as rgb
mask = np.array(Image.open("mask_256_512.PNG"))


prompt = "A cow running a marathon"
#image and mask_image should be PIL images.
#The mask structure is white for inpainting and black for keeping as is
image = pipe(prompt=prompt, image=frame, mask_image=mask).images[0]
image.save("./out.png")

  from .autonotebook import tqdm as notebook_tqdm
2023-03-27 19:16:53.371368: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/nepyope/.local/lib/python3.10/site-packages/nvidia/cuda_runtime/lib:/home/nepyope/.local/lib/python3.10/site-packages/nvidia/cuda_runtime/lib
2023-03-27 19:16:53.371436: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/nepyope/.local/lib/python3.10/site-packages/nvidia/cuda_runtime/lib:/home/nepyope/.local/lib/python3.10/site-packages/nvidia/cuda_runtime/lib
unet/diffusion_pytorch_model.safetensors not found
Fetching 16 files: 100%|██████████| 16/16 [00:00<00:00, 159403.48it/s]
  

torch.Size([2, 4, 64, 64])





RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 64 but got size 32 for tensor number 2 in the list.

In [17]:
np.zeros_like(frame)[:,:,0].shape

(256, 512)

In [5]:
#get the first image from test.mp4
import cv2

vid = cv2.VideoCapture('test.mp4')
for i in range(15):
    ret, frame = vid.read()
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frame = Image.fromarray(frame)
frame.show()

In [11]:
#open mask.PNG
mask = Image.open('mask.PNG')
#only keep color blue
mask = np.array(mask)
mask = mask[:,:,2]
mask = Image.fromarray(mask)
#only keep 255 white
mask = np.array(mask)
mask[mask<255] = 0
mask = Image.fromarray(mask)
mask.show()

In [9]:
np.array(mask)

array([[[146, 155,  99, 255],
        [146, 155,  99, 255],
        [146, 155,  99, 255],
        ...,
        [143, 137, 112, 255],
        [143, 137, 112, 255],
        [143, 137, 112, 255]],

       [[147, 156, 100, 255],
        [147, 156, 100, 255],
        [147, 156, 100, 255],
        ...,
        [155, 149, 124, 255],
        [155, 149, 124, 255],
        [155, 149, 124, 255]],

       [[147, 153, 101, 255],
        [147, 153, 101, 255],
        [147, 153, 101, 255],
        ...,
        [156, 151, 123, 255],
        [156, 151, 123, 255],
        [156, 151, 123, 255]],

       ...,

       [[184, 165, 183, 255],
        [184, 165, 183, 255],
        [184, 165, 183, 255],
        ...,
        [187, 155, 180, 255],
        [186, 154, 181, 255],
        [184, 152, 179, 255]],

       [[186, 167, 185, 255],
        [186, 167, 185, 255],
        [186, 167, 185, 255],
        ...,
        [198, 166, 191, 255],
        [197, 165, 190, 255],
        [194, 162, 187, 255]],

       [[198

In [None]:
inp = pipeline.preprocess({'text': 'A dragon breathing fire'})
inp['latent_input'] = None
vid = pipeline.forward(inp)
out = pipeline.postprocess(vid, output_video='test.mp4')

In [2]:
inp = pipeline.preprocess({'text': 'A dog jumping up and down'})
inp['latent_input'] = None
vid = pipeline.forward(inp)
out = pipeline.postprocess(vid, output_video='test.mp4')

In [4]:
inp['latent_input'] = latent_inp*0.2 + 0.8*torch.randn(1, 4, 16, 32,32).to('cuda')#ok
inp['guidance'] = 10
inp['steps'] = 50
vid, _ = pipeline.forward(inp)
out = pipeline.postprocess(vid, output_video='dog.mp4')

In [10]:
noise = torch.randn(1, 4, 16, 32,32).to('cuda')
c = torch.cat((noise[:,:,:8,:,:]*0.7+latent_inp[:,:,8:,:,:]*0.3, noise[:,:,8:,:,:]), dim=2)#70% of the first half of the latent code is from the previous frame, 30% is from the noise

In [17]:
noise = torch.randn(1, 4, 16, 32,32).to('cuda')
for i in range(10):
    noise[:, :, i, :, :] = latent_inp[:, :, i, :, :]*(1/(i+1))+noise[:, :, i, :, :]*(1-1/(i+1))#gradually add noise to the first half of the latent code

c = noise

In [19]:
inp['text_embed'], inp['text_input'] = pipeline.preprocess({'text': 'A brown dog eating a log in a garden'})
inp['latent_input'] = c
inp['guidance'] = 5
inp['steps'] = 50
vid, _ = pipeline.forward(inp)
out = pipeline.postprocess(vid, output_video='dog_continue.mp4')