# ControlVideo

### Installation

Only for Google Colab

In [None]:
# Clone the project together with its git submodules, then make the fresh
# checkout the working directory for the cells that follow.
!git clone --recurse-submodules https://github.com/rossiyareich/marching-waifu-x.git
%cd marching-waifu-x

Only for local install (run `setup-n.sh` or `setup.bat` and `huggingface-cli login` beforehand!)

In [None]:
# Step up one directory level. NOTE(review): presumably this moves from the
# notebook's own folder to the repository root that the setup cell expects —
# confirm against your local layout.
%cd ../

Project setup

In [None]:
!nvidia-smi

# Install requirements
!python -m pip install -r requirements.txt
!python -m pip install -r ext/Real-ESRGAN/requirements.txt

# Create directories
!mkdir data/dataset/nerf/original/
!mkdir data/dataset/nerf/train/

# Install local packages
%cd ext/Real-ESRGAN/
!python setup.py develop
%cd ../../

In [None]:
# Work from scripts/: the inference scripts are invoked by bare file name and
# the "../data/..." paths in the configuration are written relative to it.
%cd scripts

In [None]:
import sys

# Make the parent directory importable so that the `src` package (imported
# further down as `src.utils.image_wrapper`) resolves while the working
# directory is scripts/. Presumably ".." is the repository root here.
sys.path.append("..")

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required to download the model repositories named
# in the configuration below; the local-install note asks for
# `huggingface-cli login` beforehand instead — confirm whether both are needed.
notebook_login()

### Configuration

In [None]:
import json

# Settings for inference_controlvideo.py, handed over through the
# "inference_controlvideo.json" file written at the end of this cell.
# All paths are relative to scripts/, the working directory at this point.
#
# Layout of the "video" section as it stands:
#   * "keyframes.frames" lists the keyframe indices (0, 7, 15, ..., 63).
#   * Each entry of "clips" names two consecutive keyframes in "attn_frames"
#     and the frame indices strictly between them in "clip_frames".
#   * Keyframes plus clip frames together cover indices 0-63, matching
#     "length": 64. When changing "length" or the keyframes, update the clips
#     to match.
controlvideo_conf = """
{
    "paths": {
        "out_path": "../data/dataset/nerf/original/",
        "cache_dir": "../data/checkpoints/",
        "ifnet_path": "../data/checkpoints/flownet.pkl",
        "condition_path": "../data/dataset/conditioning/",
        "textual_inversion_path": "../data/embeddings/"
    },
    "repositories": {
        "sd": "rossiyareich/Nabylon-v1.0-fp16",
        "vae": "stabilityai/sd-vae-ft-mse"
    },
    "controlnet": {
        "scales": [0.9, 0.8, 0.8, 0.7],
        "exp": 0.825,
        "pipe": {
            "openpose_full": "lllyasviel/control_v11p_sd15_openpose",
            "depth": "lllyasviel/control_v11f1p_sd15_depth",
            "normals": "lllyasviel/control_v11p_sd15_normalbae",
            "lineart": "lllyasviel/control_v11p_sd15_lineart"
        }
    },
    "video": {
        "num_inference_steps": 20,
        "guidance_scale": 10.0,
        "smooth_steps": [14, 15],
        "seed": null,
        "same_frame_noise": false,
        "length": 64,
        "keyframes": {
            "frames": [0, 7, 15, 23, 31, 39, 47, 55, 63],
            "prompt": "(masterpiece, best quality)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
            "negative_prompt": "EasyNegative, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
        },
        "clips": [
            {
                "attn_frames": [0, 7],
                "clip_frames": [1, 2, 3, 4, 5, 6],
                "prompt": "(masterpiece, best quality)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "EasyNegative, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [7, 15],
                "clip_frames": [8, 9, 10, 11, 12, 13, 14],
                "prompt": "(masterpiece, best quality)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "EasyNegative, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [15, 23],
                "clip_frames": [16, 17, 18, 19, 20, 21, 22],
                "prompt": "(masterpiece, best quality)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "EasyNegative, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [23, 31],
                "clip_frames": [24, 25, 26, 27, 28, 29, 30],
                "prompt": "(masterpiece, best quality)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "EasyNegative, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [31, 39],
                "clip_frames": [32, 33, 34, 35, 36, 37, 38],
                "prompt": "(masterpiece, best quality)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "EasyNegative, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [39, 47],
                "clip_frames": [40, 41, 42, 43, 44, 45, 46],
                "prompt": "(masterpiece, best quality)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "EasyNegative, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [47, 55],
                "clip_frames": [48, 49, 50, 51, 52, 53, 54],
                "prompt": "(masterpiece, best quality)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "EasyNegative, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [55, 63],
                "clip_frames": [56, 57, 58, 59, 60, 61, 62],
                "prompt": "(masterpiece, best quality)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "EasyNegative, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            }
        ]
    }
}
"""

# Settings for inference_realesrgan.py, handed over through the
# "inference_realesrgan.json" file written below. "in_path" is the same
# directory as the ControlVideo "out_path" above, so the upscaling step
# presumably picks up the frames produced by the previous step.
realesrgan_conf = """
{
    "paths": {
        "in_path": "../data/dataset/nerf/original/",
        "out_path": "../data/dataset/nerf/train/"
    },
    "upscale": {
        "outscale": 4.0,
        "tile": 192,
        "tile_pad": 10,
        "pre_pad": 10,
        "face_enhance": true,
        "fp32": false,
        "gpu_id": 0
    }
}
"""

# Parse both documents BEFORE touching the disk: a typo introduced while
# editing the literals above then raises json.JSONDecodeError here, instead of
# first overwriting the settings files with text the scripts cannot read.
parsed_controlvideo_conf = json.loads(controlvideo_conf)
parsed_realesrgan_conf = json.loads(realesrgan_conf)

# Write the original text (not a re-serialisation) so the files on disk look
# exactly like the literals above.
with open("inference_controlvideo.json", "w") as f:
    f.write(controlvideo_conf)
with open("inference_realesrgan.json", "w") as f:
    f.write(realesrgan_conf)

# From here on the names refer to the parsed dicts; later cells index into
# them (e.g. realesrgan_conf["paths"]["out_path"]).
controlvideo_conf = parsed_controlvideo_conf
realesrgan_conf = parsed_realesrgan_conf

### Inference

In [None]:
# Run the ControlVideo step with the settings file written by the
# configuration cell (the working directory is scripts/).
!python inference_controlvideo.py --settings_path "inference_controlvideo.json" 

In [None]:
# Run the Real-ESRGAN step with the settings file written by the
# configuration cell; its "in_path" is the ControlVideo "out_path".
!python inference_realesrgan.py --settings_path "inference_realesrgan.json"

In [None]:
import os
import glob

import cv2
import PIL.Image

from src.utils.image_wrapper import *


# Preview settings: playback rate, down-scaling factor applied to every
# upscaled frame, destination file and container codec tag.
fps = 8.0
scale = 0.5
out_path = "../ipynb/controlvideo.mp4"
fourcc = cv2.VideoWriter_fourcc(*"MP4V")

# Load and scale images
# sorted() orders the frames by file name, which is the playback order used.
image_paths = sorted(
    glob.glob(os.path.join(realesrgan_conf["paths"]["out_path"], "*.png"))
)
if not image_paths:
    # Fail with an actionable message instead of an IndexError further down.
    raise FileNotFoundError(
        f"No PNG frames found in {realesrgan_conf['paths']['out_path']!r}; "
        "run the inference cells first."
    )
images = [
    image_wrapper(PIL.Image.open(image), "pil").scale(scale).to_cv2()
    for image in image_paths
]

# Convert images to video
# cv2.VideoWriter expects frameSize as (width, height). The frames are passed
# to video.write(), which takes NumPy arrays, and ndarray.size is the total
# element count rather than a (width, height) pair, so the size is taken from
# shape. Assumes to_cv2() returns a height x width (x channels) array —
# confirm against image_wrapper.
frame_height, frame_width = images[0].shape[:2]

# The writer does not create missing directories; it would just fail to open.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

video = cv2.VideoWriter(out_path, fourcc, fps, (frame_width, frame_height))
try:
    if not video.isOpened():
        raise RuntimeError(f"Could not open video writer for {out_path!r}")
    for image in images:
        video.write(image)
finally:
    # Always release so the container is finalised and the handle is closed.
    video.release()
cv2.destroyAllWindows()

# Grounding DINO + Segment Anything