### Setup

Before running this notebook, run `conda run -n marching-waifu-x --no-capture-output setup.bat` once.

In [None]:
# Setup torch and other prerequisites
%cd ..
!python -m pip install --upgrade pip
!python -m pip install gdown ipykernel ipywidgets
!python -m pip install torch torchvision torchaudio torchtext torchdata --index-url https://download.pytorch.org/whl/cu118

# Update submodules
!git submodule update --init --recursive

# Create directories
!mkdir data/dataset/nvdiffrec/original
!mkdir data/dataset/nvdiffrec/upscaled
!mkdir data/dataset/nvdiffrec/train

# Install requirements
!python -m pip install -r requirements.txt

# Setup RealESRGAN
%cd ext/Real-ESRGAN
!python -m pip install -r requirements.txt
!python setup.py develop
%cd ../..

# Setup SegmentAnything, GroundingDINO
%cd ext/Grounded-Segment-Anything
!python -m pip install -e segment_anything
!python -m pip install -e GroundingDINO
%cd grounded-sam-osx
!python -m pip install openmim
!mim install mmcv-full
!python -m pip install -r requirements.txt
%cd transformer_utils
!python setup.py install
%cd ../../../..
!python -m pip install timm transformers fairscale pycocoevalcap scipy
!python -m pip install pycocotools onnxruntime onnx


### Imports

In [None]:
# Switch into the scripts directory: later cells invoke inference_*.py by bare
# name and use "../data/..." paths, which are relative to this directory.
# The change is relative to the current directory, so run this cell only once per kernel.
%cd scripts

In [None]:
import sys

sys.path.append("..")

import gc
import glob
import json
import os

import cv2
import PIL.Image
from IPython.display import Video

from src.utils.image_wrapper import *

### ControlVideo

In [None]:
def save_video(fps, images, scale, video_path, fourcc):
    """Scale a sequence of images and encode them into a video file.

    Args:
        fps: Frame rate passed to ``cv2.VideoWriter``.
        images: Iterable of images accepted by ``image_wrapper``.
        scale: Factor passed to ``image_wrapper.scale`` for every frame.
        video_path: Destination path of the encoded video.
        fourcc: Codec code, e.g. from ``cv2.VideoWriter_fourcc``.

    Raises:
        ValueError: If ``images`` is empty (there is no frame to size the video from).
    """
    frames = [image_wrapper(image).scale(scale).to_cv2() for image in images]
    if not frames:
        raise ValueError("save_video() requires at least one image")

    # cv2.VideoWriter expects frameSize as (width, height). Assumes to_cv2()
    # returns a NumPy array shaped (height, width[, channels]); its `.size`
    # attribute is the total element count, not the frame dimensions.
    height, width = frames[0].shape[:2]

    video = cv2.VideoWriter(video_path, fourcc, fps, (width, height))
    try:
        for frame in frames:
            video.write(frame)
    finally:
        # Always finalize the container, even if writing a frame fails.
        video.release()


def display_video(video_path):
    """Render the video file at ``video_path`` inline in the notebook output.

    The video is embedded into the output (``embed=True``) rather than linked.

    Args:
        video_path: Path of the video file to show.
    """
    # Imported locally so the function does not rely on the kernel injecting `display`.
    from IPython.display import display

    # IPython.display.Video is built through its constructor (it has no
    # `from_file` classmethod), and the object has to be displayed explicitly:
    # merely creating it inside a function renders nothing.
    display(Video(filename=video_path, embed=True))

In [None]:
# Settings for inference_controlvideo.py, kept as JSON text so the same text can
# be written to disk and parsed later in this cell.
# - "paths.out_path" is the directory that the Real-ESRGAN settings use as "in_path".
# - "video.length" is 80; "keyframes.frames" lists frames 0, 9, 19, ..., 79.
# - Each entry in "clips" names two neighbouring keyframes in "attn_frames" and
#   the frames strictly between them in "clip_frames", with its own prompt
#   (front / side / back view).
# NOTE(review): how the script interprets these keys is not visible here — confirm against inference_controlvideo.py.
controlvideo_conf = """
{
    "paths": {
        "out_path": "../data/dataset/nvdiffrec/original/",
        "cache_dir": "../data/checkpoints/",
        "ifnet_path": "../data/checkpoints/flownet.pkl",
        "condition_path": "../data/dataset/conditioning/",
        "textual_inversion_path": "../data/embeddings/"
    },
    "repositories": {
        "sd": "rossiyareich/aniflatmixAnimeFlatColorStyle_v20-fp16",
        "vae": "rossiyareich/anything-v4.0-vae"
    },
    "controlnet": {
        "scales": [1.0, 0.85, 0.85],
        "exp": 0.85,
        "pipe": {
            "openpose_full": "lllyasviel/control_v11p_sd15_openpose",
            "depth": "lllyasviel/control_v11f1p_sd15_depth",
            "normals": "lllyasviel/control_v11p_sd15_normalbae"
        }
    },
    "video": {
        "num_inference_steps": 20,
        "guidance_scale": 8.0,
        "smooth_steps": [14, 15],
        "seed": null,
        "same_frame_noise": false,
        "length": 80,
        "keyframes": {
            "frames": [0, 9, 19, 29, 39, 49, 59, 69, 79],
            "prompt": "(masterpiece, best quality, character sheet)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
            "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
        },
        "clips": [
            {
                "attn_frames": [0, 9],
                "clip_frames": [1, 2, 3, 4, 5, 6, 7, 8],
                "prompt": "(masterpiece, best quality, character sheet, front view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [9, 19],
                "clip_frames": [10, 11, 12, 13, 14, 15, 16, 17, 18],
                "prompt": "(masterpiece, best quality, character sheet, front view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [19, 29],
                "clip_frames": [20, 21, 22, 23, 24, 25, 26, 27, 28],
                "prompt": "(masterpiece, best quality, character sheet, side view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [29, 39],
                "clip_frames": [30, 31, 32, 33, 34, 35, 36, 37, 38],
                "prompt": "(masterpiece, best quality, character sheet, side view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [39, 49],
                "clip_frames": [40, 41, 42, 43, 44, 45, 46, 47, 48],
                "prompt": "(masterpiece, best quality, character sheet, back view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [49, 59],
                "clip_frames": [50, 51, 52, 53, 54, 55, 56, 57, 58],
                "prompt": "(masterpiece, best quality, character sheet, back view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [59, 69],
                "clip_frames": [60, 61, 62, 63, 64, 65, 66, 67, 68],
                "prompt": "(masterpiece, best quality, character sheet, side view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [69, 79],
                "clip_frames": [70, 71, 72, 73, 74, 75, 76, 77, 78],
                "prompt": "(masterpiece, best quality, character sheet, side view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            }
        ]
    }
}
"""

# Settings for inference_realesrgan.py, kept as JSON text.
# "in_path" is the ControlVideo "out_path"; results go to the "upscaled" directory,
# which the preview and segmentation cells read from.
# NOTE(review): the meaning of the "upscale" options is defined by the script — confirm there.
realesrgan_conf = """
{
    "paths": {
        "in_path": "../data/dataset/nvdiffrec/original/",
        "out_path": "../data/dataset/nvdiffrec/upscaled/"
    },
    "upscale": {
        "outscale": 4.0,
        "tile": 192,
        "tile_pad": 10,
        "pre_pad": 10,
        "face_enhance": true,
        "fp32": false,
        "gpu_id": 0
    }
}
"""

# Write each settings text to its JSON file in the current working directory.
for settings_name, settings_text in (
    ("inference_controlvideo.json", controlvideo_conf),
    ("inference_realesrgan.json", realesrgan_conf),
):
    with open(settings_name, "w") as f:
        f.write(settings_text)

# From here on the names refer to the parsed dictionaries, not the raw text.
controlvideo_conf = json.loads(controlvideo_conf)
realesrgan_conf = json.loads(realesrgan_conf)

In [None]:
# Run the ControlVideo inference script with the settings file written by the previous cell.
!python inference_controlvideo.py --settings_path "inference_controlvideo.json" 

In [None]:
# Run the Real-ESRGAN inference script with the settings file written earlier in this notebook.
!python inference_realesrgan.py --settings_path "inference_realesrgan.json"

In [None]:
# Open every upscaled PNG in filename order
images = [
    PIL.Image.open(frame_path)
    for frame_path in sorted(
        glob.glob(os.path.join(realesrgan_conf["paths"]["out_path"], "*.png"))
    )
]

# Encode the frames at 10 fps with a scale factor of 0.5
save_video(
    fps=10.0,
    images=images,
    scale=0.5,
    video_path="../ipynb/controlvideo_0.5x.mp4",
    fourcc=cv2.VideoWriter_fourcc(*"MP4V"),
)

# Drop the frame list before embedding the video to free memory
del images
gc.collect()

display_video("../ipynb/controlvideo_0.5x.mp4")

### GroundingDINO + SegmentAnything

In [None]:
# Settings for inference_groundedsam.py, kept as JSON text.
# - "paths.in_path" is the Real-ESRGAN "out_path" (the upscaled frames).
# - "groundedsam.clips" holds eight entries of ten consecutive frames each,
#   covering frames 0-79, all with the same detection prompt and thresholds.
# NOTE(review): "paths" defines no "file_prefix" key; any consumer that expects
# one must handle its absence.
groundedsam_conf = """
{
    "paths": {
        "in_path": "../data/dataset/nvdiffrec/upscaled/",
        "out_path": "../data/dataset/nvdiffrec/train/",
        "cache_dir": "../data/checkpoints/"
    },
    "groundedsam": {
        "device": "cuda",
        "scale": 0.5,
        "length": 80,
        "clips": [
            {
                "clip_frames": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            },
            {
                "clip_frames": [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            },
            {
                "clip_frames": [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            },
            {
                "clip_frames": [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            },
            {
                "clip_frames": [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            },
            {
                "clip_frames": [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            },
            {
                "clip_frames": [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            },
            {
                "clip_frames": [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            }
        ]
    }
}
"""

# Save the settings text to disk, then keep the parsed dictionary under the same name.
with open("inference_groundedsam.json", "w") as settings_file:
    settings_file.write(groundedsam_conf)
groundedsam_conf = json.loads(groundedsam_conf)

In [None]:
# Run the GroundingDINO + SegmentAnything inference script with the settings file written by the previous cell.
# NOTE(review): this call passes --settings_file, while the ControlVideo and
# Real-ESRGAN scripts are invoked with --settings_path — confirm the flag name
# against inference_groundedsam.py.
!python inference_groundedsam.py --settings_file inference_groundedsam.json

In [None]:
# Load the upscaled frames (the input of the segmentation step) in filename order
images = sorted(glob.glob(os.path.join(realesrgan_conf["paths"]["out_path"], "*.png")))
images = [PIL.Image.open(image) for image in images]

# Load masked images
# The settings defined in this notebook contain no "file_prefix" under "paths",
# so indexing it directly raised KeyError. Fall back to an empty prefix, which
# matches every PNG in the output directory.
file_prefix = groundedsam_conf["paths"].get("file_prefix", "")
masked_images = sorted(
    glob.glob(
        os.path.join(
            groundedsam_conf["paths"]["out_path"],
            f"{file_prefix}*.png",
        )
    )
)
masked_images = [PIL.Image.open(masked) for masked in masked_images]

# Fail with a clear message instead of an IndexError halfway through the loop
if len(masked_images) < len(images):
    raise ValueError(
        f"Found {len(masked_images)} masked images for {len(images)} frames in "
        f"{groundedsam_conf['paths']['out_path']}"
    )

# Create image strips: each frame paired with its masked counterpart
image_strips = []
for image, masked_image in zip(images, masked_images):
    image = image_wrapper(image, "pil")
    masked_image = image_wrapper(masked_image, "pil")
    # NOTE(review): the return value is discarded, so this assumes
    # image_wrapper.concatenate modifies `image` in place — confirm.
    image.concatenate(masked_image)
    image_strips.append(image.to_pil())

save_video(
    10.0,
    image_strips,
    0.5,
    "../ipynb/groundedsam_0.5x.mp4",
    cv2.VideoWriter_fourcc(*"MP4V"),
)

# Drop the image lists before embedding the video to free memory
del images
del masked_images
del image_strips
gc.collect()

display_video("../ipynb/groundedsam_0.5x.mp4")