### Setup

Run `setup.sh` once before executing this notebook.

In [None]:
# Setup torch and other prerequisites
%cd ../
!python -m pip install --upgrade pip
!python -m pip install gdown ipykernel ipywidgets
!python -m pip install torch torchvision torchaudio torchtext torchdata --index-url https://download.pytorch.org/whl/cu118

# Update submodules
!git submodule update --init --recursive

# Create directories
!mkdir data/dataset/nerf/original/
!mkdir data/dataset/nerf/train/

# Install requirements
!python -m pip install -r requirements.txt

# Setup RealESRGAN
%cd ext/Real-ESRGAN/
!python -m pip install -r requirements.txt
!python setup.py develop
%cd ../../

# Setup SegmentAnything, GroundingDINO
%cd ext/Grounded-Segment-Anything/
!python -m pip install -e segment_anything
!python -m pip install -e GroundingDINO
%cd grounded-sam-osx
!python -m pip install openmim
!mim install mmcv-full
!python -m pip install -r requirements.txt
%cd transformer_utils
!python setup.py install
%cd ../../../../
!python -m pip install timm transformers fairscale pycocoevalcap scipy
!python -m pip install pycocotools onnxruntime onnx

# Setup InstantNGP
!apt install libgoogle-glog-dev \
    libgflags-dev \
    libatlas-base-dev \
    libeigen3-dev \
    libsuitesparse-dev \
    libboost-program-options-dev \
    libboost-filesystem-dev \
    libboost-graph-dev \
    libboost-system-dev \
    libboost-test-dev \
    libfreeimage-dev \
    libmetis-dev \
    libglew-dev \
    qtbase5-dev \
    libqt5opengl5-dev \
    libcgal-dev
!pip install commentjson
%cd data/
!wget https://github.com/camenduru/instant-ngp-colab/releases/download/v1.0/ceres-solver-v2.zip
!wget https://github.com/camenduru/instant-ngp-colab/releases/download/v1.0/instant-ngp-v2.zip
!unzip ceres-solver-v2.zip -d ceres-solver
!unzip instant-ngp-v2.zip -d instant-ngp
!rm ceres-solver-v2.zip
!rm instant-ngp-v2.zip
!cp -r ceres-solver/lib/. /usr/local/lib
!chmod 755 ceres-solver/bin/colmap
!cp -r ceres-solver/bin/. /usr/local/bin
%cd ../

### Imports

In [None]:
# Work from scripts/: the inference_*.py entry points are invoked by bare name
# and every path in the settings below is written relative to this directory.
%cd scripts/

In [None]:
import sys

sys.path.append("..")

import gc
import glob
import json
import os

import cv2
import PIL.Image
from IPython.display import Video

from src.utils.image_wrapper import *

### ControlVideo

In [None]:
def save_video(fps, images, scale, video_path, fourcc):
    """Rescale a sequence of images and encode them into a video file.

    Args:
        fps: Frame rate handed to ``cv2.VideoWriter``.
        images: Non-empty sequence of images accepted by ``image_wrapper``.
        scale: Factor forwarded to ``image_wrapper(...).scale``.
        video_path: Destination path of the encoded video.
        fourcc: Codec code, e.g. ``cv2.VideoWriter_fourcc(*"MP4V")``.

    Raises:
        ValueError: If ``images`` is empty, since the frame size is taken
            from the first frame.
    """
    frames = [image_wrapper(image).scale(scale).to_cv2() for image in images]
    if not frames:
        raise ValueError("save_video() needs at least one image")

    # cv2.VideoWriter wants frameSize as a (width, height) tuple. For an array
    # frame, ``.size`` is the total element count, so derive the size from
    # ``.shape``, which is (height, width[, channels]).
    # NOTE(review): assumes to_cv2() returns an OpenCV-style numpy array, as
    # VideoWriter.write requires - confirm against image_wrapper.
    height, width = frames[0].shape[:2]
    video = cv2.VideoWriter(video_path, fourcc, fps, (width, height))
    try:
        for frame in frames:
            video.write(frame)
    finally:
        # Always finalize the container, even if a frame fails to encode.
        video.release()
    # No HighGUI window is opened in this function, so there is nothing to
    # destroy afterwards.


def display_video(video_path):
    """Show the video stored at ``video_path`` inline in the notebook output.

    The file is embedded into the output (``embed=True``), so ``video_path``
    only has to be readable from the kernel's current working directory.

    Args:
        video_path: Path of the video file to embed.
    """
    # Local import keeps this cell self-contained; only ``Video`` is imported
    # at the top of the notebook.
    from IPython.display import display

    # IPython.display.Video takes the path through ``filename=``; it has no
    # ``from_file`` constructor. The object must be passed to display()
    # explicitly because this function does not return it.
    display(Video(filename=video_path, embed=True))

In [None]:
# Settings for inference_controlvideo.py, kept as raw JSON text: later in this
# cell the text is written verbatim to inference_controlvideo.json and then
# parsed with json.loads.
# The video section describes 64 frames: 9 keyframe indices plus 8 clips. Each
# clip's "attn_frames" is a pair of consecutive keyframe indices and its
# "clip_frames" lists the indices strictly between them.
controlvideo_conf = """
{
    "paths": {
        "out_path": "../data/dataset/nerf/original/",
        "cache_dir": "../data/checkpoints/",
        "ifnet_path": "../data/checkpoints/flownet.pkl",
        "condition_path": "../data/dataset/conditioning/",
        "textual_inversion_path": "../data/embeddings/"
    },
    "repositories": {
        "sd": "rossiyareich/aniflatmixAnimeFlatColorStyle_v20-fp16",
        "vae": "rossiyareich/anything-v4.0-vae"
    },
    "controlnet": {
        "scales": [1.0, 0.85, 0.85, 0.70],
        "exp": 0.85,
        "pipe": {
            "openpose_full": "lllyasviel/control_v11p_sd15_openpose",
            "depth": "lllyasviel/control_v11f1p_sd15_depth",
            "normals": "lllyasviel/control_v11p_sd15_normalbae",
            "lineart": "lllyasviel/control_v11p_sd15_lineart"
        }
    },
    "video": {
        "num_inference_steps": 20,
        "guidance_scale": 8.0,
        "smooth_steps": [14, 15],
        "seed": null,
        "same_frame_noise": false,
        "length": 64,
        "keyframes": {
            "frames": [0, 7, 15, 23, 31, 39, 47, 55, 63],
            "prompt": "(masterpiece, best quality)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
            "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
        },
        "clips": [
            {
                "attn_frames": [0, 7],
                "clip_frames": [1, 2, 3, 4, 5, 6],
                "prompt": "(masterpiece, best quality, anime screencap, front view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [7, 15],
                "clip_frames": [8, 9, 10, 11, 12, 13, 14],
                "prompt": "(masterpiece, best quality, anime screencap, side view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [15, 23],
                "clip_frames": [16, 17, 18, 19, 20, 21, 22],
                "prompt": "(masterpiece, best quality, anime screencap, side view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [23, 31],
                "clip_frames": [24, 25, 26, 27, 28, 29, 30],
                "prompt": "(masterpiece, best quality, anime screencap, back view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [31, 39],
                "clip_frames": [32, 33, 34, 35, 36, 37, 38],
                "prompt": "(masterpiece, best quality, anime screencap, top view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [39, 47],
                "clip_frames": [40, 41, 42, 43, 44, 45, 46],
                "prompt": "(masterpiece, best quality, anime screencap, bottom view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [47, 55],
                "clip_frames": [48, 49, 50, 51, 52, 53, 54],
                "prompt": "(masterpiece, best quality, anime screencap, front view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            },
            {
                "attn_frames": [55, 63],
                "clip_frames": [56, 57, 58, 59, 60, 61, 62],
                "prompt": "(masterpiece, best quality, anime screencap, back view)+, 1girl, white hoodie, earmuffs, leggings, white scarf, black gloves, white socks, short blue hair, blue eyes, bangs",
                "negative_prompt": "easynegative, badhandv4, verybadimagenegative_v1.3, (worst quality, low quality, logo, text, watermark, username, nsfw), inaccurate hands and fingers"
            }
        ]
    }
}
"""

# Settings for inference_realesrgan.py, kept as raw JSON text (written to
# inference_realesrgan.json and parsed further down in this cell).
# "in_path" is the folder that controlvideo_conf uses as its "out_path";
# "out_path" is the folder the later cells read frames from.
realesrgan_conf = """
{
    "paths": {
        "in_path": "../data/dataset/nerf/original/",
        "out_path": "../data/dataset/nerf/train/"
    },
    "upscale": {
        "outscale": 4.0,
        "tile": 192,
        "tile_pad": 10,
        "pre_pad": 10,
        "face_enhance": true,
        "fp32": false,
        "gpu_id": 0
    }
}
"""

# Dump each settings text verbatim to the file its inference script reads ...
for settings_file, settings_text in (
    ("inference_controlvideo.json", controlvideo_conf),
    ("inference_realesrgan.json", realesrgan_conf),
):
    with open(settings_file, "w") as handle:
        handle.write(settings_text)

# ... then rebind both names to the parsed dictionaries used by later cells.
controlvideo_conf, realesrgan_conf = (
    json.loads(controlvideo_conf),
    json.loads(realesrgan_conf),
)

In [None]:
# Run the ControlVideo inference script with the settings file written above
!python inference_controlvideo.py --settings_path "inference_controlvideo.json" 

In [None]:
# Run the Real-ESRGAN inference script with the settings file written above
!python inference_realesrgan.py --settings_path "inference_realesrgan.json"

In [None]:
# Gather the upscaled frames in filename order, encode them into a
# half-resolution 8 fps preview clip, then show that clip inline.
# NOTE(review): the GroundingDINO settings further down write their
# dynamic_mask_*.png files into this same folder, so re-running this cell
# after that step would pick those files up as well.
images = glob.glob(os.path.join(realesrgan_conf["paths"]["out_path"], "*.png"))
images.sort()
images = list(map(PIL.Image.open, images))
save_video(
    fps=8.0,
    images=images,
    scale=0.5,
    video_path="../ipynb/controlvideo_0.5x.mp4",
    fourcc=cv2.VideoWriter_fourcc(*"MP4V"),
)

# Release the decoded frames before continuing
del images
gc.collect()

display_video("../ipynb/controlvideo_0.5x.mp4")

### GroundingDINO + SegmentAnything

In [None]:
# Settings for inference_groundedsam.py, kept as raw JSON text (written to
# inference_groundedsam.json and parsed further down in this cell).
# "in_path" and "out_path" point at the same folder; the mask files are told
# apart from the frames by "file_prefix" (the preview cell globs for
# "<file_prefix>*.png"). The 8 clips together list every frame index 0..63
# exactly once, all with the same detection prompt and thresholds.
groundedsam_conf = """
{
    "paths": {
        "in_path": "../data/dataset/nerf/train/",
        "out_path": "../data/dataset/nerf/train/",
        "cache_dir": "../data/checkpoints/",
        "file_prefix": "dynamic_mask_"
    },
    "groundedsam": {
        "device": "cuda",
        "scale": 0.5,
        "length": 64,
        "clips": [
            {
                "clip_frames": [0, 1, 2, 3, 4, 5, 6],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            },
            {
                "clip_frames": [7, 8, 9, 10, 11, 12, 13, 14],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            },
            {
                "clip_frames": [15, 16, 17, 18, 19, 20, 21, 22],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            },
            {
                "clip_frames": [23, 24, 25, 26, 27, 28, 29, 30],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            },
            {
                "clip_frames": [31, 32, 33, 34, 35, 36, 37, 38],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            },
            {
                "clip_frames": [39, 40, 41, 42, 43, 44, 45, 46],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            },
            {
                "clip_frames": [47, 48, 49, 50, 51, 52, 53, 54],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            },
            {
                "clip_frames": [55, 56, 57, 58, 59, 60, 61, 62, 63],
                "det_prompt": "girl",
                "box_threshold": 0.3,
                "text_threshold": 0.25,
                "merge_masks": true
            }
        ]
    }
}
"""

# Persist the raw settings text for the inference script, then rebind the
# name to the parsed dictionary that the preview cell indexes into.
with open("inference_groundedsam.json", "w") as settings_file:
    print(groundedsam_conf, end="", file=settings_file)
groundedsam_conf = json.loads(groundedsam_conf)

In [None]:
# Run the GroundingDINO + SegmentAnything inference script with the settings file written above.
# NOTE(review): this passes --settings_file, whereas the ControlVideo and
# Real-ESRGAN scripts are called with --settings_path - confirm the flag name
# against inference_groundedsam.py.
!python inference_groundedsam.py --settings_file inference_groundedsam.json

In [None]:
# Build a side-by-side preview (frame | mask | masked frame) for every frame,
# encode it as a half-resolution 8 fps clip and show it inline.

# Load images
# The masks are written into the same folder as the frames (in_path and
# out_path are identical in groundedsam_conf), so a plain "*.png" glob would
# also match the mask files. Skip anything carrying the mask prefix.
mask_prefix = groundedsam_conf["paths"]["file_prefix"]
image_paths = sorted(
    path
    for path in glob.glob(
        os.path.join(realesrgan_conf["paths"]["out_path"], "*.png")
    )
    if not os.path.basename(path).startswith(mask_prefix)
)
images = [PIL.Image.open(path) for path in image_paths]

# Load masks
mask_paths = sorted(
    glob.glob(
        os.path.join(
            groundedsam_conf["paths"]["out_path"],
            f"{mask_prefix}*.png",
        )
    )
)
masks = [PIL.Image.open(path) for path in mask_paths]

# Frames and masks are paired purely by sorted position, so a count mismatch
# means the pairing below would be wrong.
if len(images) != len(masks):
    raise ValueError(
        f"Found {len(images)} frames but {len(masks)} masks; "
        "expected exactly one mask per frame"
    )

# Create image strips
image_strips = []
for image, mask in zip(images, masks):
    image = image_wrapper(image, "pil")
    mask = image_wrapper(mask, "pil")
    masked_image = image_wrapper(image.to_np() * mask.to_np(), "np")
    # Keep the value returned by the chained concatenate() calls: this is
    # correct whether concatenate() returns a new wrapper or modifies and
    # returns the receiver.
    strip = image.concatenate(mask).concatenate(masked_image)
    image_strips.append(strip.to_pil())

save_video(
    8.0,
    image_strips,
    0.5,
    "../ipynb/groundedsam_0.5x.mp4",
    cv2.VideoWriter_fourcc(*"MP4V"),
)

# Release the decoded frames, masks and strips before continuing
del images
del masks
del image_strips
gc.collect()

display_video("../ipynb/groundedsam_0.5x.mp4")

### InstantNGP

##### Training

In [None]:
# Switch to the unpacked InstantNGP folder; the run.py calls and the
# "../dataset/..." paths in the following cells are relative to it.
%cd ../data/instant-ngp/

In [None]:
# Where run.py reads the training data from and where it stores the snapshot
training_data, save_snapshot = (
    "../dataset/nerf/",
    "../dataset/nerf/instant-ngp.msgpack",
)
# Values passed as --n_steps and --sharpen
n_steps, sharpen = 5000, 1.0

In [None]:
!python scripts/run.py --training_data {training_data} --mode nerf --save_snapshot {save_snapshot} --n_steps {n_steps} --sharpen {sharpen}

##### Rendering camera path

In [None]:
# Camera path to follow and the file the rendered video is written to
video_camera_path = "../dataset/nerf/base_cam.json"
video_output = "../../ipynb/instant-ngp.mp4"
# Values passed as --video_fps / --video_n_seconds / --video_spp
video_fps = video_n_seconds = 8
video_spp = 16
# Values passed as --width / --height
width = height = 2304

In [None]:
!python scripts/run.py --mode nerf --load_snapshot {save_snapshot} --video_camera_path {video_camera_path} --video_fps {video_fps} --video_n_seconds {video_n_seconds} --video_spp {video_spp} --video_output {video_output} --width {width} --height {height} --sharpen {sharpen}

In [None]:
# Show the rendered camera-path video inline (video_output is relative to data/instant-ngp/)
display_video(video_output)

##### Mesh extraction

In [None]:
# Output file for the extracted mesh and the value passed as --marching_cubes_res
save_mesh, marching_cubes_res = "../dataset/nerf/instant-ngp.obj", 1024

In [None]:
!python scripts/run.py --mode nerf --load_snapshot {save_snapshot} --save_mesh {save_mesh} --marching_cubes_res {marching_cubes_res} --sharpen {sharpen}