In [7]:
import sys
import logging
import numpy as np
import pickle as pkl
import cv2
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt

from random import randint
from torch.utils.data import Dataset, DataLoader
from ADE20K.utils.utils_ade20k import loadAde20K

sys.path.append('/home/jacob/work/ControlNet')

class ADE20KDataset(Dataset):
    def __init__(self, size=(512, 512)):
        self.DATASET_PATH = 'ADE20K/dataset'
        index_file = 'index_ade20k.pkl'
        with open('{}/{}'.format(self.DATASET_PATH, index_file), 'rb') as f:
            self.index_ade20k = pkl.load(f)
        
        self.size = size
        self.default_prompt = "a high-quality, detailed, and professional image"

    def __len__(self):
        return len(self.index_ade20k['filename'])

    def __getitem__(self, idx):
        full_file_name = '{}/{}'.format(self.index_ade20k['folder'][idx], self.index_ade20k['filename'][idx])
        try:
            info = loadAde20K('{}/{}'.format(self.DATASET_PATH, full_file_name))
        except UnicodeDecodeError as e:
            logging.error(f"Error loading image {idx} at {full_file_name}")
            raise e
        target = cv2.imread(info['img_name'])[:,:,::-1]
        source = cv2.imread(info['segm_name'])[:,:,::-1]

        # Do not forget that OpenCV read images in BGR order.
        source = cv2.cvtColor(source, cv2.COLOR_BGR2RGB)
        target = cv2.cvtColor(target, cv2.COLOR_BGR2RGB)

        # Resize
        source = cv2.resize(source, dsize=self.size, interpolation=cv2.INTER_CUBIC)
        target = cv2.resize(target, dsize=self.size, interpolation=cv2.INTER_CUBIC)

        # Normalize source images to [0, 1].
        source = source.astype(np.float32) / 255.0

        # Normalize target images to [-1, 1].
        target = (target.astype(np.float32) / 127.5) - 1.0

        return dict(jpg=target, txt=self.default_prompt, hint=source)

dataset = ADE20KDataset()
dataloader = DataLoader(dataset, num_workers=0, batch_size=1, shuffle=True)

In [4]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from cldm.logger import ImageLogger
from cldm.model import create_model, load_state_dict


# Configs
resume_path = './ControlNet/lightning_logs/version_1/checkpoints/epoch=14-step=103410.ckpt'
batch_size = 2
logger_freq = 300
learning_rate = 1e-5
sd_locked = True
only_mid_control = False


# First use cpu to load models. Pytorch Lightning will automatically move it to GPUs.
model = create_model('./ControlNet/models/cldm_v15.yaml').cpu()
model.load_state_dict(load_state_dict(resume_path, location='cpu'))
model.learning_rate = learning_rate
model.sd_locked = sd_locked
model.only_mid_control = only_mid_control

ControlLDM: Running in eps-prediction mode
DiffusionWrapper has 859.52 M params.
making attention of type 'vanilla' with 512 in_channels
Working with z of shape (1, 4, 32, 32) = 4096 dimensions.
making attention of type 'vanilla' with 512 in_channels


Some weights of the model checkpoint at openai/clip-vit-large-patch14 were not used when initializing CLIPTextModel: ['vision_model.encoder.layers.1.mlp.fc2.weight', 'vision_model.encoder.layers.22.self_attn.out_proj.bias', 'vision_model.encoder.layers.22.layer_norm1.bias', 'vision_model.encoder.layers.9.layer_norm2.bias', 'vision_model.encoder.layers.8.layer_norm1.weight', 'vision_model.encoder.layers.1.self_attn.out_proj.weight', 'vision_model.encoder.layers.6.self_attn.k_proj.bias', 'vision_model.encoder.layers.23.self_attn.q_proj.bias', 'vision_model.encoder.layers.2.mlp.fc2.bias', 'vision_model.encoder.layers.20.layer_norm1.bias', 'vision_model.encoder.layers.5.self_attn.out_proj.weight', 'vision_model.encoder.layers.9.mlp.fc1.weight', 'vision_model.encoder.layers.4.mlp.fc2.bias', 'vision_model.encoder.layers.17.layer_norm1.bias', 'vision_model.encoder.layers.14.mlp.fc1.weight', 'vision_model.encoder.layers.10.self_attn.k_proj.weight', 'vision_model.encoder.layers.9.self_attn.q_pr

Loaded model config from [./ControlNet/models/cldm_v15.yaml]
Loaded state_dict from [./ControlNet/lightning_logs/version_1/checkpoints/epoch=14-step=103410.ckpt]


In [11]:
for i, batch in enumerate(dataloader):
    if i == 10:
        break
    source = batch['hint'] * 255
    source = source.int()
    source = source[..., ::-1]
    target = (batch['jpg'] + 1) * 127.5
    target = target.int()
    target = target[..., ::-1]
    output = model.get_input(batch, 0)
    plt.figure(figsize=(10,5))
    plt.imshow(np.concatenate([source, target, output], 1))
    plt.axis('off')

ValueError: step must be greater than zero

In [18]:
model.get_input(batch, 0)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [23]:
model = model.to('cpu')
model.get_input(batch, 0)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)