In [1]:
from pathlib import Path
from cub_dataset import CUBDataset
from image_transform import get_image_transform, get_inv_image_transform
import torch
# Local directories holding the CUB images, caption text files and embedding tensors.
image_root = Path("C:\\Users\\Matej\\Documents\\CUB\\images")
captions_root = Path("C:\\Users\\Matej\\Documents\\CUB\\captions")
embeddings_root = Path("C:\\Users\\Matej\\Documents\\CUB\\embeddings")

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Weights applied to the L1 term and to the MSE (activation-matching) term of the generator loss.
l1_coef, l2_coef = 50, 100

# NOTE(review): the variable is named `test_set`, yet subset="train" is requested
# (the loader reports "Using 9430 samples for train subset") — confirm this is intended.
test_set = CUBDataset(
    image_root=image_root,
    embeddings_root=embeddings_root,
    image_transform=get_image_transform(64),
    split=(0.8, 0.0, 0.2),
    subset="train",
    seed=1234,
    device=device,
)

  from .autonotebook import tqdm as notebook_tqdm
Loading train set: 200it [00:00, 2113.06it/s]

Using 9430 samples for train subset





In [2]:
from torch import nn
from DCGAN import Generator, Discriminator

# Instantiate both networks with their default constructor arguments.
generator = Generator()
discriminator = Discriminator()

# Checkpoint file that stores the trained weights under the 'generator' and 'discriminator' keys.
best_model_path = "C:\\Users\\Matej\\Documents\\seminar2\\trained_models\\train1-for-testing\\saved_models\\ep200.tar"
checkpoint = torch.load(best_model_path, map_location=device)

# Generator: restore the weights, switch to evaluation mode, move to the target device.
generator.load_state_dict(checkpoint['generator'])
generator.eval()
generator.to(device)

# Discriminator: same three steps.
discriminator.load_state_dict(checkpoint['discriminator'])
discriminator.eval()
discriminator.to(device)

# Loss modules used by get_loss: adversarial BCE, activation-matching MSE and pixel-wise L1.
criterion = nn.BCELoss()
criterion.to(device)
l2_loss = nn.MSELoss()
l1_loss = nn.L1Loss()

def get_loss(images, embeddings):
    """Compute the generator loss for a batch and return it with the generated images.

    The loss is the sum of three terms:
      * BCE between the discriminator output on the generated images and an
        all-ones target,
      * ``l1_coef`` times the L1 distance between generated and real images,
      * ``l2_coef`` times the MSE between the discriminator activations of the
        generated and the real images, each averaged over dimension 0.

    Args:
        images: Real images; must be shape-compatible with the generator
            output because they are compared element-wise in the L1 term.
        embeddings: Conditioning embeddings passed to both networks.
            Assumes dimension 0 is the batch dimension — TODO confirm.

    Returns:
        Tuple ``(g_loss, fake_images)``: the summed loss tensor and the images
        produced by the generator.
    """
    # Draw one 100-dimensional noise vector per sample. The batch size used to
    # be hard-coded to 1, which only worked for single-sample input.
    batch_size = embeddings.size(0)
    noise = torch.randn(batch_size, 100, device=device)

    fake_images = generator(embeddings, noise)
    out_fake, act_fake = discriminator(fake_images, embeddings)
    # Only the intermediate activations of the real pass are needed here.
    _, act_real = discriminator(images, embeddings)

    g_bce = criterion(out_fake, torch.full_like(out_fake, 1))
    g_l1 = l1_coef * l1_loss(fake_images, images)
    g_l2 = l2_coef * l2_loss(torch.mean(act_fake, 0), torch.mean(act_real, 0).detach())
    g_loss = g_bce + g_l1 + g_l2
    return g_loss, fake_images

In [3]:
from PIL import Image

def combine_images_64_to_128(image1, image2):
    """Return one RGB image showing both inputs side by side, each scaled to 128x128.

    ``image1`` ends up on the left and ``image2`` on the right; both are
    resized with Lanczos resampling before being pasted.
    """
    side = 128

    # Upscale the two inputs to the common square size.
    left, right = (
        picture.resize((side, side), Image.Resampling.LANCZOS)
        for picture in (image1, image2)
    )

    # Blank canvas wide enough for both tiles, then paste them left to right.
    canvas = Image.new("RGB", (left.width + right.width, side))
    for tile, x_offset in ((left, 0), (right, left.width)):
        canvas.paste(tile, (x_offset, 0))

    return canvas


In [4]:
from tqdm import tqdm

output_path = Path("C:\\Users\\Matej\\Documents\\seminar2\\trained_models\\best-model-dump")

# Generator loss of every processed sample, keyed by its index in test_set.sample_paths.
losses = {}

# The transforms do not depend on the sample, so build them once instead of on every iteration.
# NOTE(review): the dataset is built with get_image_transform(64) while the default
# argument is used here — confirm both produce the same image size.
image_transform = get_image_transform()
inv_image_transform = get_inv_image_transform()

# Wrap the sequence itself (not the enumerate object) so tqdm can pick up its
# length, when it has one, and display a total and an ETA.
for i, sample_path in enumerate(tqdm(test_set.sample_paths)):
    image_path = image_root / f"{sample_path}.jpg"
    embeddings_path = embeddings_root / f"{sample_path}.pt"
    captions_path = captions_root / f"{sample_path}.txt"

    # Close the image file as soon as it has been decoded to RGB.
    with Image.open(image_path) as raw_image:
        rgb_image = raw_image.convert('RGB')
    # Add a leading batch dimension of size 1 before moving to the device.
    image = image_transform(rgb_image).unsqueeze(0).to(device)

    embedding = torch.load(embeddings_path, map_location=device)
    embedding = embedding.unsqueeze(0).to(device)

    # Evaluation only: no gradients are needed.
    with torch.no_grad():
        loss, gen_img_tensor = get_loss(image, embedding)
    loss_value = loss.item()
    losses[i] = loss_value

    with open(captions_path, "r") as f:
        caption = f.read()

    # Each sample gets its own directory named after its index.
    save_path = output_path / str(i)
    save_path.mkdir(parents=True, exist_ok=True)

    real_image = inv_image_transform(image[0])
    fake_image = inv_image_transform(gen_img_tensor[0])

    # Real image on the left, generated image on the right.
    combined_image = combine_images_64_to_128(real_image, fake_image)
    combined_image.save(str(save_path / f"{Path(sample_path).name}.png"))

    # Store the caption text together with the loss of this sample.
    with open(str(save_path / "caption.txt"), "w") as f:
        f.write(caption + f"\nloss: {loss_value}")



9430it [11:01, 14.25it/s]


In [5]:
# Rank the samples from lowest to highest loss; every entry is an (index, loss) pair.
n_items = len(losses)
items = sorted(losses.items(), key=lambda pair: pair[1])

# Show the ten samples with the smallest loss.
print(items[:10])

[(4923, 9.635774612426758), (149, 9.672496795654297), (6147, 10.73388385772705), (7064, 10.751594543457031), (6092, 11.36527156829834), (5061, 11.496672630310059), (5905, 11.753509521484375), (9374, 11.839312553405762), (6532, 11.86898422241211), (6676, 11.886252403259277)]


In [6]:
# `items` is sorted by ascending loss, so its tail holds the highest-loss samples.
# NOTE(review): nine entries are shown here versus ten for the lowest losses — confirm the asymmetry is intended.
print(items[-9:])

[(4736, 295.9927673339844), (6168, 301.3152770996094), (3209, 313.6275634765625), (8780, 339.16912841796875), (7468, 345.36590576171875), (4090, 397.13848876953125), (5333, 409.37713623046875), (3143, 452.7203063964844), (4674, 459.7781982421875)]


In [3]:
from DCGAN import Generator, Discriminator
# Build a fresh generator (no checkpoint loaded) only to display its layer structure.
generator_architecture = Generator()
print(generator_architecture)

Generator(
  (projection): Sequential(
    (0): Linear(in_features=768, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.2, inplace=True)
  )
  (main): Sequential(
    (0): ConvTranspose2d(228, 512, kernel_size=(4, 4), stride=(1, 1), bias=False)
    (1): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): ConvTranspose2d(512, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (4): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): ConvTranspose2d(256, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (7): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU(inplace=True)
    (9): ConvTranspose2d(128, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
 

In [4]:
# Build a fresh discriminator (no checkpoint loaded) only to display its layer structure.
discriminator_architecture = Discriminator()
print(discriminator_architecture)

Discriminator(
  (main): Sequential(
    (0): Conv2d(3, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (1): LeakyReLU(negative_slope=0.2, inplace=True)
    (2): Conv2d(64, 128, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): LeakyReLU(negative_slope=0.2, inplace=True)
    (5): Conv2d(128, 256, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (6): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): LeakyReLU(negative_slope=0.2, inplace=True)
    (8): Conv2d(256, 512, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1), bias=False)
    (9): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): LeakyReLU(negative_slope=0.2, inplace=True)
  )
  (projection): Sequential(
    (0): Linear(in_features=768, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-0