In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM; every dataset and output path used
# further down lives under "/content/drive/My Drive".
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
! wget http://www.robots.ox.ac.uk/~vgg/software/vgg_face/src/vgg_face_torch.tar.gz
! tar -xvf vgg_face_torch.tar.gz
! pip install torchfile
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

In [None]:
import clip
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchfile
from PIL import Image
from scipy import signal
from torchvision import transforms

In [None]:
def kernel(n, sigma=1, muu=0.000):
    """Sample a radial Gaussian profile on an ``n x n`` grid over ``[-1, 1]^2``.

    Each entry is ``exp(-(r - muu)**2 / (2 * sigma**2))`` where ``r`` is the
    Euclidean distance of the grid point from the origin. The result is NOT
    normalised to sum to one.

    Args:
        n: number of samples along each axis.
        sigma: standard deviation of the profile (default 1, the value that
            was previously hard-coded).
        muu: radius at which the profile peaks (default 0, i.e. a bump
            centred on the origin).

    Returns:
        ``(n, n)`` float array of kernel weights.

    Raises:
        ValueError: if ``sigma`` is zero (the exponent would be undefined).
    """
    if sigma == 0:
        raise ValueError("sigma must be non-zero")
    axis = np.linspace(-1, 1, n)
    x, y = np.meshgrid(axis, axis)
    dst = np.sqrt(x*x + y*y)
    return np.exp(-((dst - muu) ** 2 / (2.0 * sigma**2)))


def downsample(filters, layer):
    """Smooth every channel with ``kernel(2)`` and keep every second pixel.

    Args:
        filters: array indexed as ``filters[0, channel]`` -> 2-D feature map;
            only the first entry along axis 0 is processed.
        layer: kept for backward compatibility. It used to select the output
            size from a fixed ``[56, 28]`` table; the size is now derived from
            the input, so 112x112 maps still give 56x56 and 56x56 maps still
            give 28x28, while other input sizes no longer fail.

    Returns:
        float64 array of shape ``(1, channels, ceil(H / 2), ceil(W / 2))``.

    Note:
        ``kernel(2)`` is not normalised (all four weights equal ``exp(-1)``),
        so the output is a scaled rather than an averaged version of the input.
    """
    smoothing = kernel(2)  # loop-invariant, so build it once
    n_channels = filters.shape[1]
    height, width = filters.shape[-2], filters.shape[-1]
    # A [::2] slice over an axis of length L keeps ceil(L / 2) samples.
    result = np.zeros((n_channels, -(-height // 2), -(-width // 2)))
    for channel in range(n_channels):
        smoothed = signal.convolve(filters[0, channel], smoothing, mode="same")
        result[channel] = smoothed[::2, ::2]
    return result[None]

# Faces (StyleGAN3)

### VGG16 for face recognition

In [None]:
class VGG_16(nn.Module):
    """VGG-16 whose layer names line up with the VGG-Face Torch7 checkpoint.

    The network has five convolutional blocks (2, 2, 3, 3, 3 convolutions of
    size 3x3, each followed by ReLU) with a 2x2 max-pool closing every block,
    plus three fully connected layers. The fully connected layers receive
    weights in `load_weights` but are not used by `forward`, which only
    exposes the pooled output of a chosen convolutional block.
    """

    def __init__(self):
        super().__init__()
        # Number of convolutions in each of the five blocks.
        self.block_size = [2, 2, 3, 3, 3]
        self.conv_1_1 = nn.Conv2d(3, 64, 3, stride=1, padding=1)
        self.conv_1_2 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.conv_2_1 = nn.Conv2d(64, 128, 3, stride=1, padding=1)
        self.conv_2_2 = nn.Conv2d(128, 128, 3, stride=1, padding=1)
        self.conv_3_1 = nn.Conv2d(128, 256, 3, stride=1, padding=1)
        self.conv_3_2 = nn.Conv2d(256, 256, 3, stride=1, padding=1)
        self.conv_3_3 = nn.Conv2d(256, 256, 3, stride=1, padding=1)
        self.conv_4_1 = nn.Conv2d(256, 512, 3, stride=1, padding=1)
        self.conv_4_2 = nn.Conv2d(512, 512, 3, stride=1, padding=1)
        self.conv_4_3 = nn.Conv2d(512, 512, 3, stride=1, padding=1)
        self.conv_5_1 = nn.Conv2d(512, 512, 3, stride=1, padding=1)
        self.conv_5_2 = nn.Conv2d(512, 512, 3, stride=1, padding=1)
        self.conv_5_3 = nn.Conv2d(512, 512, 3, stride=1, padding=1)
        self.fc6 = nn.Linear(512 * 7 * 7, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.fc8 = nn.Linear(4096, 2622)

    def load_weights(self, path="/content/vgg_face_torch/VGG_FACE.t7"):
        """Copy weights and biases from a Torch7 checkpoint into this module.

        Modules of the checkpoint that carry a weight are matched, in order,
        to conv_1_1 ... conv_5_3 followed by fc6, fc7 and fc8.

        Args:
            path: location of the ``.t7`` file.

        Raises:
            ValueError: if the checkpoint does not contain exactly one
                weighted module per layer of this network (previously a
                shorter checkpoint silently left layers randomly initialised).
        """
        model = torchfile.load(path)
        targets = [
            "conv_%d_%d" % (block, idx)
            for block, n_convs in enumerate(self.block_size, start=1)
            for idx in range(1, n_convs + 1)
        ] + ["fc6", "fc7", "fc8"]
        sources = [layer for layer in model.modules if layer.weight is not None]
        if len(sources) != len(targets):
            raise ValueError(
                "checkpoint has %d weighted modules, expected %d"
                % (len(sources), len(targets))
            )
        with torch.no_grad():
            for name, source in zip(targets, sources):
                self_layer = getattr(self, name)
                # view_as reshapes the stored array to this layer's layout;
                # copy_ casts to the parameter dtype (e.g. after .double()).
                self_layer.weight.copy_(torch.tensor(source.weight).view_as(self_layer.weight))
                self_layer.bias.copy_(torch.tensor(source.bias).view_as(self_layer.bias))

    def forward(self, x, layer):
        """Return the max-pooled activations of convolutional block `layer`.

        Args:
            x: image batch with 3 channels; every block halves the spatial
                size, so a 224x224 input gives 112, 56, 28, 14 and 7 pixels
                per side for blocks 1 to 5.
            layer: index of the block to read out, 1 to 5.

        Returns:
            The feature tensor after the ReLU convolutions and the 2x2
            max-pool of the requested block.

        Raises:
            ValueError: if `layer` is not one of 1..5 (the method previously
                fell through and returned None).
        """
        n_blocks = len(self.block_size)
        if layer not in range(1, n_blocks + 1):
            raise ValueError("layer must be in 1..%d, got %r" % (n_blocks, layer))
        for block, n_convs in enumerate(self.block_size, start=1):
            for idx in range(1, n_convs + 1):
                conv = getattr(self, "conv_%d_%d" % (block, idx))
                x = F.relu(conv(x))
            x = F.max_pool2d(x, 2, 2)
            if layer == block:
                return x

In [None]:
class ModelFace:
    """Feature extractor built on the VGG-Face network (`VGG_16`).

    Loads the checkpoint in double precision on the GPU and returns, for a
    batch of numbered face images, the activations of one convolutional block
    as a NumPy array. Blocks 1 and 2 are additionally passed through
    `downsample`.
    """

    def __init__(self):
        self.model = VGG_16().double()
        self.model.load_weights()
        self.model.eval().to('cuda')
        # Side length of the feature maps of blocks 1 and 2 after `downsample`.
        self.ds = [56, 28]

    def preprocess(self, folder, split, n, start=0):
        """Load `split` consecutive images and turn them into a network input.

        Files are read from
        "/content/drive/My Drive/faces/<folder>/<zero-padded number>.png",
        numbered (1-based) from ``split * n + start + 1``.

        Args:
            folder: sub-folder of the faces directory.
            split: number of images in the batch.
            n: index of the batch.
            start: offset added to the image index.

        Returns:
            CUDA tensor of shape ``(split, 3, 224, 224)`` holding the resized
            images with a fixed per-channel mean subtracted.
        """
        images = torch.zeros((split, 3, 224, 224))
        # NOTE(review): PIL yields channels in the file's own order (RGB for
        # ordinary PNGs) - confirm this mean ordering matches what the
        # checkpoint expects.
        mean = torch.Tensor(np.array([93.5940, 104.7624, 129.1863])).double().view(1, 3, 1, 1)
        first = split * n + start
        for index, i in enumerate(range(first, first + split)):
            # The context manager releases the file handle once pixels are read.
            with Image.open("/content/drive/My Drive/faces/%s/%s.png" % (folder, str(i+1).zfill(4))) as image:
                x = np.asarray(image.resize((224, 224), resample=Image.LANCZOS)).astype("float32")
            x = torch.Tensor(x).permute(2, 0, 1).view(1, 3, 224, 224).double()
            x -= mean
            images[index] = x
        return images.to('cuda')

    def get_features(self, layer, folder, split, n, start=0):
        """Return the activations of block `layer` for one batch of images.

        Args:
            layer: block index passed to `VGG_16.forward` (1 to 5).
            folder, split, n, start: forwarded to `preprocess`.

        Returns:
            NumPy array with one entry per image. For `layer` 1 and 2 the maps
            are reduced by `downsample` to ``self.ds[layer - 1]`` pixels per
            side; deeper blocks are returned as produced by the network.
        """
        xs = self.preprocess(folder, split, n, start)
        with torch.no_grad():
            _out = self.model(xs.double(), layer).detach().cpu().numpy()
        # Only the first len(self.ds) blocks are downsampled. The original
        # condition tested an undefined name `index`, raising NameError.
        if layer <= len(self.ds):
            ds = np.zeros((split, _out.shape[1], self.ds[layer-1], self.ds[layer-1]))
            for i in range(_out.shape[0]):
                ds[i] = downsample(_out[i][None], layer-1).squeeze()
            _out = ds
        return _out

In [None]:
# VGG16 for face recognition
vgg_face = ModelFace()
layers = np.arange(1, 6)
split = 50  # images per forward pass

# NOTE(review): the five float64 buffers of a set stay alive together; for the
# 4000-image training set the shapes below add up to roughly 20 GB. Consider
# allocating one layer at a time if host memory is tight.

# test set features
vgg_t1 = np.zeros((100, 64, 56, 56))   # ds: 112 -> 56
# Block 2 is downsampled to 28x28 by ModelFace.get_features; a 56x56 buffer
# cannot receive those batches.
vgg_t2 = np.zeros((100, 128, 28, 28))  # ds: 56 -> 28
vgg_t3 = np.zeros((100, 256, 28, 28))
vgg_t4 = np.zeros((100, 512, 14, 14))
vgg_t5 = np.zeros((100, 512, 7, 7))
vgg_faces = [vgg_t1, vgg_t2, vgg_t3, vgg_t4, vgg_t5]
for i, layer in enumerate(layers):
    for n in range(2):  # 2 batches x 50 images = test images 1..100
        vgg_faces[i][n*split:(n+1)*split] = vgg_face.get_features(layer, "test", split, n)
    np.save("/content/drive/My Drive/faces/vggface_te_%i.npy" % i, vgg_faces[i])

# training set features
vgg_t1 = np.zeros((4000, 64, 56, 56))   # ds: 112 -> 56
vgg_t2 = np.zeros((4000, 128, 28, 28))  # ds: 56 -> 28
vgg_t3 = np.zeros((4000, 256, 28, 28))
vgg_t4 = np.zeros((4000, 512, 14, 14))
vgg_t5 = np.zeros((4000, 512, 7, 7))
vgg_faces = [vgg_t1, vgg_t2, vgg_t3, vgg_t4, vgg_t5]
for i, layer in enumerate(layers):
    for n in range(80):  # 80 batches x 50 images; start=100 -> images 101..4100
        vgg_faces[i][n*split:(n+1)*split] = vgg_face.get_features(layer, "training", split, n, 100)
    np.save("/content/drive/My Drive/faces/vggface_tr_%i.npy" % i, vgg_faces[i])

### CLIP (ViT-L/14@336px)

In [None]:
# Load CLIP together with the image preprocessing pipeline that matches it.
model, preprocess = clip.load("ViT-L/14@336px")
model.cuda().eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

# Images are encoded in small batches: pushing all 4000 training images
# through the vision transformer in a single call does not fit in GPU memory.
clip_batch_size = 50

# (image path template, number of the first image, image count, output file)
clip_jobs = [
    # test set
    ("/content/drive/My Drive/faces/test/%s.png", 1, 100,
     "/content/drive/My Drive/faces/clip_te.npy"),
    # training set
    ("/content/drive/My Drive/faces/training/%s.png", 101, 4000,
     "/content/drive/My Drive/faces/clip_tr.npy"),
]

for path_template, first_number, n_images, save_path in clip_jobs:
    feature_batches = []
    for batch_start in range(0, n_images, clip_batch_size):
        batch_stop = min(batch_start + clip_batch_size, n_images)
        images = []
        for i in range(batch_start, batch_stop):
            # Close each file as soon as it has been converted to a tensor.
            with Image.open(path_template % str(i + first_number).zfill(4)) as image:
                images.append(preprocess(image))
        image_input = torch.stack(images).cuda()
        with torch.no_grad():
            feature_batches.append(model.encode_image(image_input).float().cpu().numpy())
    # One row per image, in file-number order.
    image_features = np.concatenate(feature_batches)
    np.save(save_path, image_features)

# StyleGAN-XL (natural images)

### VGG16 for object recognition

In [None]:
class Model:
    """Feature extractor around a torchvision model loaded through torch.hub.

    The model is put in eval mode on the GPU. `get_features` runs numbered
    images through the first `layer` modules of ``model.features`` and
    returns the activations as a NumPy array.
    """

    def __init__(self, model, weights):
        """
        Args:
            model: torch.hub entry-point name, e.g. "vgg16".
            weights: value forwarded as the ``weights`` keyword of that entry
                point.
        """
        self.model = torch.hub.load('pytorch/vision:v0.10.0', model, weights=weights).eval().to('cuda')
        self.preprocess = transforms.Compose([
            transforms.Resize(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def get_features(self, index, layer, folder, split, n, start=0):
        """Extract activations for `split` consecutive images.

        Files are read from
        "/content/drive/My Drive/images/<folder>/<zero-padded number>.png",
        numbered (1-based) from ``start + split * n + 1``.

        Args:
            index: position of the requested layer in the caller's layer
                list; outputs with ``index < 2`` go through `downsample`.
            layer: number of leading modules of ``model.features`` to apply.
            folder: sub-folder of the images directory.
            split: number of images in the batch.
            n: index of the batch.
            start: offset added to the image index.

        Returns:
            Array stacking the per-image activations with all length-1 axes
            squeezed out (note that this also removes the image axis when
            ``split == 1``).
        """
        output = []
        # Slice the feature stack once instead of once per image.
        truncated = self.model.features[:layer]
        first = start + split * n
        # Inference only: disabling autograd avoids building a graph per image.
        with torch.no_grad():
            for i in range(first, first + split):
                # The context manager releases the file handle after decoding.
                with Image.open("/content/drive/My Drive/images/%s/%s.png" % (folder, str(i+1).zfill(4))) as input_image:
                    input_tensor = self.preprocess(input_image).unsqueeze(0).to('cuda')
                _out = truncated(input_tensor).cpu().numpy()
                if index < 2:
                    _out = downsample(_out, index)
                output.append(_out)
        return np.array(output).squeeze()

In [None]:
# VGG16 for object recognition
vgg = Model("vgg16", "VGG16_Weights.DEFAULT")
# Number of leading modules of `vgg.model.features` to apply for each read-out.
layers = [5, 10, 17, 24, 31]
split = 50  # images per call to get_features

# The feature buffers get their own name (`vgg_features`): rebinding `vgg` to
# the list made the `vgg.get_features` calls below fail with AttributeError.

# test set features
vgg_t1 = np.zeros((200, 64, 56, 56))  # ds: 112 -> 56
vgg_t2 = np.zeros((200, 128, 28, 28)) # ds: 56 -> 28
vgg_t3 = np.zeros((200, 256, 28, 28))
vgg_t4 = np.zeros((200, 512, 14, 14))
vgg_t5 = np.zeros((200, 512, 7, 7))
vgg_features = [vgg_t1, vgg_t2, vgg_t3, vgg_t4, vgg_t5]
for i, layer in enumerate(layers):
    for n in range(4):  # 4 batches x 50 images = test images 1..200
        vgg_features[i][n*split:(n+1)*split] = vgg.get_features(i, layer, "test", split=split, n=n)
    # NOTE(review): the test features are written to an extra "put/" folder
    # while the training features below go straight to images/ - confirm the
    # folder exists and that this difference is intended.
    np.save("/content/drive/My Drive/images/put/vgg_te_%i.npy" % i, vgg_features[i])

# training set features
vgg_t1 = np.zeros((4000, 64, 56, 56))
vgg_t2 = np.zeros((4000, 128, 28, 28))
vgg_t3 = np.zeros((4000, 256, 28, 28))
vgg_t4 = np.zeros((4000, 512, 14, 14))
vgg_t5 = np.zeros((4000, 512, 7, 7))
vgg_features = [vgg_t1, vgg_t2, vgg_t3, vgg_t4, vgg_t5]
for i, layer in enumerate(layers):
    for n in range(80):  # 80 batches x 50 images; start=100 -> images 101..4100
        vgg_features[i][n*split:(n+1)*split] = vgg.get_features(i, layer, "training", split=split, n=n, start=100)
    np.save("/content/drive/My Drive/images/vgg_tr_%i.npy" % i, vgg_features[i])

### CLIP (ViT-L/14@336px)

In [None]:
# Load CLIP together with the image preprocessing pipeline that matches it.
model, preprocess = clip.load("ViT-L/14@336px")
model.cuda().eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

# Images are encoded in small batches: pushing all 4000 training images
# through the vision transformer in a single call does not fit in GPU memory.
clip_batch_size = 50

# (image path template, number of the first image, image count, output file)
clip_jobs = [
    # test set
    ("/content/drive/My Drive/images/test/%s.png", 1, 200,
     "/content/drive/My Drive/images/clip_te.npy"),
    # training set
    ("/content/drive/My Drive/images/training/%s.png", 101, 4000,
     "/content/drive/My Drive/images/clip_tr.npy"),
]

for path_template, first_number, n_images, save_path in clip_jobs:
    feature_batches = []
    for batch_start in range(0, n_images, clip_batch_size):
        batch_stop = min(batch_start + clip_batch_size, n_images)
        images = []
        for i in range(batch_start, batch_stop):
            # Close each file as soon as it has been converted to a tensor.
            with Image.open(path_template % str(i + first_number).zfill(4)) as image:
                images.append(preprocess(image))
        image_input = torch.stack(images).cuda()
        with torch.no_grad():
            feature_batches.append(model.encode_image(image_input).float().cpu().numpy())
    # One row per image, in file-number order.
    image_features = np.concatenate(feature_batches)
    np.save(save_path, image_features)