In [3]:
import random
import numpy as np
from tqdm import tqdm
from collections import OrderedDict

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import datasets.mvtec as mvtec


def hook(module, input, output):
    outputs.append(output)

def embedding_concat(x, y):
    B, C1, H1, W1 = x.size()
    _, C2, H2, W2 = y.size()
    s = int(H1 / H2)
    x = F.unfold(x, kernel_size=s, dilation=1, stride=s)
    x = x.view(B, C1, -1, H2, W2)
    z = torch.zeros(B, C1 + C2, x.size(2), H2, W2)
    for i in range(x.size(2)):
        z[:, :, i, :, :] = torch.cat((x[:, :, i, :, :], y), 1)
    z = z.view(B, -1, H2 * W2)
    z = F.fold(z, kernel_size=s, output_size=(H1, W1), stride=s)

    return z


def embedding_concat_1(x, y):
    B, C1, H1 = x.size()
    _, C2, H2 = y.size()
    s = int(H1 / H2)
    x = F.unfold(x, kernel_size=s, dilation=1, stride=s)
    x = x.view(B, C1, -1, H2)
    z = torch.zeros(B, C1 + C2, x.size(2), H2)
    for i in range(x.size(2)):
        z[:, :, i, :] = torch.cat((x[:, :, i, :], y), 1)
    z = z.view(B, -1, H2)
    z = F.fold(z, kernel_size=s, output_size=(H), stride=s)

    return z


use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

model = vit_b_16(pretrained=True)

model.to(device)
model.eval()
random.seed(1024)
torch.manual_seed(1024)
torch.cuda.manual_seed_all(1024)

# set model's intermediate outputs
outputs = []

model.encoder.layers.encoder_layer_0.register_forward_hook(hook)
model.encoder.layers.encoder_layer_1.register_forward_hook(hook)
model.encoder.layers.encoder_layer_2.register_forward_hook(hook)

# model.features.register_forward_hook(hook)

for class_name in mvtec.CLASS_NAMES:
    train_dataset = mvtec.MVTecDataset('C:/Users/Mikhail/Documents/Study/diploma/mvtec', class_name=class_name, is_train=True)
    train_dataloader = DataLoader(train_dataset, batch_size=32, pin_memory=True)

    train_outputs = OrderedDict([('layer1', []), ('layer2', []), ('layer3', [])])
    # train_outputs = OrderedDict([('layer1', [])])

    # extract train set features
    for (x, _, _) in tqdm(train_dataloader, '| feature extraction | train | %s |' % class_name):
        # model prediction
        with torch.no_grad():
            _ = model(x.to(device))
        # get intermediate layer outputs
        for k, v in zip(train_outputs.keys(), outputs):
            train_outputs[k].append(v.cpu().detach())
        # initialize hook outputs
        outputs = []

    for k, v in train_outputs.items():
        train_outputs[k] = torch.cat(v, 0)

    # Embedding concat
    embedding_vectors = train_outputs['layer1']
    for layer_name in ['layer2', 'layer3']:
        embedding_vectors = embedding_concat(embedding_vectors, train_outputs[layer_name])
        
    break

| feature extraction | train | bottle |: 100%|███████████████████████████████████████████| 7/7 [00:11<00:00,  1.65s/it]


ValueError: not enough values to unpack (expected 4, got 3)

In [None]:
embedding_vectors.shape

In [None]:
train_outputs['layer1'].shape, train_outputs['layer2'].shape, train_outputs['layer3'].shape

In [None]:
from pretrainedmodels.models import (
    se_resnet50,
    se_resnet101,
    se_resnext50_32x4d,
    se_resnext101_32x4d,
    senet154,
    xception
)

model = se_resnext50_32x4d(num_classes=1000, pretrained='imagenet')

In [1]:
from torchvision.models import (
    vit_b_16
)

model = vit_b_16(pretrained=True)

In [2]:
model

VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (linear_1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU()
          (dropout_1): Dropout(p=0.0, inplace=False)
          (linear_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout_2): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
 

In [12]:
model

SqueezeNet(
  (features): Sequential(
    (0): Conv2d(3, 96, kernel_size=(7, 7), stride=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
    (3): Fire(
      (squeeze): Conv2d(96, 16, kernel_size=(1, 1), stride=(1, 1))
      (squeeze_activation): ReLU(inplace=True)
      (expand1x1): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
      (expand1x1_activation): ReLU(inplace=True)
      (expand3x3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (expand3x3_activation): ReLU(inplace=True)
    )
    (4): Fire(
      (squeeze): Conv2d(128, 16, kernel_size=(1, 1), stride=(1, 1))
      (squeeze_activation): ReLU(inplace=True)
      (expand1x1): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1))
      (expand1x1_activation): ReLU(inplace=True)
      (expand3x3): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (expand3x3_activation): ReLU(inplace=True)
    )
    (5): Fire(
   

In [116]:
model.encoder.layers.encoder_layer_0, model.encoder.layers.encoder_layer_1, model.encoder.layers.encoder_layer_2

(EncoderBlock(
   (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
   (self_attention): MultiheadAttention(
     (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
   )
   (dropout): Dropout(p=0.0, inplace=False)
   (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
   (mlp): MLPBlock(
     (linear_1): Linear(in_features=768, out_features=3072, bias=True)
     (act): GELU()
     (dropout_1): Dropout(p=0.0, inplace=False)
     (linear_2): Linear(in_features=3072, out_features=768, bias=True)
     (dropout_2): Dropout(p=0.0, inplace=False)
   )
 ),
 EncoderBlock(
   (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
   (self_attention): MultiheadAttention(
     (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
   )
   (dropout): Dropout(p=0.0, inplace=False)
   (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
   (mlp): MLPBlock(
     (linear_1): Linear(in_feature

In [10]:
model.layer1[-1], model.layer2[-1], model.layer3[-1]

(Bottleneck(
   (conv1): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
   (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
   (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
   (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (relu): ReLU(inplace=True)
 ),
 Bottleneck(
   (conv1): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
   (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
   (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (conv3): Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
   