In [1]:
import argparse
import datetime
import logging
import os
import random
import time
from pathlib import Path

import numpy as np
import torch
import torch.backends.cudnn as cudnn

from timm.models import create_model
from timm.optim import create_optimizer
from timm.utils import NativeScaler
from torch.utils.tensorboard import SummaryWriter

from torch.utils.data import RandomSampler
from pytorch_metric_learning.samplers import MPerClassSampler
from pytorch_metric_learning.distances import CosineSimilarity
from pytorch_metric_learning.losses import ContrastiveLoss

In [2]:
from datasets.custom import TuplesDataset
from datasets.genericdataset import ImagesFromList
import torch

In [3]:
model = 'deit_small_distilled_patch16_224'
drop = 0
drop_path= 0.1

In [4]:
model = create_model(
        model,
        pretrained=True,
        num_classes=0,
        drop_rate=drop,
        drop_path_rate=drop_path,
        drop_block_rate=None,
    )

In [5]:
train_dataset = TuplesDataset(
        data_root=r"C:\Users\scl\Documents\AI\Vision\Image Retrieval\data\Stanford_Online_Products",
        mode='train',
        imsize=224,
        nnum=5,
        qsize=2000,
        poolsize=20000
    )

In [6]:
model.cuda()

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0, inplace=False)
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop2): Dropout(p=0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNo

In [7]:
criterion = ContrastiveLoss(
        pos_margin=1,
        neg_margin=0.7,
        distance=CosineSimilarity(),
    )

In [8]:
from main import get_args_parser

In [9]:
def collate_tuples(batch):
    if len(batch) == 1:
        return [batch[0][0]], [batch[0][1]]
    return [batch[i][0] for i in range(len(batch))], [batch[i][1] for i in range(len(batch))]

In [29]:
train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=5, shuffle=True,
        num_workers=0, pin_memory=True, sampler=None,
        drop_last=True, collate_fn=collate_tuples
    )

In [11]:
avg_neg_distance = train_loader.dataset.create_epoch_tuples(model)

>>>> used network: 
>> Extracting descriptors for query images...
>>>> 2000/2000 done...
>> Extracting descriptors for negative pool...
>>>> 20000/20000 done...
>> Searching for hard negatives...
>>>> Average negative l2-distance: 33.78
>>>> Done


In [12]:
model.train()
#model.apply(set_batchnorm_eval)

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0, inplace=False)
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0, inplace=False)
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop2): Dropout(p=0, inplace=False)
      )
    )
    (1): Block(
      (norm1): LayerNo

In [83]:
model.num_features

384

In [27]:
train_dataset.qidxs

[77075,
 89413,
 86877,
 41053,
 63381,
 97773,
 59474,
 30227,
 37384,
 94054,
 52117,
 48893,
 98483,
 85000,
 56522,
 70332,
 62969,
 76757,
 26148,
 98978,
 23801,
 42942,
 84731,
 103631,
 19050,
 33861,
 73610,
 76029,
 83888,
 61551,
 35472,
 105149,
 79270,
 63017,
 101287,
 20279,
 65981,
 70127,
 4162,
 49576,
 8619,
 100397,
 40502,
 3261,
 34748,
 17429,
 95650,
 58441,
 53061,
 12517,
 33104,
 81083,
 10531,
 10711,
 49810,
 11630,
 72019,
 56278,
 4197,
 13462,
 39303,
 24610,
 87200,
 4318,
 15911,
 85751,
 15820,
 15300,
 99065,
 9708,
 89587,
 78840,
 105118,
 45617,
 50043,
 56162,
 15668,
 19045,
 87964,
 103893,
 78703,
 38223,
 63962,
 22775,
 13590,
 63834,
 82711,
 46348,
 106680,
 72276,
 59376,
 66339,
 32911,
 28892,
 14066,
 24474,
 55075,
 16232,
 72695,
 20456,
 98392,
 73784,
 53284,
 15072,
 57782,
 72627,
 98255,
 71322,
 1333,
 96928,
 92328,
 34223,
 96252,
 78663,
 6062,
 36343,
 33947,
 57054,
 12789,
 31045,
 5548,
 47360,
 104324,
 56322,
 87504,
 

In [28]:
num workers=0
for i in train_loader:
    print(i)

SyntaxError: invalid syntax (827628693.py, line 1)

In [31]:
for i, (input, target) in enumerate(train_loader):
    # measure data loading time
    #data_time.update(time.time() - end)


    nq = len(input) # number of training tuples
    ni = len(input[0]) # number of images per tuple
    print('hi')

hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi
hi


KeyboardInterrupt: 

In [34]:
len(input)

5

In [35]:
nq = len(input) # number of training tuples
ni = len(input[0]) # number of images per tuple

In [37]:
ni,nq

(7, 5)

In [52]:
import torch.nn as nn
def contrastive_loss(x, label, margin=0.7, eps=1e-6):
    # x is D x N
    dim = x.size(0) # D
    nq = torch.sum(label.data==-1) # number of tuples
    S = x.size(1) // nq # number of images per tuple including query: 1+1+n

    x1 = x[:, ::S].permute(1,0).repeat(1,S-1).view((S-1)*nq,dim).permute(1,0)
    idx = [i for i in range(len(label)) if label.data[i] != -1]
    x2 = x[:, idx]
    lbl = label[label!=-1]

    dif = x1 - x2
    D = torch.pow(dif+eps, 2).sum(dim=0).sqrt()

    y = 0.5*lbl*torch.pow(D,2) + 0.5*(1-lbl)*torch.pow(torch.clamp(margin-D, min=0),2)
    y = torch.sum(y)
    return y

class ContrastiveLoss(nn.Module):
    r"""CONTRASTIVELOSS layer that computes contrastive loss for a batch of images:
        Q query tuples, each packed in the form of (q,p,n1,..nN)

    Args:
        x: tuples arranges in columns as [q,p,n1,nN, ... ]
        label: -1 for query, 1 for corresponding positive, 0 for corresponding negative
        margin: contrastive loss margin. Default: 0.7

    >>> contrastive_loss = ContrastiveLoss(margin=0.7)
    >>> input = torch.randn(128, 35, requires_grad=True)
    >>> label = torch.Tensor([-1, 1, 0, 0, 0, 0, 0] * 5)
    >>> output = contrastive_loss(input, label)
    >>> output.backward()
    """

    def __init__(self, margin=0.7, eps=1e-6):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin
        self.eps = eps

    def forward(self, x, label):
        return contrastive_loss(x, label, margin=self.margin, eps=self.eps)

    def __repr__(self):
        return self.__class__.__name__ + '(' + 'margin=' + '{:.4f}'.format(self.margin) + ')'

In [53]:
criterion = ContrastiveLoss(margin=.7).cuda()

In [40]:
for q in range(nq):
    output = torch.zeros(384, ni).cuda()
    for imi in range(ni):
        # compute output vector for image imi
        output[:, imi] = model(input[q][imi].cuda())[0]

In [44]:
target[q].cuda()

tensor([-1.,  1.,  0.,  0.,  0.,  0.,  0.], device='cuda:0')

In [56]:
dim = output.size(0) # D
nq = torch.sum(target[q].cuda().data==-1) # number of tuples
dim,nq

(384, tensor(1, device='cuda:0'))

In [57]:
S = output.size(1) // nq # number of images per tuple including query: 1+1+n
S

tensor(7, device='cuda:0')

In [59]:
x1 = output[:, ::S].permute(1,0).repeat(1,S-1).view((S-1)*nq,dim).permute(1,0)
x1.shape

torch.Size([384, 6])

In [64]:
idx = [i for i in range(len(target[q].cuda())) if target[q].cuda().data[i] != -1]
idx
x2 = output[:, idx]
x2.shape

torch.Size([384, 6])

In [65]:
lbl = target[q].cuda()[target[q].cuda()!=-1]
lbl

tensor([1., 0., 0., 0., 0., 0.], device='cuda:0')

In [66]:
dif = x1 - x2

In [67]:
dif

tensor([[-1.8516, -5.0933, -4.1675, -2.2044, -0.2931, -3.0903],
        [ 0.3798,  1.6376,  2.5188,  1.7745,  0.9183,  0.4143],
        [-1.3690,  0.1948, -1.0262, -0.6256, -1.9431, -4.4646],
        ...,
        [-0.3437, -0.1261, -1.9117, -0.8146,  1.4371,  1.4417],
        [-2.9694,  0.6450, -1.7402, -1.3156, -1.8560, -0.7339],
        [ 1.3170, -0.9119,  1.0309,  3.2296,  1.8558,  3.9030]],
       device='cuda:0', grad_fn=<SubBackward0>)

In [68]:
D = torch.pow(dif+eps, 2).sum(dim=0).sqrt()

NameError: name 'eps' is not defined

In [None]:
S = x.size(1) // nq # number of images per tuple including query: 1+1+n

x1 = x[:, ::S].permute(1,0).repeat(1,S-1).view((S-1)*nq,dim).permute(1,0)
idx = [i for i in range(len(label)) if label.data[i] != -1]
x2 = x[:, idx]
lbl = label[label!=-1]

dif = x1 - x2
D = torch.pow(dif+eps, 2).sum(dim=0).sqrt()

y = 0.5*lbl*torch.pow(D,2) + 0.5*(1-lbl)*torch.pow(torch.clamp(margin-D, min=0),2)
y = torch.sum(y)

In [75]:
input[4]

[tensor([[[[ 0.3481,  0.3652,  0.3309,  ..., -1.2788, -1.2788, -1.2959],
           [ 0.3994,  0.3823,  0.3481,  ..., -1.0219, -1.0904, -1.1589],
           [ 0.4166,  0.3481,  0.3481,  ..., -0.8678, -0.9020, -0.9705],
           ...,
           [ 0.3481,  0.1768,  0.1939,  ...,  0.9988,  1.0331,  0.9817],
           [ 0.7419,  0.7248,  0.6392,  ...,  0.9988,  1.0673,  1.0159],
           [ 0.5022,  0.5364,  0.4508,  ...,  1.0502,  1.0502,  0.9988]],
 
          [[-0.0574, -0.0049, -0.0224,  ..., -1.3179, -1.3179, -1.3529],
           [ 0.0126, -0.0049, -0.0049,  ..., -1.0903, -1.1779, -1.2479],
           [ 0.0301, -0.0049,  0.0126,  ..., -0.9153, -0.9853, -1.0203],
           ...,
           [ 0.5553,  0.3803,  0.3978,  ...,  1.0630,  1.0980,  1.0455],
           [ 0.8704,  0.8354,  0.7479,  ...,  1.0630,  1.1331,  1.0805],
           [ 0.4853,  0.5378,  0.4153,  ...,  1.1155,  1.1155,  1.0630]],
 
          [[-0.4798, -0.4450, -0.4798,  ..., -1.4384, -1.4210, -1.4384],
           [-

In [46]:
output.shape

torch.Size([384, 7])

In [54]:
loss = criterion(output, target[q].cuda())

In [55]:
loss

tensor(534.3400, device='cuda:0', grad_fn=<SumBackward0>)

In [None]:
 for q in range(nq):
        output = torch.zeros(model.meta['outputdim'], ni).cuda()
        for imi in range(ni):

            # compute output vector for image imi
            output[:, imi] = model(input[q][imi].cuda()).squeeze()

        # reducing memory consumption:
        # compute loss for this query tuple only
        # then, do backward pass for one tuple only
        # each backward pass gradients will be accumulated
        # the optimization step is performed for the full batch later

In [15]:
target

NameError: name 'target' is not defined

In [12]:
idxs2qpool = torch.randperm(len(train_dataset.qpool))[:train_dataset.qsize]
qidxs = [train_dataset.qpool[i] for i in idxs2qpool]
pidxs = [train_dataset.ppool[i] for i in idxs2qpool]

In [20]:
print_freq = 10

In [13]:
idxs2images = torch.randperm(len(train_dataset.images))[:train_dataset.poolsize]

In [39]:
# no gradients computed, to reduce memory and increase speed
with torch.no_grad():

    print('>> Extracting descriptors for query images...')
    # prepare query loader
    loader = torch.utils.data.DataLoader(
        ImagesFromList(root='', images=[train_dataset.images[i] for i in qidxs], imsize=train_dataset.imsize, transform=train_dataset.transform),
        batch_size=1, shuffle=False, num_workers=8, pin_memory=True
    )
    
    qvecs = torch.zeros(384, len(qidxs)).cuda()
    for i, input in enumerate(loader):
        qvecs[:, i] = model(input.cuda())[0]
        if (i+1) % print_freq == 0 or (i+1) == len(qidxs):
            print('\r>>>> {}/{} done...'.format(i+1, len(qidxs)), end='')
    print('')

>> Extracting descriptors for query images...
>>>> 2000/2000 done...


In [40]:
with torch.no_grad():  
    # prepare negative pool data loader
    loader = torch.utils.data.DataLoader(
        ImagesFromList(root='', images=[train_dataset.images[i] for i in idxs2images], imsize=train_dataset.imsize, transform=train_dataset.transform),
        batch_size=1, shuffle=False, num_workers=8, pin_memory=True
    )
    # extract negative pool vectors
    poolvecs = torch.zeros(384, len(idxs2images)).cuda()
    for i, input in enumerate(loader):
        poolvecs[:, i] = model(input.cuda())[0]
        if (i+1) % print_freq == 0 or (i+1) == len(idxs2images):
            print('\r>>>> {}/{} done...'.format(i+1, len(idxs2images)), end='')
    print('')

>>>> 20000/20000 done...


In [41]:
scores = torch.mm(poolvecs.t(), qvecs)
scores, ranks = torch.sort(scores, dim=0, descending=True)
avg_ndist = torch.tensor(0).float().cuda()  # for statistics
n_ndist = torch.tensor(0).float().cuda()  # f or statistics

In [45]:
n_ndist

tensor(0., device='cuda:0')

In [46]:
nidxs = []
for q in range(len(qidxs)):
    # do not use query cluster,
    # those images are potentially positive
    qcluster = train_dataset.cat_ids[qidxs[q]]
    clusters = [qcluster]
    nidxs = []
    r = 0
    while len(nidxs) < train_dataset.nnum:
        potential = idxs2images[ranks[r, q]]
        # take at most one image from the same cluster
        if not train_dataset.cat_ids[potential] in clusters:
            nidxs.append(potential)
            clusters.append(train_dataset.cat_ids[potential])
            avg_ndist += torch.pow(qvecs[:,q]-poolvecs[:,ranks[r, q]]+1e-6, 2).sum(dim=0).sqrt()
            n_ndist += 1
        r += 1
    nidxs.append(nidxs)

In [48]:
len(nidxs)

6

In [51]:
avg_ndist

tensor(340402.8438, device='cuda:0')

In [50]:
(avg_ndist/n_ndist).item()

34.040283203125

In [49]:
nidxs

[tensor(56896),
 tensor(105003),
 tensor(13459),
 tensor(31224),
 tensor(6572),
 [...]]

In [None]:
# selection of negative examples
self.nidxs = []
for q in range(len(self.qidxs)):
    # do not use query cluster,
    # those images are potentially positive
    qcluster = self.clusters[self.qidxs[q]]
    clusters = [qcluster]
    nidxs = []
    r = 0
    while len(nidxs) < self.nnum:
        potential = idxs2images[ranks[r, q]]
        # take at most one image from the same cluster
        if not self.clusters[potential] in clusters:
            nidxs.append(potential)
            clusters.append(self.clusters[potential])
            avg_ndist += torch.pow(qvecs[:,q]-poolvecs[:,ranks[r, q]]+1e-6, 2).sum(dim=0).sqrt()
            n_ndist += 1
        r += 1

In [22]:
x = model(input.cuda())

In [31]:
input.shape

torch.Size([1, 3, 224, 224])

In [38]:
x[0].shape

torch.Size([1, 384])

In [30]:
x[1].shape

torch.Size([1, 384])

In [27]:
ImagesFromList

datasets.genericdataset.ImagesFromList

In [29]:
len(train_dataset.images)

108027

In [28]:
for i in qidxs:
    if i > len(train_dataset.images):
        print(i)

115539
117070
109538
111261
116886
110338
118118
115210
116564
116949
110661
117656
115383
111437
108097
111752
114793
114639
118384
115741
112308
116638
113378
117951
113495
115626
114773
116429
110721
116517
117240
108173
112018
117508
115137
113322
115967
118426
114462
110965
110888
117424
110559
114893
117374
111179
110355
109650
111155
113171
110071
119317
115738
119133
109937
117473
118889
114714
116158
117537
119104
110349
113608
109161
117838
119478
112228
115009
118157
112793
115401
116930
110644
110313
110673
118003
109964
110123
118478
113951
112446
119675
115562
110978
108204
109249
118843
115415
117926
119244
114554
110223
110804
117228
114126
113506
109881
108926
113690
118636
119594
109074
112008
116905
108613
111007
114478
108484
119349
115678
109918
108387
117387
108088
110851
110102
118261
117673
114426
118465
110625
117289
117705
117200
110431
119277
115953
112738
112459
109217
108868
117147
112240
118783
119068
112835
117164
115103
110228
112741
113335
109329
111813