In [1]:
import re
import time
import numpy as np
import skimage.io
import json
import random
import PIL
from tqdm import tqdm as tqdm
import sys


import matplotlib.image as mpimg
import matplotlib.pyplot as plt

try:
    %matplotlib inline  
    plt.rcParams['figure.figsize'] = (10, 10)
except:
    pass

import torch
import torch.utils.data
import torchvision

from tensorboardX import SummaryWriter

import sys
sys.path.append('./../cvpr19_refactored_codes')
import text_model
import torch_functions

torch.set_num_threads(3)

In [2]:
import argparse

def parse_opt():
    
    parser = argparse.ArgumentParser()
    
    parser.add_argument('-f', type=str, default='f')
    parser.add_argument('-comment', type=str, default='_test_main_text_extract_combine')
    parser.add_argument('--embed_dim', type=int, default=512)
    parser.add_argument('--loader_num_workers', type=int, default=8)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--optim', type=str, default='adam', help='what update to use? sgd|adam')
    parser.add_argument('--learning_rate', type=float, default=1e-3)
    parser.add_argument('--learning_rate_decay_frequency', type=int, default=50000)
    parser.add_argument('--weight_decay', type=float, default=1e-05)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--num_epochs', type=int, default=50)
    
    args = parser.parse_args()
    
    return args

opt = parse_opt() 
logger = SummaryWriter(comment = opt.comment)

In [3]:
class NamsBaseDataset(torch.utils.data.Dataset):
    
    def name(self):
        assert(False)
        
    def get_image_path(self, idx):
        assert(False)
    
    def get_image_captions(self, idx):
        assert(False)
        
    def get_loader(self, batch_size, shuffle = False, drop_last = False, num_workers = 0):
        return torch.utils.data.DataLoader(
            self,
            batch_size = batch_size,
            shuffle = shuffle,
            num_workers = num_workers,
            drop_last = drop_last,
            collate_fn = lambda i: i
        )
    
    def normalize_caption(self, text):
        import string
        if type(text) == str:
            text = unicode(text, "utf-8")
        text = text.encode('ascii', 'replace')
        text = str(text).lower().translate(None, string.punctuation).strip()
        return text
    
    def get_all_texts(self):
        texts = []
        for i in range(len(self)):
            for t in self.get_image_captions(i):
                texts.append(t)
        return texts
    
    def precompute_img_features(self, force=False):
        features_filename = self.name() + '_features.npy'
        try:
            assert(not force)
            self.img_features = np.load(features_filename)
            print 'sucessfully loaded features'
            return
        except:
            print 'compute features...'
        self.img_features = None
        
        # run model on all images
        net = torchvision.models.resnet50(pretrained=True)
        net.avgpool = torch.nn.AdaptiveAvgPool2d((1,1))
        net.fc = torch.nn.Dropout()
        net = net.cuda().eval()
        loader = self.get_loader(batch_size=8, shuffle=False, drop_last=False, num_workers=4)
        img_features = np.zeros((len(self), 2048))
        i = 0
        for data in tqdm(loader):
            imgs = torch.stack([d['image'] for d in data])
            x = net(imgs.cuda()).cpu().detach().numpy()
            img_features[i:(i+x.shape[0]),:] = x
            imgs = torch.flip(imgs, [3])
            x = net(imgs.cuda()).cpu().detach().numpy()
            img_features[i:(i+x.shape[0]),:] += x
            i += x.shape[0]
        self.img_features = img_features
        np.save(features_filename, self.img_features)
    
    def __getitem__(self, idx):
        if self.img_features is not None:
            img = self.img_features[idx,:]
        else:
            raw_img = torchvision.datasets.folder.pil_loader(self.get_image_path(idx))
            img = raw_img
            if self.transform:
                img = self.transform(img)
        img = {
            'id': None,
            'label': None,
            'index': idx,
            'image': img,
            'captions': self.get_image_captions(idx)
        }
        return img

# Required files:
# dataset_path/annotations/captions_val2014.json
# dataset_path/annotations/captions_train2014.json
# dataset_path/train2014/[image files]
# dataset_path/val2014/[image files]
class COCOCaptionDataset(NamsBaseDataset):
    
    def __init__(self, dataset_path = '', transform = None, test_split = False):
        self.dataset_path = dataset_path
        self.transform = transform
        self.test_split = test_split
        
        import json
        if test_split:
            x = json.load(open(self.dataset_path + '/annotations/captions_val2014.json', 'rt'))
            img_path = dataset_path + '/val2014/'
        else:
            x = json.load(open(self.dataset_path + '/annotations/captions_train2014.json', 'rt'))
            img_path = dataset_path + '/train2014/'
            
        imgs = []
        id2id = {}
        for img in x['images']:
            id2id[img['id']] = len(imgs)
            imgs += [{
                'id': img['id'],
                'class': img['id'],
                'filename': img_path + img['file_name'],
                'captions': []
            }]

        for cap in x['annotations']:
            imgs[id2id[cap['image_id']]]['captions'] += [self.normalize_caption(cap['caption'])]

        self.imgs = imgs
    
    def __len__(self):
        return len(self.imgs)

    def name(self):
        if self.test_split:
            return "CocoCapTest"
        return "CocoCapTrain"
        
    def get_image_path(self, idx):
        return self.imgs[idx]['filename']
    
    def get_image_captions(self, idx):
        return self.imgs[idx]['captions']
    
    def __getitem__(self, idx):
        item = super(COCOCaptionDataset, self).__getitem__(idx)
        item['label'] = self.imgs[idx]['id']
        return item
        
class SimpleImageCaptions112(NamsBaseDataset):
    
    def __init__(self, dataset_path = '', transform = None):
        self.dataset_path = dataset_path
        self.transform = transform
        imgs = []
        import os
        import os.path
        for d in os.listdir(dataset_path):
          if not os.path.isfile(dataset_path + '/' + d):
            for f in os.listdir(dataset_path + '/' + d):
              if os.path.isfile(dataset_path + '/' + d + '/' + f):
                imgs += [{
                    'id': len(imgs),
                    'captions': [self.normalize_caption(d)],
                    'filename': dataset_path + '/' + d + '/' + f
                }]
        self.imgs = imgs
        self.make_test_queries()
    
    def __len__(self):
        return len(self.imgs)
    
    def name(self):
        return "SimpleImageCaptions112"
        
    def get_image_path(self, idx):
        return self.imgs[idx]['filename']
    
    def get_image_captions(self, idx):
        return self.imgs[idx]['captions']
    
    def __getitem__(self, idx):
        item = super(SimpleImageCaptions112, self).__getitem__(idx)
        item['label'] = self.imgs[idx]['captions'][0]
        return item
    
    def make_test_queries(self):
        
        novel_obj_list = ['trex', 'stormtrooper', 'darthvader', 'chewbacca']
        
        caption2ids = {}
        for i in range(len(self)):
            for caption in self.get_image_captions(i):
                try:
                    caption2ids[caption] += [i]
                except:
                    caption2ids[caption] = []
                    caption2ids[caption] += [i]
        
        test_queries = []
        for cap1 in caption2ids.keys():
            for cap2 in caption2ids.keys():
                cap1s = cap1.replace('on the ', '').replace('in the ', '').replace('living room', 'livingroom').split()
                cap2s = cap2.replace('on the ', '').replace('in the ', '').replace('living room', 'livingroom').split()
                diffs = []
                for w1, w2 in zip(cap1s, cap2s):
                    if w1 != w2:
                        w1 = w1.replace('livingroom', 'living room')
                        w2 = w2.replace('livingroom', 'living room')
                        diffs += [w1, w2]
                if len(diffs) != 2:
                    continue
                if diffs[0] in novel_obj_list or diffs[1] in novel_obj_list:
                    continue
                for idx in caption2ids[cap1]:
                    test_queries += [{
                        'source_idx': idx,
                        'source_caption': cap1,
                        'target_caption': cap2,
                        'replacing_words': diffs
                    }]
        self.test_queries_seen_objects = []
        self.test_queries_novel_objects = []
        for t in test_queries:
            novel_objects = False
            for w in novel_obj_list:
                if w in t['source_caption']:
                    novel_objects = True
            if novel_objects:
                self.test_queries_novel_objects += [t]
            else:
                self.test_queries_seen_objects += [t]
        assert(len(self.test_queries_seen_objects) == 18051)
        assert(len(self.test_queries_novel_objects) == 745)
        
        

    
    def make_test_queriesxxxx(self):
        
        novel_obj_list = ['trex', 'stormtrooper', 'darthvader', 'chewbacca']
        
        random.seed(333)
        test_queries = []
        while len(test_queries) < 500000:
            
            img1 = self.imgs[len(test_queries) % len(self.imgs)]
            cap1 = img1['captions'][0]
            img2 = random.choice(self.imgs)
            cap2 = img2['captions'][0]
            
            cap1 = cap1.replace('on the ', '').replace('in the ', '').replace('living room', 'livingroom')
            cap1 = cap1.split()
            cap2 = cap2.replace('on the ', '').replace('in the ', '').replace('living room', 'livingroom')
            cap2 = cap2.split()

            diffs = []
            for w1, w2 in zip(cap1, cap2):
                if w1 != w2:
                    w1 = w1.replace('livingroom', 'living room')
                    w2 = w2.replace('livingroom', 'living room')
                    diffs += [w1, w2]
            if len(diffs) != 2:
                continue
            if diffs[0] in novel_obj_list or diffs[1] in novel_obj_list:
                continue

            ix = img1['id']
            assert(self.imgs[ix] == img1)
            test_queries += [{
                'ix': ix,
                'source_idx': img1['id'],
                'replacing_words': diffs,
                'source_caption': img1['captions'][0],
                'target_caption': img2['captions'][0]
            }]

        # save
        query_check = {}
        self.test_queries = []
        self.novel_obj_test_queries = []
        for q in test_queries:
            k = str(q['source_idx']) + q['target_caption']
            if k in query_check:
                continue
            query_check[k] = True
            contain_novel_obj = False
            for w in novel_obj_list:
                if w in q['source_caption'] or w in q['target_caption']:
                    contain_novel_obj = True
            if contain_novel_obj:
                self.novel_obj_test_queries += [q]
            else:
                self.test_queries += [q]
        self.test_queries_novel_objects = self.novel_obj_test_queries[:700]
        self.test_queries_seen_objects = self.test_queries[:5000]

In [4]:
dataset_path = '/home/nam/exp/Fall2018/imagetextdatasets/coco'
trainset = COCOCaptionDataset(
    dataset_path,
    transform = torchvision.transforms.Compose([
        torchvision.transforms.RandomResizedCrop(336, scale=(0.8, 1.0), ratio=(0.75, 1.3)),
        #torchvision.transforms.Resize((336,336)),
        torchvision.transforms.RandomHorizontalFlip(),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    test_split = False
)

testset = COCOCaptionDataset(
    dataset_path,
    transform = torchvision.transforms.Compose([
        torchvision.transforms.RandomResizedCrop(336, scale=(0.8, 1.0), ratio=(0.75, 1.3)),
        #torchvision.transforms.Resize((336,336)),
        torchvision.transforms.RandomHorizontalFlip(),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    test_split = True
)

sic112 = SimpleImageCaptions112(
    '../imagetextdatasets/nams_googleimage_dataset/googleimagesdata/downloads12',
    transform = torchvision.transforms.Compose([
        torchvision.transforms.RandomResizedCrop(336, scale=(0.8, 1.0), ratio=(0.75, 1.3)),
        #torchvision.transforms.Resize((336,336)),
        torchvision.transforms.RandomHorizontalFlip(),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
)

In [5]:
print 'Read subjects verbs contexts from cocotrain2014.nams.pth and add to trainset'

x = torch.load('cocotrain2014.nams.pth')

id2id = {}
for i, img in enumerate(trainset.imgs):
    id2id[img['id']] = i
for img in x['images']:
    
    sim = img['subjects_sim']
    if sim is None:
        sim = np.eye(1)
    a = np.sum((sim - np.eye(sim.shape[0])) > 0.7, axis=0)
    a = np.where(a > 0)[0]
    subjects = [img['subjects'][i] for i in a]
    
    sim = img['verbs_sim']
    if sim is None:
        sim = np.eye(1)
    a = np.sum((sim - np.eye(sim.shape[0])) > 0.7, axis=0)
    a = np.where(a > 0)[0]
    verbs = [img['verbs'][i] for i in a]
    
    sim = img['contexts_sim']
    if sim is None:
        sim = np.eye(1)
    a = np.sum((sim - np.eye(sim.shape[0])) > 0.7, axis=0)
    a = np.where(a > 0)[0]
    contexts = [img['contexts'][i] for i in a]
    
    trainset.imgs[id2id[img['id']]]['subjects'] = subjects
    trainset.imgs[id2id[img['id']]]['verbs'] = verbs
    trainset.imgs[id2id[img['id']]]['contexts'] = contexts

Read subjects verbs contexts from cocotrain2014.nams.pth and add to trainset


In [6]:
for img in sic112.imgs:
    words = img['captions'][0].split()
    img['subjects'] = [words[0]]
    if not words[0] in ['trex', 'stormtrooper', 'darthvader', 'chewbacca']:
        img['verbs'] = [words[1]]
        img['contexts'] = [' '.join(words[2:])]
    else:
        img['verbs'] = []
        img['contexts'] = [' '.join(words[1:])]
    

In [7]:
trainset.precompute_img_features(0)
testset.precompute_img_features(0)
sic112.precompute_img_features(0)


sucessfully loaded features
sucessfully loaded features
sucessfully loaded features


In [8]:
# model
model = torch.nn.Module()

# image
model.img_encoder = torchvision.models.resnet50(pretrained=True)
model.img_encoder.fc = torch.nn.Sequential(
    torch.nn.Dropout(0.2),
    torch.nn.Linear(2048, 2048),
    torch.nn.BatchNorm1d(2048),
    torch.nn.Dropout(0.2),
    torch.nn.ReLU(),
    torch.nn.Linear(2048, opt.embed_dim)
)

# text
model.text_encoder = text_model.TextLSTMModel(
    texts_to_build_vocab = trainset.get_all_texts(),
    word_embed_dim = 256,
    lstm_hidden_dim = 512
)
model.text_encoder.fc_output = torch.nn.Sequential(
    torch.nn.Dropout(0.1),
    torch.nn.Linear(opt.embed_dim, 2048),
    torch.nn.BatchNorm1d(2048),
    torch.nn.Dropout(0.1),
    torch.nn.ReLU(),
    torch.nn.Linear(2048, opt.embed_dim)
)

# transform function
class One2OneTransformation(torch.nn.Module):
    def __init__(self):
        super(One2OneTransformation, self).__init__()
        embed_dim = opt.embed_dim
        self.m = torch.nn.Sequential(
            torch.nn.Linear(embed_dim * 1, embed_dim * 2),
            torch.nn.ReLU(),
            torch.nn.Linear(embed_dim * 2, embed_dim * 2),
            torch.nn.BatchNorm1d(embed_dim * 2),
            torch.nn.ReLU(),
            torch.nn.Linear(embed_dim * 2, embed_dim)
        )
        self.norm = torch_functions.NormalizationLayer(learn_scale=False)

    def forward(self, x):
        f = self.norm(x)
        f = self.m(f)
        return f
    
class Three2OneTransformation(torch.nn.Module):
    def __init__(self):
        super(Three2OneTransformation, self).__init__()
        embed_dim = opt.embed_dim
        self.m = torch.nn.Sequential(
            torch.nn.Linear(embed_dim * 3, embed_dim * 5),
            torch.nn.ReLU(),
            torch.nn.Linear(embed_dim * 5, embed_dim * 5),
            torch.nn.BatchNorm1d(embed_dim * 5),
            torch.nn.ReLU(),
            torch.nn.Linear(embed_dim * 5, embed_dim)
        )
        self.norm = torch_functions.NormalizationLayer(learn_scale=False)

    def forward(self, x):
        f = torch.cat([self.norm(i) for i in x], dim=1)
        f = self.m(f)
        return f

model.subject_extractor = One2OneTransformation()
model.verb_extractor = One2OneTransformation()
model.context_extractor = One2OneTransformation()
model.svc_combine = Three2OneTransformation()

model.snorm = torch_functions.NormalizationLayer(normalize_scale=4.0, learn_scale=True)
model = model.cuda()

# loss function
def pair_loss(a, b):
    # force a,b similar in the embedding space
    a = model.snorm(a)
    b = model.snorm(b).transpose(0,1)
    x = torch.mm(a, b)
    if random.random() > 0.5:
        x = x.transpose(0, 1)
    labels = torch.tensor(range(x.shape[0])).long()
    return torch.nn.functional.cross_entropy(x, labels.cuda())


In [9]:
# create optimizer
params = []
params.append({'params': [p for p in model.img_encoder.fc.parameters()]})
params.append({'params': [p for p in model.img_encoder.parameters()], 'lr': 0.1 * opt.learning_rate})
params.append({'params': [p for p in model.text_encoder.parameters()], 'lr': opt.learning_rate})
params.append({'params': [p for p in model.parameters()]})

# remove dup params (keep the first one)
for i1, p1 in enumerate(params):
  for i2, p2 in enumerate(params):
    if p1 is not p2:
      for p11 in p1['params']:
        for j, p22 in enumerate(p2['params']):
          if p11 is p22:
            p2['params'][j] = torch.tensor(0.0, requires_grad=True)
        
optimizer = torch.optim.SGD(
    params,
    lr=opt.learning_rate,
    momentum=opt.momentum,
    weight_decay=opt.weight_decay
)
if opt.optim == 'adam':
    optimizer = torch.optim.Adam(
        params,
        lr=opt.learning_rate,
        weight_decay=opt.weight_decay
    )


In [10]:
def test(testset):
    r = test_text_to_image_retrieval(testset)
    if testset == sic112:
        r += test_svc(testset)
    return r

def test_text_to_image_retrieval(testset):
    model.eval()
    img_features = []
    img_labels = []
    text_features = []
    text_labels = []
    for data in testset.get_loader(batch_size = opt.batch_size, shuffle = True, drop_last= False):
        # extract image features
        imgs = np.stack([d['image'] for d in data])
        imgs = torch.from_numpy(imgs).float()
        if len(imgs.shape) == 2:
            imgs = model.img_encoder.fc(imgs.cuda())
        else:
            imgs = model.img_encoder(imgs.cuda())
        imgs = model.snorm(imgs).cpu().detach().numpy()
        img_features += [imgs]
        img_labels += [d['label'] for d in data]

        # text
        texts = []
        for d in data:
            texts += d['captions']
            text_labels += [d['label'] for c in d['captions']]
        texts = model.text_encoder(texts)
        texts = model.snorm(texts).cpu().detach().numpy()
        text_features += [texts]

        if len(img_labels) > 1100:
            break

    img_features = np.concatenate(img_features, axis=0)
    text_features = np.concatenate(text_features, axis=0)

    
    # text to image
    sims = text_features.dot(img_features.T)
    r1 = 0.0
    for i in range(sims.shape[0]):
        s = -sims[i,:]
        s = np.argsort(s)
        if text_labels[i] == img_labels[s[0]]:
            r1 += 1
    r1 /= sims.shape[0]
    return [('text2image_recall_top1', r1)]


def test_svc(sic112):
    model.eval()

    # all img features
    img_features = []
    for data in sic112.get_loader(batch_size = opt.batch_size, shuffle = False, drop_last= False):
        # extract image features
        imgs = np.stack([d['image'] for d in data])
        imgs = torch.from_numpy(imgs).float()
        if len(imgs.shape) == 2:
            imgs = model.img_encoder.fc(imgs.cuda())
        else:
            imgs = model.img_encoder(imgs.cuda())
        imgs = model.snorm(imgs).cpu().detach().numpy()
        img_features += [imgs]

    img_features = np.concatenate(img_features, axis=0)
    img_labels = [img['captions'][0] for img in sic112.imgs]
    
    # construct random queries
    queries = []
    np.random.seed(123)
    while len(queries) < 10000:
        i = np.random.randint(0, len(sic112.imgs))
        j = np.random.randint(0, len(sic112.imgs))
        k = np.random.randint(0, len(sic112.imgs))
        if len(sic112.imgs[i]['verbs'] + sic112.imgs[j]['verbs'] + sic112.imgs[k]['verbs']) < 3:
            continue
        queries += [{
            'subject_img_id': i,
            'verb_img_id': j,
            'context_img_id': k,
            'subject': sic112.imgs[i]['subjects'][0],
            'verb': sic112.imgs[j]['verbs'][0],
            'context': sic112.imgs[k]['contexts'][0],
            'label': sic112.imgs[i]['subjects'][0] + ' ' + sic112.imgs[j]['verbs'][0] + ' ' + sic112.imgs[k]['contexts'][0]
        }]
        
    #----
    #----
    r = []
    for s in ['t', 'i']:
      for v in ['t', 'i']:
        for c in ['t', 'i']:
            # compute query features
            query_features = []
            query_labels = []
            for i in range(0, len(queries), opt.batch_size):
                if s == 'i':
                    subjects = model.subject_extractor(torch.from_numpy(
                        img_features[[q['subject_img_id'] for q in queries[i:(i+opt.batch_size)]],:]
                    ).cuda())
                else:
                    subjects = model.text_encoder([q['subject'] for q in queries[i:(i+opt.batch_size)]])
                if v == 'i':
                    verbs = model.subject_extractor(torch.from_numpy(
                        img_features[[q['verb_img_id'] for q in queries[i:(i+opt.batch_size)]],:]
                    ).cuda())
                else:
                    verbs = model.text_encoder([q['verb'] for q in queries[i:(i+opt.batch_size)]])
                if c == 'i':
                    contexts = model.subject_extractor(torch.from_numpy(
                        img_features[[q['context_img_id'] for q in queries[i:(i+opt.batch_size)]],:]
                    ).cuda())
                else:
                    contexts = model.text_encoder([q['context'] for q in queries[i:(i+opt.batch_size)]])
                svc = model.svc_combine([subjects, verbs, contexts])
                svc = svc.cpu().detach().numpy()
                query_features += [svc]
                query_labels += [q['label'] for q in queries[i:(i+opt.batch_size)]]

            query_features = np.concatenate(query_features, axis=0)

            # compute recall
            def measure_retrieval_performance(query_features, name = 'X'):
                sims = query_features.dot(img_features.T)
                sims = sims
                for k in [1, 5, 10]:
                    r1 = 0.0
                    for i in range(sims.shape[0]):
                        s = -sims[i,:]
                        s = np.argsort(s)
                        if query_labels[i] in [img_labels[s[j]] for j in range(k)]:
                        #if query_labels[i] == img_labels[s[0]]:
                            r1 += 1
                    r1 /= sims.shape[0]
                    r.append(('svc_' + name + '_recall_top' + str(k), r1))
                return r
            measure_retrieval_performance(query_features, name = s + v + c)
    return r

In [11]:
losses_tracking = {}
it = 0
epoch = 0
tic = time.time()

In [12]:
while True:

    ##########################################
    print 'It', it, 'epoch', epoch, 'Elapsed time', round(time.time() - tic, 4)
    tic = time.time()
    epoch += 1
    for loss_name in losses_tracking:
        avg_loss = np.mean(losses_tracking[loss_name][-999:])
        print '   ', loss_name, round(avg_loss, 4)
        logger.add_scalar(loss_name, avg_loss, it)
        
    if True:
        tests = []
        for dataset in [trainset, testset, sic112]:
            t = test(dataset)
            tests += [(dataset.name() + ' ' + metric_name, metric_value) for metric_name, metric_value in t]

        for metric_name, metric_value in tests:
            print ' ', metric_name, round(metric_value, 4)
            logger.add_scalar(metric_name, metric_value, epoch)
    if epoch >= opt.num_epochs:
        break

    ##########################################
    model.train()
    loader = trainset.get_loader(
        batch_size=opt.batch_size, shuffle=True,
        drop_last=True, num_workers=opt.loader_num_workers)
    for data in tqdm(loader):
        it += 1
        losses = []

        # learing rate scheduling
        if it >= opt.learning_rate_decay_frequency and it % opt.learning_rate_decay_frequency == 0:
            for g in optimizer.param_groups:
                g['lr'] *= 0.1

        # joint embedding
        imgs = np.stack([d['image'] for d in data])
        imgs = torch.from_numpy(imgs).float()
        if len(imgs.shape) == 2:
            imgs = model.img_encoder.fc(imgs.cuda())
        else:
            imgs = model.img_encoder(imgs.cuda())
        texts = [random.choice(d['captions']) for d in data]
        texts = model.text_encoder(texts)
        loss_name = 'joint_embedding'
        loss_weight = 1.0
        loss_value = pair_loss(texts, imgs).cuda()
        losses += [(loss_name, loss_weight, loss_value)]
        
        
        if epoch >= 2:
            
            def extractor_loss(from_image = False):
                subjects = [trainset.imgs[d['index']]['subjects'] for d in data]
                verbs = [trainset.imgs[d['index']]['verbs'] for d in data]
                contexts = [trainset.imgs[d['index']]['contexts'] for d in data]

                if from_image:
                    a = torch.cat([imgs[i:(i+1),:] for i in range(texts.shape[0]) if len(subjects[i]) > 0])
                    b = torch.cat([imgs[i:(i+1),:] for i in range(texts.shape[0]) if len(verbs[i]) > 0])
                    c = torch.cat([imgs[i:(i+1),:] for i in range(texts.shape[0]) if len(contexts[i]) > 0])
                else:
                    a = torch.cat([texts[i:(i+1),:] for i in range(texts.shape[0]) if len(subjects[i]) > 0])
                    b = torch.cat([texts[i:(i+1),:] for i in range(texts.shape[0]) if len(verbs[i]) > 0])
                    c = torch.cat([texts[i:(i+1),:] for i in range(texts.shape[0]) if len(contexts[i]) > 0])

                extracted_subjects = model.subject_extractor(a)
                extracted_verbs = model.verb_extractor(b)
                extracted_contexts = model.context_extractor(c)

                subjects = [np.random.choice(i) for i in subjects if len(i) > 0]
                verbs = [np.random.choice(i) for i in verbs if len(i) > 0]
                contexts = [np.random.choice(i) for i in contexts if len(i) > 0]

                encoded_subjects = model.text_encoder([str(i) for i in subjects])
                encoded_verbs = model.text_encoder([str(i) for i in verbs])
                encoded_contexts = model.text_encoder([str(i) for i in contexts])

                return pair_loss(
                    torch.cat([extracted_subjects, extracted_verbs, extracted_contexts]),
                    torch.cat([encoded_subjects, encoded_verbs, encoded_contexts])
                )
            
            def combine_loss(img_target = True):
                subjects = [trainset.imgs[d['index']]['subjects'] for d in data]
                verbs = [trainset.imgs[d['index']]['verbs'] for d in data]
                contexts = [trainset.imgs[d['index']]['contexts'] for d in data]
                iii = [i for i, d in enumerate(data) if len(trainset.imgs[d['index']]['verbs']) > 0 and 
                                                       len(trainset.imgs[d['index']]['contexts']) > 0 and 
                                                       len(trainset.imgs[d['index']]['subjects']) > 0 ]

                subjects = [subjects[i] for i in iii]
                verbs = [verbs[i] for i in iii]
                contexts = [contexts[i] for i in iii]
                
                subjects = [np.random.choice(i) for i in subjects]
                verbs = [np.random.choice(i) for i in verbs]
                contexts = [np.random.choice(i) for i in contexts]
                if img_target:
                    svc_img = imgs[iii,:]
                else:
                    svc_img =  model.text_encoder([' '.join([s, v, c]) for s,v,c in zip(subjects, verbs, contexts)])
                
                subjects = model.text_encoder([str(i) for i in subjects])
                verbs = model.text_encoder([str(i) for i in verbs])
                contexts = model.text_encoder([str(i) for i in contexts])
                
                svc_combine = model.svc_combine([subjects, verbs, contexts])
                return pair_loss(svc_img, svc_combine)
            
            def extract_combine_loss(reverse = False):
                subjects = [trainset.imgs[d['index']]['subjects'] for d in data]
                verbs = [trainset.imgs[d['index']]['verbs'] for d in data]
                contexts = [trainset.imgs[d['index']]['contexts'] for d in data]

                a = torch.cat([imgs[i:(i+1),:] for i in range(texts.shape[0]) if len(subjects[i]) > 0])
                b = torch.cat([imgs[i:(i+1),:] for i in range(texts.shape[0]) if len(verbs[i]) > 0])
                c = torch.cat([imgs[i:(i+1),:] for i in range(texts.shape[0]) if len(contexts[i]) > 0])

                extracted_subjects = model.subject_extractor(a)
                extracted_verbs = model.verb_extractor(b)
                extracted_contexts = model.context_extractor(c)

                subjects = [np.random.choice(i) for i in subjects if len(i) > 0]
                verbs = [np.random.choice(i) for i in verbs if len(i) > 0]
                contexts = [np.random.choice(i) for i in contexts if len(i) > 0]
                
                n = min(len(subjects), len(verbs), len(contexts))
                iii = np.random.permutation(n)
                extracted_subjects = extracted_subjects[iii,:]
                extracted_verbs = extracted_verbs[iii,:]
                extracted_contexts = extracted_contexts[iii,:]
                subjects = [subjects[i] for i in iii]
                verbs = [verbs[i] for i in iii]
                contexts = [contexts[i] for i in iii]
                
                if reverse:
                    svc =  model.text_encoder([' '.join([s, v, c]) for s,v,c in zip(subjects, verbs, contexts)])
                    
                    return pair_loss(
                        torch.cat([extracted_subjects, extracted_verbs, extracted_contexts]),
                        torch.cat([
                            model.subject_extractor(svc),
                            model.verb_extractor(svc),
                            model.context_extractor(svc)
                        ])
                    )
                
                else:
                    svc =  model.text_encoder([' '.join([s, v, c]) for s,v,c in zip(subjects, verbs, contexts)])
                    combine_svc = model.svc_combine([extracted_subjects, extracted_verbs, extracted_contexts])
                    return pair_loss(svc, combine_svc)
            
            
            # extract part, text only
            if 1:
                    loss_value = extractor_loss(from_image = True) # / 2 + extractor_loss(from_image = False) / 2
                    losses += [('extractor loss 1', 1.0, loss_value)]
                    
            # combine2
            if 1:
                loss_value = combine_loss(img_target = True)  / 2 + combine_loss(img_target = False) / 2 
                losses += [('svc combine loss 2', 1.0, loss_value)]
                
                
            #loss_value = extract_combine_loss(reverse = True)
            #losses += [('extractor combine loss', 0.5, loss_value)]
            
        # total loss
        total_loss = sum([loss_weight * loss_value for loss_name, loss_weight, loss_value in losses])
        assert(not torch.isnan(total_loss))
        losses += [('total training loss', None, total_loss)]
        # save losses
        for loss_name, loss_weight, loss_value in losses:
            if not losses_tracking.has_key(loss_name):
                losses_tracking[loss_name] = []
            losses_tracking[loss_name].append(loss_value.data.item())
        # backward & step
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

It 0 epoch 0 Elapsed time 0.3425


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.0007
  CocoCapTest text2image_recall_top1 0.0005
  SimpleImageCaptions112 text2image_recall_top1 0.0116
  SimpleImageCaptions112 svc_ttt_recall_top1 0.0237
  SimpleImageCaptions112 svc_ttt_recall_top5 0.0489
  SimpleImageCaptions112 svc_ttt_recall_top10 0.0699
  SimpleImageCaptions112 svc_tti_recall_top1 0.012
  SimpleImageCaptions112 svc_tti_recall_top5 0.0489
  SimpleImageCaptions112 svc_tti_recall_top10 0.1045
  SimpleImageCaptions112 svc_tit_recall_top1 0.012
  SimpleImageCaptions112 svc_tit_recall_top5 0.0699
  SimpleImageCaptions112 svc_tit_recall_top10 0.0699
  SimpleImageCaptions112 svc_tii_recall_top1 0.012
  SimpleImageCaptions112 svc_tii_recall_top5 0.0489
  SimpleImageCaptions112 svc_tii_recall_top10 0.0844
  SimpleImageCaptions112 svc_itt_recall_top1 0.0117
  SimpleImageCaptions112 svc_itt_recall_top5 0.0489
  SimpleImageCaptions112 svc_itt_recall_top10 0.0699
  SimpleImageCaptions112 svc_iti_recall_top1 0.0117
  SimpleImageCaptions1

100%|██████████| 2586/2586 [00:42<00:00, 60.53it/s]


It 2586 epoch 1 Elapsed time 76.154
    total training loss 0.8041
    joint_embedding 0.8041


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.1405
  CocoCapTest text2image_recall_top1 0.1242
  SimpleImageCaptions112 text2image_recall_top1 0.2018
  SimpleImageCaptions112 svc_ttt_recall_top1 0.0054
  SimpleImageCaptions112 svc_ttt_recall_top5 0.024
  SimpleImageCaptions112 svc_ttt_recall_top10 0.0573
  SimpleImageCaptions112 svc_tti_recall_top1 0.0054
  SimpleImageCaptions112 svc_tti_recall_top5 0.024
  SimpleImageCaptions112 svc_tti_recall_top10 0.0562
  SimpleImageCaptions112 svc_tit_recall_top1 0.0054
  SimpleImageCaptions112 svc_tit_recall_top5 0.024
  SimpleImageCaptions112 svc_tit_recall_top10 0.0283
  SimpleImageCaptions112 svc_tii_recall_top1 0.0054
  SimpleImageCaptions112 svc_tii_recall_top5 0.024
  SimpleImageCaptions112 svc_tii_recall_top10 0.0353
  SimpleImageCaptions112 svc_itt_recall_top1 0.0054
  SimpleImageCaptions112 svc_itt_recall_top5 0.024
  SimpleImageCaptions112 svc_itt_recall_top10 0.0446
  SimpleImageCaptions112 svc_iti_recall_top1 0.0054
  SimpleImageCaptions112

100%|██████████| 2586/2586 [04:09<00:00, 10.37it/s]


It 5172 epoch 2 Elapsed time 281.8792
    svc combine loss 2 0.2849
    total training loss 2.4246
    extractor loss 1 1.5718
    joint_embedding 0.5679


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.1824
  CocoCapTest text2image_recall_top1 0.1814
  SimpleImageCaptions112 text2image_recall_top1 0.2304
  SimpleImageCaptions112 svc_ttt_recall_top1 0.2437
  SimpleImageCaptions112 svc_ttt_recall_top5 0.6488
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8331
  SimpleImageCaptions112 svc_tti_recall_top1 0.0719
  SimpleImageCaptions112 svc_tti_recall_top5 0.2453
  SimpleImageCaptions112 svc_tti_recall_top10 0.3645
  SimpleImageCaptions112 svc_tit_recall_top1 0.1415
  SimpleImageCaptions112 svc_tit_recall_top5 0.4243
  SimpleImageCaptions112 svc_tit_recall_top10 0.5618
  SimpleImageCaptions112 svc_tii_recall_top1 0.0469
  SimpleImageCaptions112 svc_tii_recall_top5 0.194
  SimpleImageCaptions112 svc_tii_recall_top10 0.3131
  SimpleImageCaptions112 svc_itt_recall_top1 0.1915
  SimpleImageCaptions112 svc_itt_recall_top5 0.5286
  SimpleImageCaptions112 svc_itt_recall_top10 0.672
  SimpleImageCaptions112 svc_iti_recall_top1 0.0505
  SimpleImageCaptions

100%|██████████| 2586/2586 [04:07<00:00, 10.46it/s]


It 7758 epoch 3 Elapsed time 280.1595
    svc combine loss 2 0.242
    total training loss 2.1435
    extractor loss 1 1.4162
    joint_embedding 0.4853


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2152
  CocoCapTest text2image_recall_top1 0.1943
  SimpleImageCaptions112 text2image_recall_top1 0.233
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3178
  SimpleImageCaptions112 svc_ttt_recall_top5 0.6074
  SimpleImageCaptions112 svc_ttt_recall_top10 0.7941
  SimpleImageCaptions112 svc_tti_recall_top1 0.1306
  SimpleImageCaptions112 svc_tti_recall_top5 0.3772
  SimpleImageCaptions112 svc_tti_recall_top10 0.5102
  SimpleImageCaptions112 svc_tit_recall_top1 0.1414
  SimpleImageCaptions112 svc_tit_recall_top5 0.4295
  SimpleImageCaptions112 svc_tit_recall_top10 0.576
  SimpleImageCaptions112 svc_tii_recall_top1 0.0545
  SimpleImageCaptions112 svc_tii_recall_top5 0.2131
  SimpleImageCaptions112 svc_tii_recall_top10 0.3285
  SimpleImageCaptions112 svc_itt_recall_top1 0.2102
  SimpleImageCaptions112 svc_itt_recall_top5 0.5322
  SimpleImageCaptions112 svc_itt_recall_top10 0.6958
  SimpleImageCaptions112 svc_iti_recall_top1 0.0999
  SimpleImageCaptions

100%|██████████| 2586/2586 [04:08<00:00, 10.54it/s]


It 10344 epoch 4 Elapsed time 281.2953
    svc combine loss 2 0.2151
    total training loss 1.9834
    extractor loss 1 1.3386
    joint_embedding 0.4297


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2423
  CocoCapTest text2image_recall_top1 0.2058
  SimpleImageCaptions112 text2image_recall_top1 0.3009
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3158
  SimpleImageCaptions112 svc_ttt_recall_top5 0.6283
  SimpleImageCaptions112 svc_ttt_recall_top10 0.7778
  SimpleImageCaptions112 svc_tti_recall_top1 0.1214
  SimpleImageCaptions112 svc_tti_recall_top5 0.3497
  SimpleImageCaptions112 svc_tti_recall_top10 0.4625
  SimpleImageCaptions112 svc_tit_recall_top1 0.1489
  SimpleImageCaptions112 svc_tit_recall_top5 0.4064
  SimpleImageCaptions112 svc_tit_recall_top10 0.55
  SimpleImageCaptions112 svc_tii_recall_top1 0.0754
  SimpleImageCaptions112 svc_tii_recall_top5 0.2454
  SimpleImageCaptions112 svc_tii_recall_top10 0.3631
  SimpleImageCaptions112 svc_itt_recall_top1 0.2031
  SimpleImageCaptions112 svc_itt_recall_top5 0.5051
  SimpleImageCaptions112 svc_itt_recall_top10 0.6659
  SimpleImageCaptions112 svc_iti_recall_top1 0.0844
  SimpleImageCaptions

100%|██████████| 2586/2586 [04:04<00:00, 10.56it/s]


It 12930 epoch 5 Elapsed time 277.3875
    svc combine loss 2 0.2054
    total training loss 1.9108
    extractor loss 1 1.303
    joint_embedding 0.4024


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.252
  CocoCapTest text2image_recall_top1 0.2338
  SimpleImageCaptions112 text2image_recall_top1 0.2875
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3553
  SimpleImageCaptions112 svc_ttt_recall_top5 0.6893
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8613
  SimpleImageCaptions112 svc_tti_recall_top1 0.1456
  SimpleImageCaptions112 svc_tti_recall_top5 0.4143
  SimpleImageCaptions112 svc_tti_recall_top10 0.5767
  SimpleImageCaptions112 svc_tit_recall_top1 0.1622
  SimpleImageCaptions112 svc_tit_recall_top5 0.4178
  SimpleImageCaptions112 svc_tit_recall_top10 0.5539
  SimpleImageCaptions112 svc_tii_recall_top1 0.086
  SimpleImageCaptions112 svc_tii_recall_top5 0.2647
  SimpleImageCaptions112 svc_tii_recall_top10 0.4047
  SimpleImageCaptions112 svc_itt_recall_top1 0.2201
  SimpleImageCaptions112 svc_itt_recall_top5 0.577
  SimpleImageCaptions112 svc_itt_recall_top10 0.7484
  SimpleImageCaptions112 svc_iti_recall_top1 0.1193
  SimpleImageCaptions1

100%|██████████| 2586/2586 [04:05<00:00,  9.79it/s]


It 15516 epoch 6 Elapsed time 279.3127
    svc combine loss 2 0.1844
    total training loss 1.8293
    extractor loss 1 1.263
    joint_embedding 0.3819


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2635
  CocoCapTest text2image_recall_top1 0.2347
  SimpleImageCaptions112 text2image_recall_top1 0.267
  SimpleImageCaptions112 svc_ttt_recall_top1 0.2072
  SimpleImageCaptions112 svc_ttt_recall_top5 0.6356
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8216
  SimpleImageCaptions112 svc_tti_recall_top1 0.1404
  SimpleImageCaptions112 svc_tti_recall_top5 0.3987
  SimpleImageCaptions112 svc_tti_recall_top10 0.5496
  SimpleImageCaptions112 svc_tit_recall_top1 0.1594
  SimpleImageCaptions112 svc_tit_recall_top5 0.401
  SimpleImageCaptions112 svc_tit_recall_top10 0.534
  SimpleImageCaptions112 svc_tii_recall_top1 0.0863
  SimpleImageCaptions112 svc_tii_recall_top5 0.287
  SimpleImageCaptions112 svc_tii_recall_top10 0.412
  SimpleImageCaptions112 svc_itt_recall_top1 0.1922
  SimpleImageCaptions112 svc_itt_recall_top5 0.5332
  SimpleImageCaptions112 svc_itt_recall_top10 0.689
  SimpleImageCaptions112 svc_iti_recall_top1 0.1069
  SimpleImageCaptions112 

100%|██████████| 2586/2586 [04:04<00:00, 10.60it/s]


It 18102 epoch 7 Elapsed time 277.2155
    svc combine loss 2 0.1763
    total training loss 1.7854
    extractor loss 1 1.2357
    joint_embedding 0.3734


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2839
  CocoCapTest text2image_recall_top1 0.2416
  SimpleImageCaptions112 text2image_recall_top1 0.3116
  SimpleImageCaptions112 svc_ttt_recall_top1 0.2683
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7245
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8419
  SimpleImageCaptions112 svc_tti_recall_top1 0.1546
  SimpleImageCaptions112 svc_tti_recall_top5 0.3951
  SimpleImageCaptions112 svc_tti_recall_top10 0.5315
  SimpleImageCaptions112 svc_tit_recall_top1 0.139
  SimpleImageCaptions112 svc_tit_recall_top5 0.4006
  SimpleImageCaptions112 svc_tit_recall_top10 0.5484
  SimpleImageCaptions112 svc_tii_recall_top1 0.0697
  SimpleImageCaptions112 svc_tii_recall_top5 0.2352
  SimpleImageCaptions112 svc_tii_recall_top10 0.3658
  SimpleImageCaptions112 svc_itt_recall_top1 0.2366
  SimpleImageCaptions112 svc_itt_recall_top5 0.6305
  SimpleImageCaptions112 svc_itt_recall_top10 0.7914
  SimpleImageCaptions112 svc_iti_recall_top1 0.1256
  SimpleImageCaption

100%|██████████| 2586/2586 [04:08<00:00, 10.41it/s]


It 20688 epoch 8 Elapsed time 281.4295
    svc combine loss 2 0.1749
    total training loss 1.753
    extractor loss 1 1.2156
    joint_embedding 0.3625


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2808
  CocoCapTest text2image_recall_top1 0.2549
  SimpleImageCaptions112 text2image_recall_top1 0.2518
  SimpleImageCaptions112 svc_ttt_recall_top1 0.2599
  SimpleImageCaptions112 svc_ttt_recall_top5 0.602
  SimpleImageCaptions112 svc_ttt_recall_top10 0.766
  SimpleImageCaptions112 svc_tti_recall_top1 0.1262
  SimpleImageCaptions112 svc_tti_recall_top5 0.3878
  SimpleImageCaptions112 svc_tti_recall_top10 0.5449
  SimpleImageCaptions112 svc_tit_recall_top1 0.1417
  SimpleImageCaptions112 svc_tit_recall_top5 0.401
  SimpleImageCaptions112 svc_tit_recall_top10 0.5367
  SimpleImageCaptions112 svc_tii_recall_top1 0.0727
  SimpleImageCaptions112 svc_tii_recall_top5 0.2461
  SimpleImageCaptions112 svc_tii_recall_top10 0.3735
  SimpleImageCaptions112 svc_itt_recall_top1 0.2289
  SimpleImageCaptions112 svc_itt_recall_top5 0.6046
  SimpleImageCaptions112 svc_itt_recall_top10 0.7731
  SimpleImageCaptions112 svc_iti_recall_top1 0.1152
  SimpleImageCaptions1

100%|██████████| 2586/2586 [04:03<00:00, 10.62it/s]


It 23274 epoch 9 Elapsed time 276.5551
    svc combine loss 2 0.1667
    total training loss 1.71
    extractor loss 1 1.1935
    joint_embedding 0.3498


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3128
  CocoCapTest text2image_recall_top1 0.2803
  SimpleImageCaptions112 text2image_recall_top1 0.3464
  SimpleImageCaptions112 svc_ttt_recall_top1 0.357
  SimpleImageCaptions112 svc_ttt_recall_top5 0.6849
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8267
  SimpleImageCaptions112 svc_tti_recall_top1 0.133
  SimpleImageCaptions112 svc_tti_recall_top5 0.4072
  SimpleImageCaptions112 svc_tti_recall_top10 0.5581
  SimpleImageCaptions112 svc_tit_recall_top1 0.1621
  SimpleImageCaptions112 svc_tit_recall_top5 0.4392
  SimpleImageCaptions112 svc_tit_recall_top10 0.5837
  SimpleImageCaptions112 svc_tii_recall_top1 0.096
  SimpleImageCaptions112 svc_tii_recall_top5 0.3001
  SimpleImageCaptions112 svc_tii_recall_top10 0.4263
  SimpleImageCaptions112 svc_itt_recall_top1 0.2396
  SimpleImageCaptions112 svc_itt_recall_top5 0.5967
  SimpleImageCaptions112 svc_itt_recall_top10 0.7661
  SimpleImageCaptions112 svc_iti_recall_top1 0.1103
  SimpleImageCaptions1

100%|██████████| 2586/2586 [04:07<00:00, 10.85it/s]


It 25860 epoch 10 Elapsed time 280.4806
    svc combine loss 2 0.158
    total training loss 1.6774
    extractor loss 1 1.1779
    joint_embedding 0.3415


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3152
  CocoCapTest text2image_recall_top1 0.2691
  SimpleImageCaptions112 text2image_recall_top1 0.2625
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3374
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7151
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8851
  SimpleImageCaptions112 svc_tti_recall_top1 0.1251
  SimpleImageCaptions112 svc_tti_recall_top5 0.413
  SimpleImageCaptions112 svc_tti_recall_top10 0.5629
  SimpleImageCaptions112 svc_tit_recall_top1 0.1674
  SimpleImageCaptions112 svc_tit_recall_top5 0.51
  SimpleImageCaptions112 svc_tit_recall_top10 0.665
  SimpleImageCaptions112 svc_tii_recall_top1 0.0779
  SimpleImageCaptions112 svc_tii_recall_top5 0.2774
  SimpleImageCaptions112 svc_tii_recall_top10 0.4201
  SimpleImageCaptions112 svc_itt_recall_top1 0.2289
  SimpleImageCaptions112 svc_itt_recall_top5 0.6385
  SimpleImageCaptions112 svc_itt_recall_top10 0.8042
  SimpleImageCaptions112 svc_iti_recall_top1 0.1002
  SimpleImageCaptions11

100%|██████████| 2586/2586 [04:07<00:00, 10.47it/s]


It 28446 epoch 11 Elapsed time 280.0226
    svc combine loss 2 0.1559
    total training loss 1.6636
    extractor loss 1 1.1756
    joint_embedding 0.3321


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3061
  CocoCapTest text2image_recall_top1 0.2696
  SimpleImageCaptions112 text2image_recall_top1 0.3027
  SimpleImageCaptions112 svc_ttt_recall_top1 0.2412
  SimpleImageCaptions112 svc_ttt_recall_top5 0.6227
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8285
  SimpleImageCaptions112 svc_tti_recall_top1 0.1425
  SimpleImageCaptions112 svc_tti_recall_top5 0.4281
  SimpleImageCaptions112 svc_tti_recall_top10 0.5843
  SimpleImageCaptions112 svc_tit_recall_top1 0.1501
  SimpleImageCaptions112 svc_tit_recall_top5 0.3736
  SimpleImageCaptions112 svc_tit_recall_top10 0.5073
  SimpleImageCaptions112 svc_tii_recall_top1 0.0782
  SimpleImageCaptions112 svc_tii_recall_top5 0.2547
  SimpleImageCaptions112 svc_tii_recall_top10 0.3773
  SimpleImageCaptions112 svc_itt_recall_top1 0.2231
  SimpleImageCaptions112 svc_itt_recall_top5 0.553
  SimpleImageCaptions112 svc_itt_recall_top10 0.7209
  SimpleImageCaptions112 svc_iti_recall_top1 0.108
  SimpleImageCaptions

100%|██████████| 2586/2586 [04:06<00:00, 10.47it/s]


It 31032 epoch 12 Elapsed time 279.9527
    svc combine loss 2 0.1484
    total training loss 1.6206
    extractor loss 1 1.1538
    joint_embedding 0.3183


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.304
  CocoCapTest text2image_recall_top1 0.2768
  SimpleImageCaptions112 text2image_recall_top1 0.3295
  SimpleImageCaptions112 svc_ttt_recall_top1 0.2053
  SimpleImageCaptions112 svc_ttt_recall_top5 0.5958
  SimpleImageCaptions112 svc_ttt_recall_top10 0.7877
  SimpleImageCaptions112 svc_tti_recall_top1 0.1386
  SimpleImageCaptions112 svc_tti_recall_top5 0.4025
  SimpleImageCaptions112 svc_tti_recall_top10 0.5662
  SimpleImageCaptions112 svc_tit_recall_top1 0.1325
  SimpleImageCaptions112 svc_tit_recall_top5 0.3515
  SimpleImageCaptions112 svc_tit_recall_top10 0.4857
  SimpleImageCaptions112 svc_tii_recall_top1 0.0836
  SimpleImageCaptions112 svc_tii_recall_top5 0.2655
  SimpleImageCaptions112 svc_tii_recall_top10 0.3831
  SimpleImageCaptions112 svc_itt_recall_top1 0.2112
  SimpleImageCaptions112 svc_itt_recall_top5 0.5304
  SimpleImageCaptions112 svc_itt_recall_top10 0.6884
  SimpleImageCaptions112 svc_iti_recall_top1 0.1106
  SimpleImageCaption

100%|██████████| 2586/2586 [04:08<00:00, 10.41it/s]


It 33618 epoch 13 Elapsed time 292.2694
    svc combine loss 2 0.1444
    total training loss 1.6159
    extractor loss 1 1.1431
    joint_embedding 0.3284


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3201
  CocoCapTest text2image_recall_top1 0.2642
  SimpleImageCaptions112 text2image_recall_top1 0.3911
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3487
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7994
  SimpleImageCaptions112 svc_ttt_recall_top10 0.904
  SimpleImageCaptions112 svc_tti_recall_top1 0.1428
  SimpleImageCaptions112 svc_tti_recall_top5 0.4175
  SimpleImageCaptions112 svc_tti_recall_top10 0.569
  SimpleImageCaptions112 svc_tit_recall_top1 0.1819
  SimpleImageCaptions112 svc_tit_recall_top5 0.4887
  SimpleImageCaptions112 svc_tit_recall_top10 0.6499
  SimpleImageCaptions112 svc_tii_recall_top1 0.0864
  SimpleImageCaptions112 svc_tii_recall_top5 0.2858
  SimpleImageCaptions112 svc_tii_recall_top10 0.4301
  SimpleImageCaptions112 svc_itt_recall_top1 0.2468
  SimpleImageCaptions112 svc_itt_recall_top5 0.5789
  SimpleImageCaptions112 svc_itt_recall_top10 0.746
  SimpleImageCaptions112 svc_iti_recall_top1 0.1123
  SimpleImageCaptions1

100%|██████████| 2586/2586 [04:06<00:00,  9.91it/s]


It 36204 epoch 14 Elapsed time 289.3238
    svc combine loss 2 0.1396
    total training loss 1.5964
    extractor loss 1 1.1357
    joint_embedding 0.321


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3453
  CocoCapTest text2image_recall_top1 0.2724
  SimpleImageCaptions112 text2image_recall_top1 0.3509
  SimpleImageCaptions112 svc_ttt_recall_top1 0.2957
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7235
  SimpleImageCaptions112 svc_ttt_recall_top10 0.879
  SimpleImageCaptions112 svc_tti_recall_top1 0.1624
  SimpleImageCaptions112 svc_tti_recall_top5 0.4506
  SimpleImageCaptions112 svc_tti_recall_top10 0.6064
  SimpleImageCaptions112 svc_tit_recall_top1 0.1965
  SimpleImageCaptions112 svc_tit_recall_top5 0.517
  SimpleImageCaptions112 svc_tit_recall_top10 0.665
  SimpleImageCaptions112 svc_tii_recall_top1 0.0887
  SimpleImageCaptions112 svc_tii_recall_top5 0.2654
  SimpleImageCaptions112 svc_tii_recall_top10 0.386
  SimpleImageCaptions112 svc_itt_recall_top1 0.2455
  SimpleImageCaptions112 svc_itt_recall_top5 0.6043
  SimpleImageCaptions112 svc_itt_recall_top10 0.7573
  SimpleImageCaptions112 svc_iti_recall_top1 0.1136
  SimpleImageCaptions11

100%|██████████| 2586/2586 [04:04<00:00, 10.58it/s]


It 38790 epoch 15 Elapsed time 286.9932
    svc combine loss 2 0.1397
    total training loss 1.5852
    extractor loss 1 1.1359
    joint_embedding 0.3097


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3122
  CocoCapTest text2image_recall_top1 0.2795
  SimpleImageCaptions112 text2image_recall_top1 0.2848
  SimpleImageCaptions112 svc_ttt_recall_top1 0.2889
  SimpleImageCaptions112 svc_ttt_recall_top5 0.6818
  SimpleImageCaptions112 svc_ttt_recall_top10 0.881
  SimpleImageCaptions112 svc_tti_recall_top1 0.1814
  SimpleImageCaptions112 svc_tti_recall_top5 0.4716
  SimpleImageCaptions112 svc_tti_recall_top10 0.6157
  SimpleImageCaptions112 svc_tit_recall_top1 0.1446
  SimpleImageCaptions112 svc_tit_recall_top5 0.4146
  SimpleImageCaptions112 svc_tit_recall_top10 0.5699
  SimpleImageCaptions112 svc_tii_recall_top1 0.0888
  SimpleImageCaptions112 svc_tii_recall_top5 0.275
  SimpleImageCaptions112 svc_tii_recall_top10 0.3942
  SimpleImageCaptions112 svc_itt_recall_top1 0.2583
  SimpleImageCaptions112 svc_itt_recall_top5 0.6306
  SimpleImageCaptions112 svc_itt_recall_top10 0.7771
  SimpleImageCaptions112 svc_iti_recall_top1 0.1405
  SimpleImageCaptions

100%|██████████| 2586/2586 [04:03<00:00, 10.63it/s]


It 41376 epoch 16 Elapsed time 277.5809
    svc combine loss 2 0.136
    total training loss 1.5691
    extractor loss 1 1.1262
    joint_embedding 0.3069


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3279
  CocoCapTest text2image_recall_top1 0.2795
  SimpleImageCaptions112 text2image_recall_top1 0.2589
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3087
  SimpleImageCaptions112 svc_ttt_recall_top5 0.76
  SimpleImageCaptions112 svc_ttt_recall_top10 0.902
  SimpleImageCaptions112 svc_tti_recall_top1 0.2034
  SimpleImageCaptions112 svc_tti_recall_top5 0.5268
  SimpleImageCaptions112 svc_tti_recall_top10 0.6997
  SimpleImageCaptions112 svc_tit_recall_top1 0.179
  SimpleImageCaptions112 svc_tit_recall_top5 0.4381
  SimpleImageCaptions112 svc_tit_recall_top10 0.5959
  SimpleImageCaptions112 svc_tii_recall_top1 0.1093
  SimpleImageCaptions112 svc_tii_recall_top5 0.3375
  SimpleImageCaptions112 svc_tii_recall_top10 0.4771
  SimpleImageCaptions112 svc_itt_recall_top1 0.2474
  SimpleImageCaptions112 svc_itt_recall_top5 0.6293
  SimpleImageCaptions112 svc_itt_recall_top10 0.7867
  SimpleImageCaptions112 svc_iti_recall_top1 0.1466
  SimpleImageCaptions11

100%|██████████| 2586/2586 [04:08<00:00, 10.40it/s]


It 43962 epoch 17 Elapsed time 281.2907
    svc combine loss 2 0.1384
    total training loss 1.5591
    extractor loss 1 1.1113
    joint_embedding 0.3094


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3428
  CocoCapTest text2image_recall_top1 0.2728
  SimpleImageCaptions112 text2image_recall_top1 0.2643
  SimpleImageCaptions112 svc_ttt_recall_top1 0.2485
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7188
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8495
  SimpleImageCaptions112 svc_tti_recall_top1 0.1513
  SimpleImageCaptions112 svc_tti_recall_top5 0.4411
  SimpleImageCaptions112 svc_tti_recall_top10 0.6053
  SimpleImageCaptions112 svc_tit_recall_top1 0.143
  SimpleImageCaptions112 svc_tit_recall_top5 0.3957
  SimpleImageCaptions112 svc_tit_recall_top10 0.5605
  SimpleImageCaptions112 svc_tii_recall_top1 0.0833
  SimpleImageCaptions112 svc_tii_recall_top5 0.2905
  SimpleImageCaptions112 svc_tii_recall_top10 0.4293
  SimpleImageCaptions112 svc_itt_recall_top1 0.2263
  SimpleImageCaptions112 svc_itt_recall_top5 0.6241
  SimpleImageCaptions112 svc_itt_recall_top10 0.7691
  SimpleImageCaptions112 svc_iti_recall_top1 0.1246
  SimpleImageCaption

100%|██████████| 2586/2586 [04:03<00:00, 11.04it/s]


It 46548 epoch 18 Elapsed time 275.9551
    svc combine loss 2 0.1267
    total training loss 1.5318
    extractor loss 1 1.1077
    joint_embedding 0.2975


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3445
  CocoCapTest text2image_recall_top1 0.2824
  SimpleImageCaptions112 text2image_recall_top1 0.3812
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3319
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7766
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8647
  SimpleImageCaptions112 svc_tti_recall_top1 0.1554
  SimpleImageCaptions112 svc_tti_recall_top5 0.4227
  SimpleImageCaptions112 svc_tti_recall_top10 0.5658
  SimpleImageCaptions112 svc_tit_recall_top1 0.1843
  SimpleImageCaptions112 svc_tit_recall_top5 0.4406
  SimpleImageCaptions112 svc_tit_recall_top10 0.5779
  SimpleImageCaptions112 svc_tii_recall_top1 0.0927
  SimpleImageCaptions112 svc_tii_recall_top5 0.2723
  SimpleImageCaptions112 svc_tii_recall_top10 0.3887
  SimpleImageCaptions112 svc_itt_recall_top1 0.2362
  SimpleImageCaptions112 svc_itt_recall_top5 0.6283
  SimpleImageCaptions112 svc_itt_recall_top10 0.7791
  SimpleImageCaptions112 svc_iti_recall_top1 0.1347
  SimpleImageCaptio

100%|██████████| 2586/2586 [04:09<00:00, 10.38it/s]


It 49134 epoch 19 Elapsed time 289.9097
    svc combine loss 2 0.1304
    total training loss 1.5369
    extractor loss 1 1.1067
    joint_embedding 0.2999


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3369
  CocoCapTest text2image_recall_top1 0.2964
  SimpleImageCaptions112 text2image_recall_top1 0.3125
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3053
  SimpleImageCaptions112 svc_ttt_recall_top5 0.738
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8793
  SimpleImageCaptions112 svc_tti_recall_top1 0.2075
  SimpleImageCaptions112 svc_tti_recall_top5 0.5274
  SimpleImageCaptions112 svc_tti_recall_top10 0.672
  SimpleImageCaptions112 svc_tit_recall_top1 0.1831
  SimpleImageCaptions112 svc_tit_recall_top5 0.4473
  SimpleImageCaptions112 svc_tit_recall_top10 0.596
  SimpleImageCaptions112 svc_tii_recall_top1 0.1074
  SimpleImageCaptions112 svc_tii_recall_top5 0.3181
  SimpleImageCaptions112 svc_tii_recall_top10 0.4387
  SimpleImageCaptions112 svc_itt_recall_top1 0.2468
  SimpleImageCaptions112 svc_itt_recall_top5 0.6321
  SimpleImageCaptions112 svc_itt_recall_top10 0.7736
  SimpleImageCaptions112 svc_iti_recall_top1 0.1448
  SimpleImageCaptions1

100%|██████████| 2586/2586 [04:02<00:00, 10.52it/s]


It 51720 epoch 20 Elapsed time 285.1352
    svc combine loss 2 0.1058
    total training loss 1.389
    extractor loss 1 1.0258
    joint_embedding 0.2573


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3647
  CocoCapTest text2image_recall_top1 0.3099
  SimpleImageCaptions112 text2image_recall_top1 0.3027
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3597
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8068
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9022
  SimpleImageCaptions112 svc_tti_recall_top1 0.2318
  SimpleImageCaptions112 svc_tti_recall_top5 0.5393
  SimpleImageCaptions112 svc_tti_recall_top10 0.6844
  SimpleImageCaptions112 svc_tit_recall_top1 0.2084
  SimpleImageCaptions112 svc_tit_recall_top5 0.499
  SimpleImageCaptions112 svc_tit_recall_top10 0.6323
  SimpleImageCaptions112 svc_tii_recall_top1 0.1185
  SimpleImageCaptions112 svc_tii_recall_top5 0.3364
  SimpleImageCaptions112 svc_tii_recall_top10 0.4751
  SimpleImageCaptions112 svc_itt_recall_top1 0.2808
  SimpleImageCaptions112 svc_itt_recall_top5 0.6711
  SimpleImageCaptions112 svc_itt_recall_top10 0.8088
  SimpleImageCaptions112 svc_iti_recall_top1 0.1567
  SimpleImageCaption

100%|██████████| 2586/2586 [04:06<00:00, 10.74it/s]


It 54306 epoch 21 Elapsed time 287.8113
    svc combine loss 2 0.0977
    total training loss 1.3203
    extractor loss 1 0.9798
    joint_embedding 0.2428


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3771
  CocoCapTest text2image_recall_top1 0.3166
  SimpleImageCaptions112 text2image_recall_top1 0.3446
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3975
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7809
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8742
  SimpleImageCaptions112 svc_tti_recall_top1 0.2263
  SimpleImageCaptions112 svc_tti_recall_top5 0.5079
  SimpleImageCaptions112 svc_tti_recall_top10 0.6588
  SimpleImageCaptions112 svc_tit_recall_top1 0.2073
  SimpleImageCaptions112 svc_tit_recall_top5 0.5051
  SimpleImageCaptions112 svc_tit_recall_top10 0.642
  SimpleImageCaptions112 svc_tii_recall_top1 0.1282
  SimpleImageCaptions112 svc_tii_recall_top5 0.35
  SimpleImageCaptions112 svc_tii_recall_top10 0.486
  SimpleImageCaptions112 svc_itt_recall_top1 0.2851
  SimpleImageCaptions112 svc_itt_recall_top5 0.6654
  SimpleImageCaptions112 svc_itt_recall_top10 0.8053
  SimpleImageCaptions112 svc_iti_recall_top1 0.1604
  SimpleImageCaptions11

100%|██████████| 2586/2586 [04:05<00:00, 10.54it/s]


It 56892 epoch 22 Elapsed time 288.6727
    svc combine loss 2 0.0922
    total training loss 1.301
    extractor loss 1 0.9731
    joint_embedding 0.2357


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3732
  CocoCapTest text2image_recall_top1 0.3178
  SimpleImageCaptions112 text2image_recall_top1 0.3116
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3304
  SimpleImageCaptions112 svc_ttt_recall_top5 0.755
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9027
  SimpleImageCaptions112 svc_tti_recall_top1 0.2297
  SimpleImageCaptions112 svc_tti_recall_top5 0.5214
  SimpleImageCaptions112 svc_tti_recall_top10 0.6737
  SimpleImageCaptions112 svc_tit_recall_top1 0.1947
  SimpleImageCaptions112 svc_tit_recall_top5 0.4626
  SimpleImageCaptions112 svc_tit_recall_top10 0.6084
  SimpleImageCaptions112 svc_tii_recall_top1 0.1109
  SimpleImageCaptions112 svc_tii_recall_top5 0.3298
  SimpleImageCaptions112 svc_tii_recall_top10 0.4745
  SimpleImageCaptions112 svc_itt_recall_top1 0.2873
  SimpleImageCaptions112 svc_itt_recall_top5 0.6514
  SimpleImageCaptions112 svc_itt_recall_top10 0.7915
  SimpleImageCaptions112 svc_iti_recall_top1 0.1621
  SimpleImageCaption

100%|██████████| 2586/2586 [04:03<00:00, 10.63it/s]


It 59478 epoch 23 Elapsed time 286.2472
    svc combine loss 2 0.0895
    total training loss 1.2763
    extractor loss 1 0.9664
    joint_embedding 0.2204


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3951
  CocoCapTest text2image_recall_top1 0.3239
  SimpleImageCaptions112 text2image_recall_top1 0.3652
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3002
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7874
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8784
  SimpleImageCaptions112 svc_tti_recall_top1 0.2167
  SimpleImageCaptions112 svc_tti_recall_top5 0.5387
  SimpleImageCaptions112 svc_tti_recall_top10 0.691
  SimpleImageCaptions112 svc_tit_recall_top1 0.1994
  SimpleImageCaptions112 svc_tit_recall_top5 0.482
  SimpleImageCaptions112 svc_tit_recall_top10 0.6357
  SimpleImageCaptions112 svc_tii_recall_top1 0.1164
  SimpleImageCaptions112 svc_tii_recall_top5 0.3464
  SimpleImageCaptions112 svc_tii_recall_top10 0.4974
  SimpleImageCaptions112 svc_itt_recall_top1 0.2867
  SimpleImageCaptions112 svc_itt_recall_top5 0.6468
  SimpleImageCaptions112 svc_itt_recall_top10 0.7922
  SimpleImageCaptions112 svc_iti_recall_top1 0.1586
  SimpleImageCaptions

100%|██████████| 2586/2586 [04:08<00:00, 10.42it/s]


It 62064 epoch 24 Elapsed time 290.6112
    svc combine loss 2 0.0858
    total training loss 1.254
    extractor loss 1 0.9464
    joint_embedding 0.2218


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.401
  CocoCapTest text2image_recall_top1 0.3139
  SimpleImageCaptions112 text2image_recall_top1 0.3536
  SimpleImageCaptions112 svc_ttt_recall_top1 0.335
  SimpleImageCaptions112 svc_ttt_recall_top5 0.753
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8745
  SimpleImageCaptions112 svc_tti_recall_top1 0.2414
  SimpleImageCaptions112 svc_tti_recall_top5 0.5923
  SimpleImageCaptions112 svc_tti_recall_top10 0.7451
  SimpleImageCaptions112 svc_tit_recall_top1 0.2004
  SimpleImageCaptions112 svc_tit_recall_top5 0.5024
  SimpleImageCaptions112 svc_tit_recall_top10 0.6377
  SimpleImageCaptions112 svc_tii_recall_top1 0.1216
  SimpleImageCaptions112 svc_tii_recall_top5 0.358
  SimpleImageCaptions112 svc_tii_recall_top10 0.5054
  SimpleImageCaptions112 svc_itt_recall_top1 0.2895
  SimpleImageCaptions112 svc_itt_recall_top5 0.6829
  SimpleImageCaptions112 svc_itt_recall_top10 0.8142
  SimpleImageCaptions112 svc_iti_recall_top1 0.1665
  SimpleImageCaptions11

100%|██████████| 2586/2586 [04:06<00:00, 10.86it/s]


It 64650 epoch 25 Elapsed time 289.9528
    svc combine loss 2 0.0851
    total training loss 1.2465
    extractor loss 1 0.9458
    joint_embedding 0.2155


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3992
  CocoCapTest text2image_recall_top1 0.3179
  SimpleImageCaptions112 text2image_recall_top1 0.3321
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3903
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7758
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9183
  SimpleImageCaptions112 svc_tti_recall_top1 0.2266
  SimpleImageCaptions112 svc_tti_recall_top5 0.5422
  SimpleImageCaptions112 svc_tti_recall_top10 0.7022
  SimpleImageCaptions112 svc_tit_recall_top1 0.1957
  SimpleImageCaptions112 svc_tit_recall_top5 0.5121
  SimpleImageCaptions112 svc_tit_recall_top10 0.6561
  SimpleImageCaptions112 svc_tii_recall_top1 0.1287
  SimpleImageCaptions112 svc_tii_recall_top5 0.3661
  SimpleImageCaptions112 svc_tii_recall_top10 0.5096
  SimpleImageCaptions112 svc_itt_recall_top1 0.2873
  SimpleImageCaptions112 svc_itt_recall_top5 0.666
  SimpleImageCaptions112 svc_itt_recall_top10 0.8057
  SimpleImageCaptions112 svc_iti_recall_top1 0.1623
  SimpleImageCaption

100%|██████████| 2586/2586 [04:04<00:00,  9.82it/s]


It 67236 epoch 26 Elapsed time 288.1647
    svc combine loss 2 0.0846
    total training loss 1.2391
    extractor loss 1 0.94
    joint_embedding 0.2146


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4119
  CocoCapTest text2image_recall_top1 0.32
  SimpleImageCaptions112 text2image_recall_top1 0.3429
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3591
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7806
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9185
  SimpleImageCaptions112 svc_tti_recall_top1 0.2194
  SimpleImageCaptions112 svc_tti_recall_top5 0.5413
  SimpleImageCaptions112 svc_tti_recall_top10 0.7
  SimpleImageCaptions112 svc_tit_recall_top1 0.207
  SimpleImageCaptions112 svc_tit_recall_top5 0.4999
  SimpleImageCaptions112 svc_tit_recall_top10 0.6426
  SimpleImageCaptions112 svc_tii_recall_top1 0.1275
  SimpleImageCaptions112 svc_tii_recall_top5 0.3643
  SimpleImageCaptions112 svc_tii_recall_top10 0.5012
  SimpleImageCaptions112 svc_itt_recall_top1 0.3
  SimpleImageCaptions112 svc_itt_recall_top5 0.6728
  SimpleImageCaptions112 svc_itt_recall_top10 0.8101
  SimpleImageCaptions112 svc_iti_recall_top1 0.1605
  SimpleImageCaptions112 svc

100%|██████████| 2586/2586 [04:08<00:00, 10.27it/s]


It 69822 epoch 27 Elapsed time 292.0722
    svc combine loss 2 0.0816
    total training loss 1.2301
    extractor loss 1 0.9367
    joint_embedding 0.2118


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4006
  CocoCapTest text2image_recall_top1 0.3385
  SimpleImageCaptions112 text2image_recall_top1 0.3473
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3391
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7449
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8943
  SimpleImageCaptions112 svc_tti_recall_top1 0.2595
  SimpleImageCaptions112 svc_tti_recall_top5 0.609
  SimpleImageCaptions112 svc_tti_recall_top10 0.7519
  SimpleImageCaptions112 svc_tit_recall_top1 0.1889
  SimpleImageCaptions112 svc_tit_recall_top5 0.45
  SimpleImageCaptions112 svc_tit_recall_top10 0.593
  SimpleImageCaptions112 svc_tii_recall_top1 0.1237
  SimpleImageCaptions112 svc_tii_recall_top5 0.3525
  SimpleImageCaptions112 svc_tii_recall_top10 0.4935
  SimpleImageCaptions112 svc_itt_recall_top1 0.2939
  SimpleImageCaptions112 svc_itt_recall_top5 0.67
  SimpleImageCaptions112 svc_itt_recall_top10 0.8115
  SimpleImageCaptions112 svc_iti_recall_top1 0.1743
  SimpleImageCaptions112 

100%|██████████| 2586/2586 [04:05<00:00, 10.61it/s]


It 72408 epoch 28 Elapsed time 289.2632
    svc combine loss 2 0.0839
    total training loss 1.2264
    extractor loss 1 0.9307
    joint_embedding 0.2117


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4037
  CocoCapTest text2image_recall_top1 0.3347
  SimpleImageCaptions112 text2image_recall_top1 0.308
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3452
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8413
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9312
  SimpleImageCaptions112 svc_tti_recall_top1 0.2227
  SimpleImageCaptions112 svc_tti_recall_top5 0.5212
  SimpleImageCaptions112 svc_tti_recall_top10 0.6738
  SimpleImageCaptions112 svc_tit_recall_top1 0.1914
  SimpleImageCaptions112 svc_tit_recall_top5 0.4991
  SimpleImageCaptions112 svc_tit_recall_top10 0.6434
  SimpleImageCaptions112 svc_tii_recall_top1 0.1171
  SimpleImageCaptions112 svc_tii_recall_top5 0.3427
  SimpleImageCaptions112 svc_tii_recall_top10 0.4851
  SimpleImageCaptions112 svc_itt_recall_top1 0.2921
  SimpleImageCaptions112 svc_itt_recall_top5 0.6692
  SimpleImageCaptions112 svc_itt_recall_top10 0.808
  SimpleImageCaptions112 svc_iti_recall_top1 0.1583
  SimpleImageCaptions

100%|██████████| 2586/2586 [04:03<00:00, 10.61it/s]


It 74994 epoch 29 Elapsed time 285.6285
    svc combine loss 2 0.0802
    total training loss 1.2105
    extractor loss 1 0.924
    joint_embedding 0.2063


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4151
  CocoCapTest text2image_recall_top1 0.3455
  SimpleImageCaptions112 text2image_recall_top1 0.3018
  SimpleImageCaptions112 svc_ttt_recall_top1 0.342
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7906
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9256
  SimpleImageCaptions112 svc_tti_recall_top1 0.2152
  SimpleImageCaptions112 svc_tti_recall_top5 0.5349
  SimpleImageCaptions112 svc_tti_recall_top10 0.6914
  SimpleImageCaptions112 svc_tit_recall_top1 0.1883
  SimpleImageCaptions112 svc_tit_recall_top5 0.5073
  SimpleImageCaptions112 svc_tit_recall_top10 0.6651
  SimpleImageCaptions112 svc_tii_recall_top1 0.1256
  SimpleImageCaptions112 svc_tii_recall_top5 0.3524
  SimpleImageCaptions112 svc_tii_recall_top10 0.4996
  SimpleImageCaptions112 svc_itt_recall_top1 0.2776
  SimpleImageCaptions112 svc_itt_recall_top5 0.6577
  SimpleImageCaptions112 svc_itt_recall_top10 0.8075
  SimpleImageCaptions112 svc_iti_recall_top1 0.1542
  SimpleImageCaption

100%|██████████| 2586/2586 [04:06<00:00, 10.57it/s]


It 77580 epoch 30 Elapsed time 288.6104
    svc combine loss 2 0.0757
    total training loss 1.1954
    extractor loss 1 0.9202
    joint_embedding 0.1995


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4147
  CocoCapTest text2image_recall_top1 0.3268
  SimpleImageCaptions112 text2image_recall_top1 0.3348
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3162
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8059
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9099
  SimpleImageCaptions112 svc_tti_recall_top1 0.2177
  SimpleImageCaptions112 svc_tti_recall_top5 0.5345
  SimpleImageCaptions112 svc_tti_recall_top10 0.684
  SimpleImageCaptions112 svc_tit_recall_top1 0.2076
  SimpleImageCaptions112 svc_tit_recall_top5 0.5115
  SimpleImageCaptions112 svc_tit_recall_top10 0.6653
  SimpleImageCaptions112 svc_tii_recall_top1 0.1216
  SimpleImageCaptions112 svc_tii_recall_top5 0.3481
  SimpleImageCaptions112 svc_tii_recall_top10 0.4849
  SimpleImageCaptions112 svc_itt_recall_top1 0.3067
  SimpleImageCaptions112 svc_itt_recall_top5 0.6778
  SimpleImageCaptions112 svc_itt_recall_top10 0.8117
  SimpleImageCaptions112 svc_iti_recall_top1 0.1684
  SimpleImageCaption

100%|██████████| 2586/2586 [04:03<00:00, 10.62it/s]


It 80166 epoch 31 Elapsed time 284.0558
    svc combine loss 2 0.077
    total training loss 1.191
    extractor loss 1 0.9103
    joint_embedding 0.2037


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4259
  CocoCapTest text2image_recall_top1 0.3329
  SimpleImageCaptions112 text2image_recall_top1 0.3402
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3711
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8035
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8967
  SimpleImageCaptions112 svc_tti_recall_top1 0.2482
  SimpleImageCaptions112 svc_tti_recall_top5 0.5644
  SimpleImageCaptions112 svc_tti_recall_top10 0.7126
  SimpleImageCaptions112 svc_tit_recall_top1 0.225
  SimpleImageCaptions112 svc_tit_recall_top5 0.5172
  SimpleImageCaptions112 svc_tit_recall_top10 0.6696
  SimpleImageCaptions112 svc_tii_recall_top1 0.131
  SimpleImageCaptions112 svc_tii_recall_top5 0.3604
  SimpleImageCaptions112 svc_tii_recall_top10 0.4984
  SimpleImageCaptions112 svc_itt_recall_top1 0.3004
  SimpleImageCaptions112 svc_itt_recall_top5 0.6748
  SimpleImageCaptions112 svc_itt_recall_top10 0.8164
  SimpleImageCaptions112 svc_iti_recall_top1 0.1649
  SimpleImageCaptions

100%|██████████| 2586/2586 [04:03<00:00, 10.49it/s]


It 82752 epoch 32 Elapsed time 284.3683
    svc combine loss 2 0.0762
    total training loss 1.1823
    extractor loss 1 0.9058
    joint_embedding 0.2003


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4124
  CocoCapTest text2image_recall_top1 0.3292
  SimpleImageCaptions112 text2image_recall_top1 0.3143
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3245
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8441
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9469
  SimpleImageCaptions112 svc_tti_recall_top1 0.2496
  SimpleImageCaptions112 svc_tti_recall_top5 0.5597
  SimpleImageCaptions112 svc_tti_recall_top10 0.7117
  SimpleImageCaptions112 svc_tit_recall_top1 0.1971
  SimpleImageCaptions112 svc_tit_recall_top5 0.5094
  SimpleImageCaptions112 svc_tit_recall_top10 0.6717
  SimpleImageCaptions112 svc_tii_recall_top1 0.1253
  SimpleImageCaptions112 svc_tii_recall_top5 0.354
  SimpleImageCaptions112 svc_tii_recall_top10 0.4918
  SimpleImageCaptions112 svc_itt_recall_top1 0.2842
  SimpleImageCaptions112 svc_itt_recall_top5 0.6665
  SimpleImageCaptions112 svc_itt_recall_top10 0.811
  SimpleImageCaptions112 svc_iti_recall_top1 0.1592
  SimpleImageCaptions

100%|██████████| 2586/2586 [04:05<00:00, 10.54it/s]


It 85338 epoch 33 Elapsed time 286.6426
    svc combine loss 2 0.0729
    total training loss 1.1733
    extractor loss 1 0.9
    joint_embedding 0.2004


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4323
  CocoCapTest text2image_recall_top1 0.3434
  SimpleImageCaptions112 text2image_recall_top1 0.3009
  SimpleImageCaptions112 svc_ttt_recall_top1 0.4254
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8483
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9065
  SimpleImageCaptions112 svc_tti_recall_top1 0.2408
  SimpleImageCaptions112 svc_tti_recall_top5 0.5762
  SimpleImageCaptions112 svc_tti_recall_top10 0.7356
  SimpleImageCaptions112 svc_tit_recall_top1 0.1907
  SimpleImageCaptions112 svc_tit_recall_top5 0.4977
  SimpleImageCaptions112 svc_tit_recall_top10 0.6448
  SimpleImageCaptions112 svc_tii_recall_top1 0.1216
  SimpleImageCaptions112 svc_tii_recall_top5 0.3386
  SimpleImageCaptions112 svc_tii_recall_top10 0.481
  SimpleImageCaptions112 svc_itt_recall_top1 0.2711
  SimpleImageCaptions112 svc_itt_recall_top5 0.657
  SimpleImageCaptions112 svc_itt_recall_top10 0.8073
  SimpleImageCaptions112 svc_iti_recall_top1 0.1588
  SimpleImageCaptions

100%|██████████| 2586/2586 [04:07<00:00, 10.46it/s]


It 87924 epoch 34 Elapsed time 288.76
    svc combine loss 2 0.0737
    total training loss 1.1687
    extractor loss 1 0.9019
    joint_embedding 0.1931


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4139
  CocoCapTest text2image_recall_top1 0.3301
  SimpleImageCaptions112 text2image_recall_top1 0.3464
  SimpleImageCaptions112 svc_ttt_recall_top1 0.4316
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8343
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9298
  SimpleImageCaptions112 svc_tti_recall_top1 0.2371
  SimpleImageCaptions112 svc_tti_recall_top5 0.5742
  SimpleImageCaptions112 svc_tti_recall_top10 0.7231
  SimpleImageCaptions112 svc_tit_recall_top1 0.2105
  SimpleImageCaptions112 svc_tit_recall_top5 0.5038
  SimpleImageCaptions112 svc_tit_recall_top10 0.6717
  SimpleImageCaptions112 svc_tii_recall_top1 0.1341
  SimpleImageCaptions112 svc_tii_recall_top5 0.367
  SimpleImageCaptions112 svc_tii_recall_top10 0.5008
  SimpleImageCaptions112 svc_itt_recall_top1 0.2751
  SimpleImageCaptions112 svc_itt_recall_top5 0.6583
  SimpleImageCaptions112 svc_itt_recall_top10 0.8034
  SimpleImageCaptions112 svc_iti_recall_top1 0.1573
  SimpleImageCaption

100%|██████████| 2586/2586 [04:07<00:00, 10.44it/s]


It 90510 epoch 35 Elapsed time 289.0734
    svc combine loss 2 0.0733
    total training loss 1.1604
    extractor loss 1 0.8968
    joint_embedding 0.1903


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4189
  CocoCapTest text2image_recall_top1 0.3481
  SimpleImageCaptions112 text2image_recall_top1 0.2723
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3476
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7698
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8878
  SimpleImageCaptions112 svc_tti_recall_top1 0.235
  SimpleImageCaptions112 svc_tti_recall_top5 0.5687
  SimpleImageCaptions112 svc_tti_recall_top10 0.715
  SimpleImageCaptions112 svc_tit_recall_top1 0.1986
  SimpleImageCaptions112 svc_tit_recall_top5 0.5054
  SimpleImageCaptions112 svc_tit_recall_top10 0.6598
  SimpleImageCaptions112 svc_tii_recall_top1 0.126
  SimpleImageCaptions112 svc_tii_recall_top5 0.3499
  SimpleImageCaptions112 svc_tii_recall_top10 0.4892
  SimpleImageCaptions112 svc_itt_recall_top1 0.27
  SimpleImageCaptions112 svc_itt_recall_top5 0.6556
  SimpleImageCaptions112 svc_itt_recall_top10 0.8053
  SimpleImageCaptions112 svc_iti_recall_top1 0.1559
  SimpleImageCaptions112

100%|██████████| 2586/2586 [04:07<00:00, 10.95it/s]


It 93096 epoch 36 Elapsed time 290.0877
    svc combine loss 2 0.0724
    total training loss 1.1553
    extractor loss 1 0.8969
    joint_embedding 0.186


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4276
  CocoCapTest text2image_recall_top1 0.3492
  SimpleImageCaptions112 text2image_recall_top1 0.2955
  SimpleImageCaptions112 svc_ttt_recall_top1 0.4086
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7846
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9209
  SimpleImageCaptions112 svc_tti_recall_top1 0.2245
  SimpleImageCaptions112 svc_tti_recall_top5 0.545
  SimpleImageCaptions112 svc_tti_recall_top10 0.6939
  SimpleImageCaptions112 svc_tit_recall_top1 0.2156
  SimpleImageCaptions112 svc_tit_recall_top5 0.5372
  SimpleImageCaptions112 svc_tit_recall_top10 0.6839
  SimpleImageCaptions112 svc_tii_recall_top1 0.1339
  SimpleImageCaptions112 svc_tii_recall_top5 0.3499
  SimpleImageCaptions112 svc_tii_recall_top10 0.4875
  SimpleImageCaptions112 svc_itt_recall_top1 0.2765
  SimpleImageCaptions112 svc_itt_recall_top5 0.6514
  SimpleImageCaptions112 svc_itt_recall_top10 0.8017
  SimpleImageCaptions112 svc_iti_recall_top1 0.1513
  SimpleImageCaption

100%|██████████| 2586/2586 [04:05<00:00, 10.55it/s]


It 95682 epoch 37 Elapsed time 288.8718
    svc combine loss 2 0.0725
    total training loss 1.1493
    extractor loss 1 0.8889
    joint_embedding 0.1879


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4287
  CocoCapTest text2image_recall_top1 0.3264
  SimpleImageCaptions112 text2image_recall_top1 0.3063
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3958
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7719
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9058
  SimpleImageCaptions112 svc_tti_recall_top1 0.2434
  SimpleImageCaptions112 svc_tti_recall_top5 0.5711
  SimpleImageCaptions112 svc_tti_recall_top10 0.7216
  SimpleImageCaptions112 svc_tit_recall_top1 0.1955
  SimpleImageCaptions112 svc_tit_recall_top5 0.4987
  SimpleImageCaptions112 svc_tit_recall_top10 0.6354
  SimpleImageCaptions112 svc_tii_recall_top1 0.1194
  SimpleImageCaptions112 svc_tii_recall_top5 0.336
  SimpleImageCaptions112 svc_tii_recall_top10 0.4713
  SimpleImageCaptions112 svc_itt_recall_top1 0.2689
  SimpleImageCaptions112 svc_itt_recall_top5 0.6552
  SimpleImageCaptions112 svc_itt_recall_top10 0.7996
  SimpleImageCaptions112 svc_iti_recall_top1 0.1666
  SimpleImageCaption

100%|██████████| 2586/2586 [04:03<00:00, 10.76it/s]


It 98268 epoch 38 Elapsed time 287.0614
    svc combine loss 2 0.0724
    total training loss 1.1551
    extractor loss 1 0.893
    joint_embedding 0.1898


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4403
  CocoCapTest text2image_recall_top1 0.3625
  SimpleImageCaptions112 text2image_recall_top1 0.3179
  SimpleImageCaptions112 svc_ttt_recall_top1 0.476
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8306
  SimpleImageCaptions112 svc_ttt_recall_top10 0.953
  SimpleImageCaptions112 svc_tti_recall_top1 0.2393
  SimpleImageCaptions112 svc_tti_recall_top5 0.5439
  SimpleImageCaptions112 svc_tti_recall_top10 0.6894
  SimpleImageCaptions112 svc_tit_recall_top1 0.2146
  SimpleImageCaptions112 svc_tit_recall_top5 0.5387
  SimpleImageCaptions112 svc_tit_recall_top10 0.6971
  SimpleImageCaptions112 svc_tii_recall_top1 0.1286
  SimpleImageCaptions112 svc_tii_recall_top5 0.3526
  SimpleImageCaptions112 svc_tii_recall_top10 0.488
  SimpleImageCaptions112 svc_itt_recall_top1 0.2814
  SimpleImageCaptions112 svc_itt_recall_top5 0.6392
  SimpleImageCaptions112 svc_itt_recall_top10 0.7832
  SimpleImageCaptions112 svc_iti_recall_top1 0.1562
  SimpleImageCaptions1

100%|██████████| 2586/2586 [04:04<00:00, 10.56it/s]


It 100854 epoch 39 Elapsed time 287.7389
    svc combine loss 2 0.0734
    total training loss 1.1492
    extractor loss 1 0.886
    joint_embedding 0.1898


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4369
  CocoCapTest text2image_recall_top1 0.3441
  SimpleImageCaptions112 text2image_recall_top1 0.4277
  SimpleImageCaptions112 svc_ttt_recall_top1 0.4101
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7753
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9324
  SimpleImageCaptions112 svc_tti_recall_top1 0.2402
  SimpleImageCaptions112 svc_tti_recall_top5 0.542
  SimpleImageCaptions112 svc_tti_recall_top10 0.6911
  SimpleImageCaptions112 svc_tit_recall_top1 0.1946
  SimpleImageCaptions112 svc_tit_recall_top5 0.5069
  SimpleImageCaptions112 svc_tit_recall_top10 0.6605
  SimpleImageCaptions112 svc_tii_recall_top1 0.1241
  SimpleImageCaptions112 svc_tii_recall_top5 0.3399
  SimpleImageCaptions112 svc_tii_recall_top10 0.4765
  SimpleImageCaptions112 svc_itt_recall_top1 0.2783
  SimpleImageCaptions112 svc_itt_recall_top5 0.6308
  SimpleImageCaptions112 svc_itt_recall_top10 0.7742
  SimpleImageCaptions112 svc_iti_recall_top1 0.1621
  SimpleImageCaption

100%|██████████| 2586/2586 [04:04<00:00, 10.57it/s]


It 103440 epoch 40 Elapsed time 286.9686
    svc combine loss 2 0.0677
    total training loss 1.1343
    extractor loss 1 0.8777
    joint_embedding 0.1888


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4147
  CocoCapTest text2image_recall_top1 0.3472
  SimpleImageCaptions112 text2image_recall_top1 0.3643
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3561
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8754
  SimpleImageCaptions112 svc_ttt_recall_top10 0.924
  SimpleImageCaptions112 svc_tti_recall_top1 0.2355
  SimpleImageCaptions112 svc_tti_recall_top5 0.5394
  SimpleImageCaptions112 svc_tti_recall_top10 0.6822
  SimpleImageCaptions112 svc_tit_recall_top1 0.1959
  SimpleImageCaptions112 svc_tit_recall_top5 0.4956
  SimpleImageCaptions112 svc_tit_recall_top10 0.6477
  SimpleImageCaptions112 svc_tii_recall_top1 0.1191
  SimpleImageCaptions112 svc_tii_recall_top5 0.3333
  SimpleImageCaptions112 svc_tii_recall_top10 0.4647
  SimpleImageCaptions112 svc_itt_recall_top1 0.2862
  SimpleImageCaptions112 svc_itt_recall_top5 0.6334
  SimpleImageCaptions112 svc_itt_recall_top10 0.7749
  SimpleImageCaptions112 svc_iti_recall_top1 0.1573
  SimpleImageCaption

100%|██████████| 2586/2586 [04:07<00:00, 10.85it/s]


It 106026 epoch 41 Elapsed time 287.9341
    svc combine loss 2 0.0679
    total training loss 1.125
    extractor loss 1 0.8751
    joint_embedding 0.182


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4334
  CocoCapTest text2image_recall_top1 0.3219
  SimpleImageCaptions112 text2image_recall_top1 0.3312
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3498
  SimpleImageCaptions112 svc_ttt_recall_top5 0.743
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9064
  SimpleImageCaptions112 svc_tti_recall_top1 0.2206
  SimpleImageCaptions112 svc_tti_recall_top5 0.5337
  SimpleImageCaptions112 svc_tti_recall_top10 0.6802
  SimpleImageCaptions112 svc_tit_recall_top1 0.2069
  SimpleImageCaptions112 svc_tit_recall_top5 0.5198
  SimpleImageCaptions112 svc_tit_recall_top10 0.6704
  SimpleImageCaptions112 svc_tii_recall_top1 0.118
  SimpleImageCaptions112 svc_tii_recall_top5 0.3365
  SimpleImageCaptions112 svc_tii_recall_top10 0.4803
  SimpleImageCaptions112 svc_itt_recall_top1 0.2915
  SimpleImageCaptions112 svc_itt_recall_top5 0.6517
  SimpleImageCaptions112 svc_itt_recall_top10 0.7923
  SimpleImageCaptions112 svc_iti_recall_top1 0.1616
  SimpleImageCaptions

100%|██████████| 2586/2586 [04:02<00:00, 10.84it/s]


It 108612 epoch 42 Elapsed time 284.3229
    svc combine loss 2 0.0695
    total training loss 1.1142
    extractor loss 1 0.8632
    joint_embedding 0.1815


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.437
  CocoCapTest text2image_recall_top1 0.3285
  SimpleImageCaptions112 text2image_recall_top1 0.317
  SimpleImageCaptions112 svc_ttt_recall_top1 0.385
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8188
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9097
  SimpleImageCaptions112 svc_tti_recall_top1 0.2658
  SimpleImageCaptions112 svc_tti_recall_top5 0.5957
  SimpleImageCaptions112 svc_tti_recall_top10 0.7371
  SimpleImageCaptions112 svc_tit_recall_top1 0.199
  SimpleImageCaptions112 svc_tit_recall_top5 0.4991
  SimpleImageCaptions112 svc_tit_recall_top10 0.6591
  SimpleImageCaptions112 svc_tii_recall_top1 0.1235
  SimpleImageCaptions112 svc_tii_recall_top5 0.3468
  SimpleImageCaptions112 svc_tii_recall_top10 0.489
  SimpleImageCaptions112 svc_itt_recall_top1 0.2967
  SimpleImageCaptions112 svc_itt_recall_top5 0.6772
  SimpleImageCaptions112 svc_itt_recall_top10 0.8123
  SimpleImageCaptions112 svc_iti_recall_top1 0.1699
  SimpleImageCaptions112

100%|██████████| 2586/2586 [04:04<00:00, 10.60it/s]


It 111198 epoch 43 Elapsed time 284.9225
    svc combine loss 2 0.066
    total training loss 1.1171
    extractor loss 1 0.869
    joint_embedding 0.1821


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4299
  CocoCapTest text2image_recall_top1 0.346
  SimpleImageCaptions112 text2image_recall_top1 0.4259
  SimpleImageCaptions112 svc_ttt_recall_top1 0.4028
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8414
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9324
  SimpleImageCaptions112 svc_tti_recall_top1 0.2448
  SimpleImageCaptions112 svc_tti_recall_top5 0.5431
  SimpleImageCaptions112 svc_tti_recall_top10 0.6858
  SimpleImageCaptions112 svc_tit_recall_top1 0.1952
  SimpleImageCaptions112 svc_tit_recall_top5 0.5101
  SimpleImageCaptions112 svc_tit_recall_top10 0.663
  SimpleImageCaptions112 svc_tii_recall_top1 0.1269
  SimpleImageCaptions112 svc_tii_recall_top5 0.3478
  SimpleImageCaptions112 svc_tii_recall_top10 0.4857
  SimpleImageCaptions112 svc_itt_recall_top1 0.2908
  SimpleImageCaptions112 svc_itt_recall_top5 0.6533
  SimpleImageCaptions112 svc_itt_recall_top10 0.789
  SimpleImageCaptions112 svc_iti_recall_top1 0.1609
  SimpleImageCaptions1

100%|██████████| 2586/2586 [04:09<00:00, 10.38it/s]


It 113784 epoch 44 Elapsed time 290.3549
    svc combine loss 2 0.0684
    total training loss 1.1135
    extractor loss 1 0.865
    joint_embedding 0.1802


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.441
  CocoCapTest text2image_recall_top1 0.3518
  SimpleImageCaptions112 text2image_recall_top1 0.3366
  SimpleImageCaptions112 svc_ttt_recall_top1 0.4211
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8257
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9288
  SimpleImageCaptions112 svc_tti_recall_top1 0.2657
  SimpleImageCaptions112 svc_tti_recall_top5 0.5982
  SimpleImageCaptions112 svc_tti_recall_top10 0.7326
  SimpleImageCaptions112 svc_tit_recall_top1 0.2131
  SimpleImageCaptions112 svc_tit_recall_top5 0.5074
  SimpleImageCaptions112 svc_tit_recall_top10 0.6611
  SimpleImageCaptions112 svc_tii_recall_top1 0.1268
  SimpleImageCaptions112 svc_tii_recall_top5 0.3439
  SimpleImageCaptions112 svc_tii_recall_top10 0.4819
  SimpleImageCaptions112 svc_itt_recall_top1 0.2988
  SimpleImageCaptions112 svc_itt_recall_top5 0.6736
  SimpleImageCaptions112 svc_itt_recall_top10 0.8161
  SimpleImageCaptions112 svc_iti_recall_top1 0.1704
  SimpleImageCaption

100%|██████████| 2586/2586 [04:07<00:00, 11.06it/s]


It 116370 epoch 45 Elapsed time 290.1816
    svc combine loss 2 0.066
    total training loss 1.1206
    extractor loss 1 0.8748
    joint_embedding 0.1797


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4256
  CocoCapTest text2image_recall_top1 0.3548
  SimpleImageCaptions112 text2image_recall_top1 0.2929
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3327
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8077
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9087
  SimpleImageCaptions112 svc_tti_recall_top1 0.2332
  SimpleImageCaptions112 svc_tti_recall_top5 0.5607
  SimpleImageCaptions112 svc_tti_recall_top10 0.7062
  SimpleImageCaptions112 svc_tit_recall_top1 0.1987
  SimpleImageCaptions112 svc_tit_recall_top5 0.5255
  SimpleImageCaptions112 svc_tit_recall_top10 0.6751
  SimpleImageCaptions112 svc_tii_recall_top1 0.1252
  SimpleImageCaptions112 svc_tii_recall_top5 0.3462
  SimpleImageCaptions112 svc_tii_recall_top10 0.4806
  SimpleImageCaptions112 svc_itt_recall_top1 0.2854
  SimpleImageCaptions112 svc_itt_recall_top5 0.6495
  SimpleImageCaptions112 svc_itt_recall_top10 0.7966
  SimpleImageCaptions112 svc_iti_recall_top1 0.1664
  SimpleImageCaptio

100%|██████████| 2586/2586 [04:07<00:00, 10.44it/s]


It 118956 epoch 46 Elapsed time 291.0723
    svc combine loss 2 0.071
    total training loss 1.1252
    extractor loss 1 0.8728
    joint_embedding 0.1813


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.446
  CocoCapTest text2image_recall_top1 0.3431
  SimpleImageCaptions112 text2image_recall_top1 0.3652
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3541
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8097
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9113
  SimpleImageCaptions112 svc_tti_recall_top1 0.2466
  SimpleImageCaptions112 svc_tti_recall_top5 0.5539
  SimpleImageCaptions112 svc_tti_recall_top10 0.6933
  SimpleImageCaptions112 svc_tit_recall_top1 0.2027
  SimpleImageCaptions112 svc_tit_recall_top5 0.4934
  SimpleImageCaptions112 svc_tit_recall_top10 0.6419
  SimpleImageCaptions112 svc_tii_recall_top1 0.1211
  SimpleImageCaptions112 svc_tii_recall_top5 0.3336
  SimpleImageCaptions112 svc_tii_recall_top10 0.4646
  SimpleImageCaptions112 svc_itt_recall_top1 0.2857
  SimpleImageCaptions112 svc_itt_recall_top5 0.6717
  SimpleImageCaptions112 svc_itt_recall_top10 0.8092
  SimpleImageCaptions112 svc_iti_recall_top1 0.1659
  SimpleImageCaption

100%|██████████| 2586/2586 [04:02<00:00, 10.67it/s]


It 121542 epoch 47 Elapsed time 286.0153
    svc combine loss 2 0.066
    total training loss 1.1153
    extractor loss 1 0.8708
    joint_embedding 0.1785


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4288
  CocoCapTest text2image_recall_top1 0.3285
  SimpleImageCaptions112 text2image_recall_top1 0.3107
  SimpleImageCaptions112 svc_ttt_recall_top1 0.4468
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8336
  SimpleImageCaptions112 svc_ttt_recall_top10 0.9074
  SimpleImageCaptions112 svc_tti_recall_top1 0.2493
  SimpleImageCaptions112 svc_tti_recall_top5 0.5774
  SimpleImageCaptions112 svc_tti_recall_top10 0.72
  SimpleImageCaptions112 svc_tit_recall_top1 0.1941
  SimpleImageCaptions112 svc_tit_recall_top5 0.4963
  SimpleImageCaptions112 svc_tit_recall_top10 0.6436
  SimpleImageCaptions112 svc_tii_recall_top1 0.1186
  SimpleImageCaptions112 svc_tii_recall_top5 0.3358
  SimpleImageCaptions112 svc_tii_recall_top10 0.4636
  SimpleImageCaptions112 svc_itt_recall_top1 0.2898
  SimpleImageCaptions112 svc_itt_recall_top5 0.6607
  SimpleImageCaptions112 svc_itt_recall_top10 0.8064
  SimpleImageCaptions112 svc_iti_recall_top1 0.161
  SimpleImageCaptions1

100%|██████████| 2586/2586 [04:07<00:00, 10.45it/s]


It 124128 epoch 48 Elapsed time 290.5747
    svc combine loss 2 0.0675
    total training loss 1.1184
    extractor loss 1 0.8707
    joint_embedding 0.1802


  0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.4281
  CocoCapTest text2image_recall_top1 0.3399
  SimpleImageCaptions112 text2image_recall_top1 0.3152
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3874
  SimpleImageCaptions112 svc_ttt_recall_top5 0.7779
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8879
  SimpleImageCaptions112 svc_tti_recall_top1 0.2213
  SimpleImageCaptions112 svc_tti_recall_top5 0.5099
  SimpleImageCaptions112 svc_tti_recall_top10 0.6492
  SimpleImageCaptions112 svc_tit_recall_top1 0.1766
  SimpleImageCaptions112 svc_tit_recall_top5 0.4697
  SimpleImageCaptions112 svc_tit_recall_top10 0.6244
  SimpleImageCaptions112 svc_tii_recall_top1 0.1199
  SimpleImageCaptions112 svc_tii_recall_top5 0.3343
  SimpleImageCaptions112 svc_tii_recall_top10 0.4653
  SimpleImageCaptions112 svc_itt_recall_top1 0.2733
  SimpleImageCaptions112 svc_itt_recall_top5 0.6261
  SimpleImageCaptions112 svc_itt_recall_top10 0.7724
  SimpleImageCaptions112 svc_iti_recall_top1 0.1533
  SimpleImageCaptio

100%|██████████| 2586/2586 [04:06<00:00, 10.16it/s]


It 126714 epoch 49 Elapsed time 288.3528
    svc combine loss 2 0.0654
    total training loss 1.1191
    extractor loss 1 0.8722
    joint_embedding 0.1815
  CocoCapTrain text2image_recall_top1 0.4247
  CocoCapTest text2image_recall_top1 0.3495
  SimpleImageCaptions112 text2image_recall_top1 0.3384
  SimpleImageCaptions112 svc_ttt_recall_top1 0.3533
  SimpleImageCaptions112 svc_ttt_recall_top5 0.8042
  SimpleImageCaptions112 svc_ttt_recall_top10 0.8889
  SimpleImageCaptions112 svc_tti_recall_top1 0.2411
  SimpleImageCaptions112 svc_tti_recall_top5 0.533
  SimpleImageCaptions112 svc_tti_recall_top10 0.6779
  SimpleImageCaptions112 svc_tit_recall_top1 0.2074
  SimpleImageCaptions112 svc_tit_recall_top5 0.506
  SimpleImageCaptions112 svc_tit_recall_top10 0.6578
  SimpleImageCaptions112 svc_tii_recall_top1 0.1266
  SimpleImageCaptions112 svc_tii_recall_top5 0.3381
  SimpleImageCaptions112 svc_tii_recall_top10 0.4731
  SimpleImageCaptions112 svc_itt_recall_top1 0.2846
  SimpleImageCaptions

In [13]:
assert(False)

AssertionError: 

In [None]:
trainset.imgs[52234]

In [None]:
zip([1,2,3], [4, 5, 9, 10])

In [None]:
np.random.permutation(123)

In [None]:
########################################################################

In [None]:
model.eval()
img_features = []
img_labels = []
text_features = []
text_labels = []
for data in testset.get_loader(batch_size = opt.batch_size, shuffle = False, drop_last= False):
    # extract image features
    imgs = np.stack([d['image'] for d in data])
    imgs = torch.from_numpy(imgs).float()
    if len(imgs.shape) == 2:
        imgs = model.img_encoder.fc(imgs.cuda())
    else:
        imgs = model.img_encoder(imgs.cuda())
    imgs = model.snorm(imgs).cpu().detach().numpy()
    img_features += [imgs]
    img_labels += [d['label'] for d in data]

    # text
    texts = []
    for d in data:
        texts += d['captions']
        text_labels += [d['label'] for c in d['captions']]
    texts = model.text_encoder(texts)
    texts = model.snorm(texts).cpu().detach().numpy()
    text_features += [texts]

img_features = np.concatenate(img_features, axis=0)
text_features = np.concatenate(text_features, axis=0)

In [None]:
def show_test_img(i):
    f = testset.imgs[i]['filename']
    plt.imshow(torchvision.datasets.folder.pil_loader(f))
    plt.show()

In [None]:
i = np.random.randint(0, len(testset.imgs))
show_test_img(i)

j = np.random.randint(0, len(testset.imgs))
show_test_img(j)

k = np.random.randint(0, len(testset.imgs))
show_test_img(k)



In [None]:
x = img_features[i:(i+1),:]
x = model.subject_extractor(torch.from_numpy(x).cuda())
# x = model.text_encoder(['a woman'])

y = img_features[j:(j+1),:]
y = model.verb_extractor(torch.from_numpy(y).cuda())
#y = model.text_encoder(['playing tennis'])

z = img_features[k:(k+1),:]
z = model.context_extractor(torch.from_numpy(z).cuda())
z = model.text_encoder(['on the beach'])

x = model.svc_combine([x,y,z])
x = x.cpu().detach().numpy()

In [None]:
sim = x.dot(img_features.T)
s = -sim[0,:]
s = np.argsort(s)
for h in s[:10]:
    show_test_img(h)