In [1]:
import sys
import json
sys.path.append('./../')
sys.path.append('./../tirg/')
from main import *

In [2]:
# Parse the experiment options, then override the batch size and the dataset
# locations for this notebook session. The dataset paths are relative, so they
# are resolved against the directory the notebook kernel was started in.
opt = parse_opt() 
opt.batch_size = 32
opt.coco_path = '../../../../datasets/coco'
opt.sic112_path = '../../../../datasets/SIC112/'

# Scalar logger used by the training loop (via logger.add_scalar).
# NOTE(review): SummaryWriter comes from the star import of `main`;
# presumably the TensorBoard writer -- confirm.
logger = SummaryWriter(comment = opt.comment)

# load_datasets returns three values; the second one is not used here.
# NOTE(review): presumably (train split, test split, SIC112) -- confirm
# against load_datasets in main.py.
trainset, _, sic112 = load_datasets(opt)

17919 745
sucessfully loaded features
sucessfully loaded features
sucessfully loaded features


In [3]:
# Attach subject / verb / location annotations to every SIC112 image.
# They are derived from the image's first caption: the first word is the
# subject, a second word ending in "ing" is the verb, and whatever follows
# is the location phrase.
for img in sic112.imgs:
    words = img['captions'][0].split()
    img['subjects'] = [words[0]]
    # A caption without an "-ing" second word has no verb at all.
    verb_words = [words[1]] if words[1].endswith("ing") else []
    img['verbs'] = verb_words
    # The location starts right after the subject and the optional verb.
    img['locations'] = [' '.join(words[1 + len(verb_words):])]

In [4]:
# Add subject, verb and location annotations to COCO train 2014.
# Requires 'coco_splitted_captions_train2014.json' in the working directory;
# per the original note it has to be produced first by the `preprocess_coco`
# step.
id2img = {}
for img in trainset.imgs:
    id2img[img['id']] = img
    # Start from empty lists so that re-running this cell does not append
    # the same phrases a second time.
    img['subjects'] = []
    img['verbs'] = []
    img['locations'] = []

# Read the annotation file inside a context manager so the file handle is
# closed as soon as the JSON has been parsed.
with open('coco_splitted_captions_train2014.json', 'rt') as captions_file:
    split_captions = json.load(captions_file)['annotations']

# (key in the annotation record, key of the per-image list it is appended to)
phrase_targets = (
    ('subject_phrase', 'subjects'),
    ('verb_phrase', 'verbs'),
    ('location_phrase', 'locations'),
)
for caption in tqdm(split_captions):
    img = id2img[caption['image_id']]
    for phrase_key, target_key in phrase_targets:
        phrase = caption[phrase_key]
        # A phrase of None means the caption has no such part; skip it.
        if phrase is not None:
            img[target_key].append(phrase)
        
# update trainset.__getitem__
#trainset.old_get = trainset.__getitem__
#def new_get(self, idx):
#    item = self.old_get(idx)
#type(trainset).__getitem__ = type(type(trainset).__getitem__)(new_get, trainset, type(trainset))

100%|██████████| 414113/414113 [00:01<00:00, 290800.53it/s]


In [5]:
class One2OneTransformation(torch.nn.Module):
    """MLP that maps one embedding to another embedding of the same size.

    The input is passed through ``torch_functions.NormalizationLayer``
    (``learn_scale=False``) and then through the stack
    ``Linear(d, 2d) -> ReLU -> Linear(2d, 2d) -> BatchNorm1d -> ReLU ->
    Linear(2d, d)`` where ``d`` is ``embed_dim``.

    Args:
        embed_dim: size of the input and output embeddings. When ``None``
            (the default) the value is read from the notebook-level
            ``opt.embed_dim``, which is what the class always did before
            this parameter existed.
    """

    def __init__(self, embed_dim=None):
        super(One2OneTransformation, self).__init__()
        if embed_dim is None:
            # Backward-compatible fallback to the global experiment options.
            embed_dim = opt.embed_dim
        hidden_dim = embed_dim * 2
        # Attribute names `m` and `norm` are kept unchanged because they
        # determine the parameter names of the module.
        self.m = torch.nn.Sequential(
            torch.nn.Linear(embed_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.BatchNorm1d(hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, embed_dim)
        )
        self.norm = torch_functions.NormalizationLayer(learn_scale=False)

    def forward(self, x):
        """Normalize ``x`` and run it through the MLP.

        NOTE(review): assumes ``x`` is a 2-D ``(batch, embed_dim)`` tensor,
        as required by ``Linear``/``BatchNorm1d`` -- confirm with callers.
        """
        return self.m(self.norm(x))
    
class Three2OneTransformation(torch.nn.Module):
    """MLP that fuses several embeddings into a single embedding.

    Every input is passed through ``torch_functions.NormalizationLayer``
    (``learn_scale=False``), the results are concatenated along dimension 1
    and fed to the stack ``Linear(3d, 5d) -> ReLU -> Linear(5d, 5d) ->
    BatchNorm1d -> ReLU -> Linear(5d, d)`` where ``d`` is ``embed_dim``.

    Args:
        embed_dim: size of the output embedding; the first linear layer
            expects a concatenated input of width ``3 * embed_dim``. When
            ``None`` (the default) the value is read from the notebook-level
            ``opt.embed_dim``, which is what the class always did before
            this parameter existed.
    """

    def __init__(self, embed_dim=None):
        super(Three2OneTransformation, self).__init__()
        if embed_dim is None:
            # Backward-compatible fallback to the global experiment options.
            embed_dim = opt.embed_dim
        input_dim = embed_dim * 3
        hidden_dim = embed_dim * 5
        # Attribute names `m` and `norm` are kept unchanged because they
        # determine the parameter names of the module.
        self.m = torch.nn.Sequential(
            torch.nn.Linear(input_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.BatchNorm1d(hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, embed_dim)
        )
        self.norm = torch_functions.NormalizationLayer(learn_scale=False)

    def forward(self, x):
        """Fuse the tensors in ``x`` into one embedding per row.

        Args:
            x: iterable of tensors; each one is normalized separately and
                the results are concatenated along dimension 1.
                NOTE(review): presumably three ``(batch, embed_dim)``
                tensors in (subject, verb, location) order -- confirm
                with callers.
        """
        normalized = [self.norm(part) for part in x]
        return self.m(torch.cat(normalized, dim=1))

# Build the base model and attach the extra sub-modules used by the
# extract / compose losses and by test_svl: one extractor per caption part
# plus the module that combines the three parts into one embedding.
model = create_model(opt, trainset)
model.subject_extractor = One2OneTransformation()
model.verb_extractor = One2OneTransformation()
model.location_extractor = One2OneTransformation()
model.svl_combine = Three2OneTransformation() 
# Move the model, including the modules attached above, to the GPU.
model = model.cuda()
# The optimizer is created only after the extra modules were attached.
# NOTE(review): presumably create_optimizer collects the model's parameters
# at call time, so creating it earlier would leave the new modules
# untrained -- confirm in main.py.
optimizer = create_optimizer(opt, model)

In [6]:
def test_svl(model, testset, opt):
    """Measure subject/verb/location (SVL) composition retrieval on ``testset``.

    Image features are computed for the whole test set. Then, for every image
    that has a verb, five random queries are built whose target is that
    image's first caption. Each query holds, for the subject, the verb and
    the location, both the text of that part and the index of another image
    that shares it. For all eight text/image ('t'/'i') combinations the three
    parts are encoded (``model.text_encoder`` for text, the matching
    extractor applied to the image feature otherwise), fused with
    ``model.svl_combine`` and ranked against all image features by dot
    product. A query is a hit at k when one of the k best-ranked images has
    the query label as its first caption.

    The queries are drawn from a private ``RandomState(123)``, so they are
    identical on every call and the global NumPy RNG is left untouched.
    Images for which no other image shares the subject, verb or location
    are skipped instead of being sampled for forever.

    Args:
        model: model providing ``img_encoder``, ``snorm``, ``text_encoder``,
            ``subject_extractor``, ``verb_extractor``, ``location_extractor``
            and ``svl_combine``. It is switched to eval mode.
        testset: dataset exposing ``get_loader(...)`` and ``imgs``; every
            entry of ``imgs`` needs 'captions', 'subjects', 'verbs' and
            'locations'.
        opt: options object; only ``opt.batch_size`` is read.

    Returns:
        List of ``(metric_name, recall)`` tuples named
        ``svl_<s><v><l>_recall_top<k>`` for k in 1, 5, 10; empty when no
        query could be built.
    """
    model = model.eval()

    # all img features
    img_features = []
    for data in testset.get_loader(batch_size = opt.batch_size, shuffle = False, drop_last= False):
        imgs = np.stack([d['image'] for d in data])
        imgs = torch.from_numpy(imgs).float()
        if len(imgs.shape) == 2:
            # NOTE(review): a 2-D batch is presumably made of pre-extracted
            # features that only need the encoder's final layer -- confirm.
            imgs = model.img_encoder.fc(imgs.cuda())
        else:
            imgs = model.img_encoder(imgs.cuda())
        img_features.append(model.snorm(imgs).cpu().detach().numpy())
    img_features = np.concatenate(img_features, axis=0)
    img_labels = [img['captions'][0] for img in testset.imgs]

    # construct random queries
    # A private generator yields the same draws as np.random.seed(123) would,
    # without resetting the global NumPy random state as a side effect.
    rng = np.random.RandomState(123)
    num_imgs = len(testset.imgs)
    parts = ('subjects', 'verbs', 'locations')

    # How many images carry each value as their first subject/verb/location.
    # A count below 2 means no *other* image can ever be sampled for it.
    value_counts = {}
    for part in parts:
        counts = {}
        for img in testset.imgs:
            if len(img[part]) > 0:
                counts[img[part][0]] = counts.get(img[part][0], 0) + 1
        value_counts[part] = counts

    def sample_partner(img, part):
        """Rejection-sample the index of another image sharing img's first `part`."""
        while True:
            idx = rng.randint(0, num_imgs)
            other = testset.imgs[idx]
            if len(other[part]) == 0:
                continue
            if img[part][0] == other[part][0] and img is not other:
                return idx

    queries = []
    for _ in range(5):
        for img in testset.imgs:
            if len(img['verbs']) == 0:
                continue
            if any(value_counts[part][img[part][0]] < 2 for part in parts):
                # No partner image exists; sampling would never terminate.
                continue
            # Draw order (subject, verb, location) determines the queries.
            i = sample_partner(img, 'subjects')
            j = sample_partner(img, 'verbs')
            k = sample_partner(img, 'locations')
            queries.append({
                'subject_img_id': i,
                'verb_img_id': j,
                'location_img_id': k,
                'subject': img['subjects'][0],
                'verb': testset.imgs[j]['verbs'][0],
                'location': img['locations'][0],
                'label': img['captions'][0]
            })

    r = []
    if len(queries) == 0:
        # Nothing to evaluate (np.concatenate below would fail on no input).
        return r
    query_labels = [q['label'] for q in queries]

    def encode_part(batch, mode, part, extractor):
        """Encode one query part from an image feature ('i') or from text."""
        if mode == 'i':
            img_ids = [q[part + '_img_id'] for q in batch]
            return extractor(torch.from_numpy(img_features[img_ids, :]).cuda())
        return model.text_encoder([q[part] for q in batch])

    def recall_metrics(query_features, name):
        """Return the top-1/5/10 recall tuples for one query setting."""
        sims = query_features.dot(img_features.T)
        ks = [1, 5, 10]
        hits = [0.0] * len(ks)
        for qi in range(sims.shape[0]):
            # Rank once per query and reuse the ranking for every k.
            ranked = np.argsort(-sims[qi, :])
            for pos, k in enumerate(ks):
                if query_labels[qi] in [img_labels[idx] for idx in ranked[:k]]:
                    hits[pos] += 1
        return [('svl_' + name + '_recall_top' + str(k), hit / sims.shape[0])
                for k, hit in zip(ks, hits)]

    # 't' = part given as text, 'i' = part given as an image; order is
    # ttt, tti, tit, tii, itt, iti, iit, iii.
    settings = [(s, v, l) for s in ['t', 'i'] for v in ['t', 'i'] for l in ['t', 'i']]
    for s, v, l in settings:
        # compute query features
        query_features = []
        for start in range(0, len(queries), opt.batch_size):
            batch = queries[start:start + opt.batch_size]
            subjects = encode_part(batch, s, 'subject', model.subject_extractor)
            verbs = encode_part(batch, v, 'verb', model.verb_extractor)
            locations = encode_part(batch, l, 'location', model.location_extractor)
            svl = model.svl_combine([subjects, verbs, locations])
            query_features.append(svl.cpu().detach().numpy())
        query_features = np.concatenate(query_features, axis=0)

        # compute recall
        r += recall_metrics(query_features, s + v + l)
    return r

def test(model, testset, opt):
    """Run the retrieval benchmarks for ``testset``.

    Text-to-image retrieval is always evaluated; the SVL composition test is
    added when the dataset name contains '112'.

    Returns:
        The list of ``(metric_name, value)`` results.
    """
    # Datasets with fewer than 10000 items pass their full size, larger ones
    # pass 1100, as the fourth argument of test_text_to_image_retrieval.
    # NOTE(review): presumably the number of samples to evaluate -- confirm.
    n = len(testset) if len(testset) < 10000 else 1100
    results = test_text_to_image_retrieval(model, testset, opt, n)
    if '112' in testset.name():
        results += test_svl(model, testset, opt)
    return results

In [7]:
def compute_losses(model, data, losses_tracking, add_extract_compose_losses = True):
    """Compute the weighted training loss for one batch and record its parts.

    Losses (name, weight):
      * 'joint_embedding' (1): ``model.pair_loss`` between a random caption
        of every image and the image itself.
      * 'extract' (1): the subject/verb/location extractors, applied to the
        detached text or image embedding (picked at random per extractor),
        against the detached encoded subject/verb/location phrases.
      * 'compose1' (0.5): ``model.svl_combine`` of the encoded phrases
        against the detached image embedding or the detached encoding of
        the joined phrase (picked at random).
      * 'compose2' (0.5): ``model.svl_combine`` of the extracted parts,
        each shuffled independently across the batch, against the detached
        encoding of the correspondingly recombined phrases.

    The last three are only added when ``add_extract_compose_losses`` is true
    and every image of the batch has at least one subject, verb and location
    phrase; otherwise the batch contributes the joint embedding loss only.
    The phrases are looked up in the notebook-level ``trainset`` through
    ``d['index']``.

    Args:
        model: model providing ``img_encoder``, ``text_encoder``,
            ``pair_loss``, the three extractors and ``svl_combine``.
        data: batch as a list of dicts with 'image', 'captions' and 'index'.
        losses_tracking: dict mapping loss name to the list of recorded
            values; updated in place.
        add_extract_compose_losses: whether to add the three extra losses.

    Returns:
        The total loss tensor (sum of weight * value).

    Raises:
        AssertionError: if the total loss is NaN.
    """
    losses = []

    # joint embedding loss
    imgs = np.stack([d['image'] for d in data])
    imgs = torch.from_numpy(imgs).float()
    if len(imgs.shape) == 2:
        imgs = model.img_encoder.fc(imgs.cuda())
    else:
        imgs = model.img_encoder(imgs.cuda())
    texts = [random.choice(d['captions']) for d in data]
    texts = model.text_encoder(texts)
    losses.append(('joint_embedding', 1, model.pair_loss(texts, imgs).cuda()))

    def do_add_extract_compose_losses():
        try:
            subjects = [random.choice(trainset.imgs[d['index']]['subjects']) for d in data]
            verbs = [random.choice(trainset.imgs[d['index']]['verbs']) for d in data]
            locations = [random.choice(trainset.imgs[d['index']]['locations']) for d in data]
        except IndexError:
            # random.choice raises IndexError on an empty list: at least one
            # image of the batch lacks a phrase, so skip the extra losses.
            # Any other error is a real problem and must not be hidden.
            return
        encoded_subjects = model.text_encoder(subjects).detach()
        encoded_verbs = model.text_encoder(verbs).detach()
        encoded_locations = model.text_encoder(locations).detach()
        extracted_subjects = model.subject_extractor(random.choice([texts, imgs]).detach())
        extracted_verbs = model.verb_extractor(random.choice([texts, imgs]).detach())
        extracted_location = model.location_extractor(random.choice([texts, imgs]).detach())

        # extract
        loss_value = model.pair_loss(
            torch.cat([extracted_subjects, extracted_verbs, extracted_location]),
            torch.cat([encoded_subjects, encoded_verbs, encoded_locations])
        ).cuda()
        losses.append(('extract', 1, loss_value))

        # compose with encoded
        loss_value = model.pair_loss(
            model.svl_combine([encoded_subjects, encoded_verbs, encoded_locations]),
            random.choice([imgs, model.text_encoder([s + ' ' + v + ' ' + l for s, v, l in zip(subjects, verbs, locations)])]).detach()
        ).cuda()
        losses.append(('compose1', 0.5, loss_value))

        # shuffle: an independent permutation per part, so the recombined
        # triples mix parts that come from different images.
        # list(...) is needed because random.shuffle works in place.
        shuffled_subjects_indices = list(range(len(data)))
        shuffled_verbs_indices = list(range(len(data)))
        shuffled_locations_indices = list(range(len(data)))
        random.shuffle(shuffled_subjects_indices)
        random.shuffle(shuffled_verbs_indices)
        random.shuffle(shuffled_locations_indices)
        extracted_subjects = extracted_subjects[shuffled_subjects_indices,:]
        extracted_verbs = extracted_verbs[shuffled_verbs_indices,:]
        extracted_location = extracted_location[shuffled_locations_indices,:]
        subjects = [subjects[i] for i in shuffled_subjects_indices]
        verbs = [verbs[i] for i in shuffled_verbs_indices]
        locations = [locations[i] for i in shuffled_locations_indices]

        # compose with extracted
        loss_value = model.pair_loss(
            model.svl_combine([extracted_subjects, extracted_verbs, extracted_location]),
            model.text_encoder([s + ' ' + v + ' ' + l for s, v, l in zip(subjects, verbs, locations)]).detach()
        ).cuda()
        losses.append(('compose2', 0.5, loss_value))

    if add_extract_compose_losses:
        do_add_extract_compose_losses()

    # total loss
    total_loss = sum([loss_weight * loss_value for loss_name, loss_weight, loss_value in losses])
    assert not torch.isnan(total_loss), 'total training loss is NaN'
    losses.append(('total training loss', None, total_loss))

    # save losses
    for loss_name, _, loss_value in losses:
        losses_tracking.setdefault(loss_name, []).append(float(loss_value.item()))
    return total_loss

def train_1_epoch(model, optimizer, trainset, opt, losses_tracking, add_extract_compose_losses = True):
    """Train ``model`` for one pass over ``trainset``.

    The model is put in training mode, then every shuffled batch (incomplete
    last batch dropped) goes through ``compute_losses`` followed by one
    optimizer step. Per-batch loss values are appended to
    ``losses_tracking`` by ``compute_losses``.
    """
    model.train()
    batches = trainset.get_loader(
        batch_size=opt.batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=opt.loader_num_workers,
    )
    for batch in tqdm(batches, desc = 'training 1 epoch'):
        loss = compute_losses(model, batch, losses_tracking, add_extract_compose_losses)
        # Clear the gradients of the previous step before backpropagating.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [8]:
# train loop
# Each iteration logs the running losses, evaluates the model and then trains
# for one epoch. Evaluation sits at the top of the iteration, so the model is
# evaluated once before any training and once more after the final epoch,
# right before the loop is left.
# NOTE: the print statements below are Python 2 syntax.
losses_tracking = {}
epoch = 0
tic = time.time()
while True:

    # show stat, training losses
    print 'Epoch', epoch, 'Elapsed time', round(time.time() - tic, 4)
    tic = time.time()
    for loss_name in losses_tracking:
        # Average over the 250 most recently recorded values of this loss.
        avg_loss = np.mean(losses_tracking[loss_name][-250:])
        print '   ', loss_name, round(avg_loss, 4)
        logger.add_scalar(loss_name, avg_loss, epoch)

    # test
    tests = []
    for dataset in [trainset, sic112]:
        t = test(model, dataset, opt)
        # Prefix every metric with the dataset name to keep them apart.
        tests += [(dataset.name() + ' ' + metric_name, metric_value) for metric_name, metric_value in t]
    for metric_name, metric_value in tests:
        print ' ', metric_name, round(metric_value, 4)
        logger.add_scalar(metric_name, metric_value, epoch)

    # train
    if epoch >= opt.num_epochs:
        break
    # The extract / compose losses are switched on from the second epoch on;
    # epoch 0 trains with the joint embedding loss only.
    train_1_epoch(model, optimizer, trainset, opt, losses_tracking,
                  add_extract_compose_losses = epoch>=1)
    epoch += 1

    # learning rate scheduling: multiply the learning rate of every parameter
    # group by 0.1 each opt.learning_rate_decay_frequency epochs
    if epoch % opt.learning_rate_decay_frequency == 0:
        for g in optimizer.param_groups:
            g['lr'] *= 0.1

Epoch 0 Elapsed time 0.0012


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.1289
  SimpleImageCaptions112 text2image_recall_top1 0.1578
  SimpleImageCaptions112 svl_ttt_recall_top1 0.0
  SimpleImageCaptions112 svl_ttt_recall_top5 0.0368
  SimpleImageCaptions112 svl_ttt_recall_top10 0.0614
  SimpleImageCaptions112 svl_tti_recall_top1 0.0
  SimpleImageCaptions112 svl_tti_recall_top5 0.0368
  SimpleImageCaptions112 svl_tti_recall_top10 0.0737
  SimpleImageCaptions112 svl_tit_recall_top1 0.0
  SimpleImageCaptions112 svl_tit_recall_top5 0.0246
  SimpleImageCaptions112 svl_tit_recall_top10 0.0614
  SimpleImageCaptions112 svl_tii_recall_top1 0.0
  SimpleImageCaptions112 svl_tii_recall_top5 0.0268
  SimpleImageCaptions112 svl_tii_recall_top10 0.0737
  SimpleImageCaptions112 svl_itt_recall_top1 0.0
  SimpleImageCaptions112 svl_itt_recall_top5 0.0246
  SimpleImageCaptions112 svl_itt_recall_top10 0.0562
  SimpleImageCaptions112 svl_iti_recall_top1 0.0
  SimpleImageCaptions112 svl_iti_recall_top5 0.0246
  SimpleImageCaptions112 svl_

training 1 epoch: 100%|██████████| 2586/2586 [02:53<00:00, 14.88it/s]


Epoch 2 Elapsed time 209.0682
    total training loss 3.1741
    extract 2.7595
    joint_embedding 0.6088
    compose2 1.453
    compose1 0.4708


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.1662
  SimpleImageCaptions112 text2image_recall_top1 0.2157
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2038
  SimpleImageCaptions112 svl_ttt_recall_top5 0.577
  SimpleImageCaptions112 svl_ttt_recall_top10 0.822
  SimpleImageCaptions112 svl_tti_recall_top1 0.1875
  SimpleImageCaptions112 svl_tti_recall_top5 0.4839
  SimpleImageCaptions112 svl_tti_recall_top10 0.6373
  SimpleImageCaptions112 svl_tit_recall_top1 0.1905
  SimpleImageCaptions112 svl_tit_recall_top5 0.5096
  SimpleImageCaptions112 svl_tit_recall_top10 0.6718
  SimpleImageCaptions112 svl_tii_recall_top1 0.1405
  SimpleImageCaptions112 svl_tii_recall_top5 0.3865
  SimpleImageCaptions112 svl_tii_recall_top10 0.5351
  SimpleImageCaptions112 svl_itt_recall_top1 0.1729
  SimpleImageCaptions112 svl_itt_recall_top5 0.5131
  SimpleImageCaptions112 svl_itt_recall_top10 0.6877
  SimpleImageCaptions112 svl_iti_recall_top1 0.1187
  SimpleImageCaptions112 svl_iti_recall_top5 0.3751
  SimpleImage

training 1 epoch: 100%|██████████| 2586/2586 [02:52<00:00, 14.95it/s]


Epoch 3 Elapsed time 208.6189
    total training loss 3.0037
    extract 2.6726
    joint_embedding 0.5718
    compose2 1.2877
    compose1 0.4675


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.1886
  SimpleImageCaptions112 text2image_recall_top1 0.3297
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2904
  SimpleImageCaptions112 svl_ttt_recall_top5 0.55
  SimpleImageCaptions112 svl_ttt_recall_top10 0.7986
  SimpleImageCaptions112 svl_tti_recall_top1 0.192
  SimpleImageCaptions112 svl_tti_recall_top5 0.5622
  SimpleImageCaptions112 svl_tti_recall_top10 0.7238
  SimpleImageCaptions112 svl_tit_recall_top1 0.1732
  SimpleImageCaptions112 svl_tit_recall_top5 0.4896
  SimpleImageCaptions112 svl_tit_recall_top10 0.6731
  SimpleImageCaptions112 svl_tii_recall_top1 0.1294
  SimpleImageCaptions112 svl_tii_recall_top5 0.4031
  SimpleImageCaptions112 svl_tii_recall_top10 0.5627
  SimpleImageCaptions112 svl_itt_recall_top1 0.2043
  SimpleImageCaptions112 svl_itt_recall_top5 0.5768
  SimpleImageCaptions112 svl_itt_recall_top10 0.7657
  SimpleImageCaptions112 svl_iti_recall_top1 0.1517
  SimpleImageCaptions112 svl_iti_recall_top5 0.4293
  SimpleImageC

training 1 epoch: 100%|██████████| 2586/2586 [02:53<00:00, 14.87it/s]


Epoch 4 Elapsed time 209.4395
    total training loss 3.0934
    extract 2.6032
    joint_embedding 0.524
    compose2 1.2618
    compose1 0.414


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2004
  SimpleImageCaptions112 text2image_recall_top1 0.1691
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2106
  SimpleImageCaptions112 svl_ttt_recall_top5 0.4899
  SimpleImageCaptions112 svl_ttt_recall_top10 0.7066
  SimpleImageCaptions112 svl_tti_recall_top1 0.172
  SimpleImageCaptions112 svl_tti_recall_top5 0.5272
  SimpleImageCaptions112 svl_tti_recall_top10 0.6998
  SimpleImageCaptions112 svl_tit_recall_top1 0.1461
  SimpleImageCaptions112 svl_tit_recall_top5 0.4684
  SimpleImageCaptions112 svl_tit_recall_top10 0.6397
  SimpleImageCaptions112 svl_tii_recall_top1 0.1395
  SimpleImageCaptions112 svl_tii_recall_top5 0.4335
  SimpleImageCaptions112 svl_tii_recall_top10 0.5982
  SimpleImageCaptions112 svl_itt_recall_top1 0.174
  SimpleImageCaptions112 svl_itt_recall_top5 0.4765
  SimpleImageCaptions112 svl_itt_recall_top10 0.6579
  SimpleImageCaptions112 svl_iti_recall_top1 0.1441
  SimpleImageCaptions112 svl_iti_recall_top5 0.4465
  SimpleImage

training 1 epoch: 100%|██████████| 2586/2586 [02:53<00:00, 14.89it/s]


Epoch 5 Elapsed time 209.5628
    total training loss 2.8595
    extract 2.554
    joint_embedding 0.4984
    compose2 1.1957
    compose1 0.3705


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.206
  SimpleImageCaptions112 text2image_recall_top1 0.2633
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2406
  SimpleImageCaptions112 svl_ttt_recall_top5 0.531
  SimpleImageCaptions112 svl_ttt_recall_top10 0.7041
  SimpleImageCaptions112 svl_tti_recall_top1 0.1734
  SimpleImageCaptions112 svl_tti_recall_top5 0.4758
  SimpleImageCaptions112 svl_tti_recall_top10 0.62
  SimpleImageCaptions112 svl_tit_recall_top1 0.1831
  SimpleImageCaptions112 svl_tit_recall_top5 0.5413
  SimpleImageCaptions112 svl_tit_recall_top10 0.713
  SimpleImageCaptions112 svl_tii_recall_top1 0.1366
  SimpleImageCaptions112 svl_tii_recall_top5 0.4114
  SimpleImageCaptions112 svl_tii_recall_top10 0.5644
  SimpleImageCaptions112 svl_itt_recall_top1 0.201
  SimpleImageCaptions112 svl_itt_recall_top5 0.5763
  SimpleImageCaptions112 svl_itt_recall_top10 0.7466
  SimpleImageCaptions112 svl_iti_recall_top1 0.1721
  SimpleImageCaptions112 svl_iti_recall_top5 0.4618
  SimpleImageCapt

training 1 epoch: 100%|██████████| 2586/2586 [02:53<00:00, 14.35it/s]


Epoch 6 Elapsed time 209.7755
    total training loss 2.6107
    extract 2.5112
    joint_embedding 0.47
    compose2 1.1457
    compose1 0.3663


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2217
  SimpleImageCaptions112 text2image_recall_top1 0.2946
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2775
  SimpleImageCaptions112 svl_ttt_recall_top5 0.6532
  SimpleImageCaptions112 svl_ttt_recall_top10 0.7772
  SimpleImageCaptions112 svl_tti_recall_top1 0.1961
  SimpleImageCaptions112 svl_tti_recall_top5 0.5599
  SimpleImageCaptions112 svl_tti_recall_top10 0.7271
  SimpleImageCaptions112 svl_tit_recall_top1 0.2201
  SimpleImageCaptions112 svl_tit_recall_top5 0.5542
  SimpleImageCaptions112 svl_tit_recall_top10 0.7139
  SimpleImageCaptions112 svl_tii_recall_top1 0.1375
  SimpleImageCaptions112 svl_tii_recall_top5 0.3951
  SimpleImageCaptions112 svl_tii_recall_top10 0.5612
  SimpleImageCaptions112 svl_itt_recall_top1 0.2474
  SimpleImageCaptions112 svl_itt_recall_top5 0.6295
  SimpleImageCaptions112 svl_itt_recall_top10 0.8009
  SimpleImageCaptions112 svl_iti_recall_top1 0.1767
  SimpleImageCaptions112 svl_iti_recall_top5 0.4868
  SimpleIma

training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.85it/s]


Epoch 7 Elapsed time 210.0035
    total training loss 2.734
    extract 2.4719
    joint_embedding 0.4289
    compose2 1.1028
    compose1 0.3556


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2442
  SimpleImageCaptions112 text2image_recall_top1 0.3893
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2505
  SimpleImageCaptions112 svl_ttt_recall_top5 0.5936
  SimpleImageCaptions112 svl_ttt_recall_top10 0.8478
  SimpleImageCaptions112 svl_tti_recall_top1 0.2091
  SimpleImageCaptions112 svl_tti_recall_top5 0.5288
  SimpleImageCaptions112 svl_tti_recall_top10 0.677
  SimpleImageCaptions112 svl_tit_recall_top1 0.2131
  SimpleImageCaptions112 svl_tit_recall_top5 0.5597
  SimpleImageCaptions112 svl_tit_recall_top10 0.719
  SimpleImageCaptions112 svl_tii_recall_top1 0.1467
  SimpleImageCaptions112 svl_tii_recall_top5 0.4354
  SimpleImageCaptions112 svl_tii_recall_top10 0.5926
  SimpleImageCaptions112 svl_itt_recall_top1 0.2123
  SimpleImageCaptions112 svl_itt_recall_top5 0.6304
  SimpleImageCaptions112 svl_itt_recall_top10 0.794
  SimpleImageCaptions112 svl_iti_recall_top1 0.1832
  SimpleImageCaptions112 svl_iti_recall_top5 0.5037
  SimpleImageC

training 1 epoch: 100%|██████████| 2586/2586 [02:53<00:00, 14.87it/s]


Epoch 8 Elapsed time 209.4456
    total training loss 2.4256
    extract 2.4631
    joint_embedding 0.4171
    compose2 1.0581
    compose1 0.3422


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2442
  SimpleImageCaptions112 text2image_recall_top1 0.2628
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3413
  SimpleImageCaptions112 svl_ttt_recall_top5 0.7452
  SimpleImageCaptions112 svl_ttt_recall_top10 0.9091
  SimpleImageCaptions112 svl_tti_recall_top1 0.2451
  SimpleImageCaptions112 svl_tti_recall_top5 0.6638
  SimpleImageCaptions112 svl_tti_recall_top10 0.8194
  SimpleImageCaptions112 svl_tit_recall_top1 0.2359
  SimpleImageCaptions112 svl_tit_recall_top5 0.5799
  SimpleImageCaptions112 svl_tit_recall_top10 0.7382
  SimpleImageCaptions112 svl_tii_recall_top1 0.1526
  SimpleImageCaptions112 svl_tii_recall_top5 0.4511
  SimpleImageCaptions112 svl_tii_recall_top10 0.6
  SimpleImageCaptions112 svl_itt_recall_top1 0.2033
  SimpleImageCaptions112 svl_itt_recall_top5 0.6297
  SimpleImageCaptions112 svl_itt_recall_top10 0.7831
  SimpleImageCaptions112 svl_iti_recall_top1 0.1867
  SimpleImageCaptions112 svl_iti_recall_top5 0.5181
  SimpleImageC

training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.85it/s]


Epoch 9 Elapsed time 210.0769
    total training loss 2.3714
    extract 2.4432
    joint_embedding 0.4277
    compose2 1.066
    compose1 0.3157


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2488
  SimpleImageCaptions112 text2image_recall_top1 0.2633
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2603
  SimpleImageCaptions112 svl_ttt_recall_top5 0.6955
  SimpleImageCaptions112 svl_ttt_recall_top10 0.8177
  SimpleImageCaptions112 svl_tti_recall_top1 0.2708
  SimpleImageCaptions112 svl_tti_recall_top5 0.6172
  SimpleImageCaptions112 svl_tti_recall_top10 0.7788
  SimpleImageCaptions112 svl_tit_recall_top1 0.243
  SimpleImageCaptions112 svl_tit_recall_top5 0.5888
  SimpleImageCaptions112 svl_tit_recall_top10 0.7521
  SimpleImageCaptions112 svl_tii_recall_top1 0.2031
  SimpleImageCaptions112 svl_tii_recall_top5 0.5197
  SimpleImageCaptions112 svl_tii_recall_top10 0.685
  SimpleImageCaptions112 svl_itt_recall_top1 0.2099
  SimpleImageCaptions112 svl_itt_recall_top5 0.5871
  SimpleImageCaptions112 svl_itt_recall_top10 0.7641
  SimpleImageCaptions112 svl_iti_recall_top1 0.1928
  SimpleImageCaptions112 svl_iti_recall_top5 0.5206
  SimpleImage

training 1 epoch: 100%|██████████| 2586/2586 [02:53<00:00, 14.92it/s]


Epoch 10 Elapsed time 209.0978
    total training loss 2.4619
    extract 2.4193
    joint_embedding 0.4006
    compose2 1.0369
    compose1 0.3297


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2541
  SimpleImageCaptions112 text2image_recall_top1 0.3405
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2449
  SimpleImageCaptions112 svl_ttt_recall_top5 0.6808
  SimpleImageCaptions112 svl_ttt_recall_top10 0.8496
  SimpleImageCaptions112 svl_tti_recall_top1 0.24
  SimpleImageCaptions112 svl_tti_recall_top5 0.6176
  SimpleImageCaptions112 svl_tti_recall_top10 0.764
  SimpleImageCaptions112 svl_tit_recall_top1 0.2185
  SimpleImageCaptions112 svl_tit_recall_top5 0.5605
  SimpleImageCaptions112 svl_tit_recall_top10 0.7045
  SimpleImageCaptions112 svl_tii_recall_top1 0.175
  SimpleImageCaptions112 svl_tii_recall_top5 0.4831
  SimpleImageCaptions112 svl_tii_recall_top10 0.6349
  SimpleImageCaptions112 svl_itt_recall_top1 0.2296
  SimpleImageCaptions112 svl_itt_recall_top5 0.6053
  SimpleImageCaptions112 svl_itt_recall_top10 0.7624
  SimpleImageCaptions112 svl_iti_recall_top1 0.2037
  SimpleImageCaptions112 svl_iti_recall_top5 0.5236
  SimpleImageCa

training 1 epoch: 100%|██████████| 2586/2586 [02:53<00:00, 14.98it/s]


Epoch 11 Elapsed time 209.565
    total training loss 2.4806
    extract 2.389
    joint_embedding 0.4055
    compose2 0.982
    compose1 0.3106


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.269
  SimpleImageCaptions112 text2image_recall_top1 0.3292
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2971
  SimpleImageCaptions112 svl_ttt_recall_top5 0.7262
  SimpleImageCaptions112 svl_ttt_recall_top10 0.8895
  SimpleImageCaptions112 svl_tti_recall_top1 0.2435
  SimpleImageCaptions112 svl_tti_recall_top5 0.6002
  SimpleImageCaptions112 svl_tti_recall_top10 0.7608
  SimpleImageCaptions112 svl_tit_recall_top1 0.2316
  SimpleImageCaptions112 svl_tit_recall_top5 0.5778
  SimpleImageCaptions112 svl_tit_recall_top10 0.7272
  SimpleImageCaptions112 svl_tii_recall_top1 0.1682
  SimpleImageCaptions112 svl_tii_recall_top5 0.4593
  SimpleImageCaptions112 svl_tii_recall_top10 0.6097
  SimpleImageCaptions112 svl_itt_recall_top1 0.2866
  SimpleImageCaptions112 svl_itt_recall_top5 0.6686
  SimpleImageCaptions112 svl_itt_recall_top10 0.8108
  SimpleImageCaptions112 svl_iti_recall_top1 0.2016
  SimpleImageCaptions112 svl_iti_recall_top5 0.5385
  SimpleImag

training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.85it/s]


Epoch 12 Elapsed time 210.0538
    total training loss 2.2738
    extract 2.4088
    joint_embedding 0.4079
    compose2 1.0094
    compose1 0.3215


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2565
  SimpleImageCaptions112 text2image_recall_top1 0.2832
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2081
  SimpleImageCaptions112 svl_ttt_recall_top5 0.5709
  SimpleImageCaptions112 svl_ttt_recall_top10 0.8748
  SimpleImageCaptions112 svl_tti_recall_top1 0.1763
  SimpleImageCaptions112 svl_tti_recall_top5 0.4904
  SimpleImageCaptions112 svl_tti_recall_top10 0.6658
  SimpleImageCaptions112 svl_tit_recall_top1 0.2004
  SimpleImageCaptions112 svl_tit_recall_top5 0.5705
  SimpleImageCaptions112 svl_tit_recall_top10 0.7402
  SimpleImageCaptions112 svl_tii_recall_top1 0.1713
  SimpleImageCaptions112 svl_tii_recall_top5 0.4641
  SimpleImageCaptions112 svl_tii_recall_top10 0.6158
  SimpleImageCaptions112 svl_itt_recall_top1 0.2459
  SimpleImageCaptions112 svl_itt_recall_top5 0.6367
  SimpleImageCaptions112 svl_itt_recall_top10 0.7896
  SimpleImageCaptions112 svl_iti_recall_top1 0.2058
  SimpleImageCaptions112 svl_iti_recall_top5 0.5301
  SimpleIma

training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.83it/s]


Epoch 13 Elapsed time 210.0805
    total training loss 2.4414
    extract 2.3864
    joint_embedding 0.3963
    compose2 0.9924
    compose1 0.2753


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2709
  SimpleImageCaptions112 text2image_recall_top1 0.2764
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3112
  SimpleImageCaptions112 svl_ttt_recall_top5 0.6771
  SimpleImageCaptions112 svl_ttt_recall_top10 0.8441
  SimpleImageCaptions112 svl_tti_recall_top1 0.2352
  SimpleImageCaptions112 svl_tti_recall_top5 0.5886
  SimpleImageCaptions112 svl_tti_recall_top10 0.7462
  SimpleImageCaptions112 svl_tit_recall_top1 0.234
  SimpleImageCaptions112 svl_tit_recall_top5 0.5937
  SimpleImageCaptions112 svl_tit_recall_top10 0.7505
  SimpleImageCaptions112 svl_tii_recall_top1 0.1718
  SimpleImageCaptions112 svl_tii_recall_top5 0.4829
  SimpleImageCaptions112 svl_tii_recall_top10 0.6414
  SimpleImageCaptions112 svl_itt_recall_top1 0.2657
  SimpleImageCaptions112 svl_itt_recall_top5 0.6368
  SimpleImageCaptions112 svl_itt_recall_top10 0.8009
  SimpleImageCaptions112 svl_iti_recall_top1 0.192
  SimpleImageCaptions112 svl_iti_recall_top5 0.517
  SimpleImageC

training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.82it/s]


Epoch 14 Elapsed time 210.3731
    total training loss 2.4901
    extract 2.3692
    joint_embedding 0.3778
    compose2 0.9787
    compose1 0.2924


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2599
  SimpleImageCaptions112 text2image_recall_top1 0.2571
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2284
  SimpleImageCaptions112 svl_ttt_recall_top5 0.7624
  SimpleImageCaptions112 svl_ttt_recall_top10 0.8674
  SimpleImageCaptions112 svl_tti_recall_top1 0.2087
  SimpleImageCaptions112 svl_tti_recall_top5 0.5627
  SimpleImageCaptions112 svl_tti_recall_top10 0.7289
  SimpleImageCaptions112 svl_tit_recall_top1 0.2394
  SimpleImageCaptions112 svl_tit_recall_top5 0.6178
  SimpleImageCaptions112 svl_tit_recall_top10 0.7683
  SimpleImageCaptions112 svl_tii_recall_top1 0.1853
  SimpleImageCaptions112 svl_tii_recall_top5 0.5045
  SimpleImageCaptions112 svl_tii_recall_top10 0.6587
  SimpleImageCaptions112 svl_itt_recall_top1 0.2495
  SimpleImageCaptions112 svl_itt_recall_top5 0.6452
  SimpleImageCaptions112 svl_itt_recall_top10 0.8028
  SimpleImageCaptions112 svl_iti_recall_top1 0.1942
  SimpleImageCaptions112 svl_iti_recall_top5 0.5239
  SimpleIma

training 1 epoch: 100%|██████████| 2586/2586 [02:55<00:00, 15.13it/s]


Epoch 15 Elapsed time 211.3697
    total training loss 2.4288
    extract 2.3744
    joint_embedding 0.3757
    compose2 0.9635
    compose1 0.3096


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2674
  SimpleImageCaptions112 text2image_recall_top1 0.2452
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2891
  SimpleImageCaptions112 svl_ttt_recall_top5 0.7858
  SimpleImageCaptions112 svl_ttt_recall_top10 0.8913
  SimpleImageCaptions112 svl_tti_recall_top1 0.2287
  SimpleImageCaptions112 svl_tti_recall_top5 0.5998
  SimpleImageCaptions112 svl_tti_recall_top10 0.7589
  SimpleImageCaptions112 svl_tit_recall_top1 0.2258
  SimpleImageCaptions112 svl_tit_recall_top5 0.5924
  SimpleImageCaptions112 svl_tit_recall_top10 0.7484
  SimpleImageCaptions112 svl_tii_recall_top1 0.1775
  SimpleImageCaptions112 svl_tii_recall_top5 0.4815
  SimpleImageCaptions112 svl_tii_recall_top10 0.6366
  SimpleImageCaptions112 svl_itt_recall_top1 0.2388
  SimpleImageCaptions112 svl_itt_recall_top5 0.6066
  SimpleImageCaptions112 svl_itt_recall_top10 0.7599
  SimpleImageCaptions112 svl_iti_recall_top1 0.1896
  SimpleImageCaptions112 svl_iti_recall_top5 0.5046
  SimpleIma

training 1 epoch: 100%|██████████| 2586/2586 [03:15<00:00, 14.62it/s]


Epoch 16 Elapsed time 234.5081
    total training loss 2.4525
    extract 2.3718
    joint_embedding 0.3752
    compose2 0.9771
    compose1 0.298


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2682
  SimpleImageCaptions112 text2image_recall_top1 0.3417
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3407
  SimpleImageCaptions112 svl_ttt_recall_top5 0.6262
  SimpleImageCaptions112 svl_ttt_recall_top10 0.8386
  SimpleImageCaptions112 svl_tti_recall_top1 0.2473
  SimpleImageCaptions112 svl_tti_recall_top5 0.6103
  SimpleImageCaptions112 svl_tti_recall_top10 0.7681
  SimpleImageCaptions112 svl_tit_recall_top1 0.2393
  SimpleImageCaptions112 svl_tit_recall_top5 0.5743
  SimpleImageCaptions112 svl_tit_recall_top10 0.7401
  SimpleImageCaptions112 svl_tii_recall_top1 0.1769
  SimpleImageCaptions112 svl_tii_recall_top5 0.4906
  SimpleImageCaptions112 svl_tii_recall_top10 0.6421
  SimpleImageCaptions112 svl_itt_recall_top1 0.2708
  SimpleImageCaptions112 svl_itt_recall_top5 0.6309
  SimpleImageCaptions112 svl_itt_recall_top10 0.7756
  SimpleImageCaptions112 svl_iti_recall_top1 0.2047
  SimpleImageCaptions112 svl_iti_recall_top5 0.5443
  SimpleIma

training 1 epoch: 100%|██████████| 2586/2586 [03:52<00:00, 11.11it/s]


Epoch 17 Elapsed time 281.0886
    total training loss 2.4128
    extract 2.3737
    joint_embedding 0.3774
    compose2 0.9867
    compose1 0.2697


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2743
  SimpleImageCaptions112 text2image_recall_top1 0.3241
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2333
  SimpleImageCaptions112 svl_ttt_recall_top5 0.6998
  SimpleImageCaptions112 svl_ttt_recall_top10 0.7974
  SimpleImageCaptions112 svl_tti_recall_top1 0.2022
  SimpleImageCaptions112 svl_tti_recall_top5 0.5371
  SimpleImageCaptions112 svl_tti_recall_top10 0.698
  SimpleImageCaptions112 svl_tit_recall_top1 0.2275
  SimpleImageCaptions112 svl_tit_recall_top5 0.5421
  SimpleImageCaptions112 svl_tit_recall_top10 0.6954
  SimpleImageCaptions112 svl_tii_recall_top1 0.1767
  SimpleImageCaptions112 svl_tii_recall_top5 0.4566
  SimpleImageCaptions112 svl_tii_recall_top10 0.6081
  SimpleImageCaptions112 svl_itt_recall_top1 0.2546
  SimpleImageCaptions112 svl_itt_recall_top5 0.6066
  SimpleImageCaptions112 svl_itt_recall_top10 0.7557
  SimpleImageCaptions112 svl_iti_recall_top1 0.2106
  SimpleImageCaptions112 svl_iti_recall_top5 0.5375
  SimpleImag

training 1 epoch: 100%|██████████| 2586/2586 [02:55<00:00, 14.72it/s]


Epoch 18 Elapsed time 211.3846
    total training loss 2.4167
    extract 2.3539
    joint_embedding 0.3611
    compose2 0.9492
    compose1 0.2573


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2829
  SimpleImageCaptions112 text2image_recall_top1 0.3059
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3419
  SimpleImageCaptions112 svl_ttt_recall_top5 0.7465
  SimpleImageCaptions112 svl_ttt_recall_top10 0.9036
  SimpleImageCaptions112 svl_tti_recall_top1 0.2366
  SimpleImageCaptions112 svl_tti_recall_top5 0.6182
  SimpleImageCaptions112 svl_tti_recall_top10 0.7627
  SimpleImageCaptions112 svl_tit_recall_top1 0.2502
  SimpleImageCaptions112 svl_tit_recall_top5 0.6276
  SimpleImageCaptions112 svl_tit_recall_top10 0.7804
  SimpleImageCaptions112 svl_tii_recall_top1 0.1926
  SimpleImageCaptions112 svl_tii_recall_top5 0.4928
  SimpleImageCaptions112 svl_tii_recall_top10 0.65
  SimpleImageCaptions112 svl_itt_recall_top1 0.2236
  SimpleImageCaptions112 svl_itt_recall_top5 0.6044
  SimpleImageCaptions112 svl_itt_recall_top10 0.7546
  SimpleImageCaptions112 svl_iti_recall_top1 0.198
  SimpleImageCaptions112 svl_iti_recall_top5 0.5213
  SimpleImageC

training 1 epoch:  45%|████▌     | 1168/2586 [01:19<01:39, 14.27it/s]

KeyboardInterrupt: 