In [1]:
import sys
import json
sys.path.append('./../')
sys.path.append('./../tirg/')
from main import *

In [2]:
# Obtain the project's option object, then override the settings that depend
# on this machine: the batch size and the dataset locations (relative paths).
opt = parse_opt() 
opt.batch_size = 32
opt.coco_path = '../../../../datasets/coco'
opt.sic112_path = '../../../../datasets/SIC112/'

# Scalar logger; the training loop below calls logger.add_scalar on it.
logger = SummaryWriter(comment = opt.comment)

# load_datasets returns three datasets; the middle one is not used here.
trainset, _, sic112 = load_datasets(opt)

17919 745
sucessfully loaded features
sucessfully loaded features
sucessfully loaded features


In [3]:
# Add subject, verb and location annotations to SIC112.
# Only the first caption of each image is parsed: its first token is the
# subject, a second token ending in "ing" (if any) is the verb, and the
# remaining tokens form the location phrase.
for img in sic112.imgs:
    tokens = img['captions'][0].split()  # tokenise once instead of per field
    img['subjects'] = [tokens[0]]
    # The length guard makes a one-word caption yield no verb and an empty
    # location instead of raising IndexError on tokens[1].
    if len(tokens) > 1 and tokens[1].endswith("ing"):
        img['verbs'] = [tokens[1]]
        img['locations'] = [' '.join(tokens[2:])]
    else:
        img['verbs'] = []
        img['locations'] = [' '.join(tokens[1:])]

In [4]:
# Add subject, verb and location annotations to COCO train2014.
# Requires 'coco_splitted_captions_train2014.json' in the working directory
# (per the original note: run preprocess_coco first to produce it).
id2img = {}
for img in trainset.imgs:
    id2img[img['id']] = img
    img['subjects'] = []
    img['verbs'] = []
    img['locations'] = []

# Read the file inside a context manager so the handle is closed right away
# instead of being left open for the lifetime of the kernel.
with open('coco_splitted_captions_train2014.json', 'rt') as annotation_file:
    split_captions = json.load(annotation_file)['annotations']

# Every caption contributes up to one phrase per category to its image; a
# phrase stored as None is skipped.
for caption in tqdm(split_captions):
    img = id2img[caption['image_id']]
    for phrase_key, target_key in (('subject_phrase', 'subjects'),
                                   ('verb_phrase', 'verbs'),
                                   ('location_phrase', 'locations')):
        if caption[phrase_key] is not None:
            img[target_key].append(caption[phrase_key])
        
# update trainset.__getitem__
#trainset.old_get = trainset.__getitem__
#def new_get(self, idx):
#    item = self.old_get(idx)
#type(trainset).__getitem__ = type(type(trainset).__getitem__)(new_get, trainset, type(trainset))

100%|██████████| 414113/414113 [00:01<00:00, 250927.82it/s]


In [5]:
class One2OneTransformation(torch.nn.Module):
    """MLP that maps one embedding to a new embedding of the same width.

    The input first goes through ``torch_functions.NormalizationLayer``
    (built with ``learn_scale=False``) and then through
    Linear(d, 2d) -> ReLU -> Linear(2d, 2d) -> BatchNorm1d(2d) -> ReLU ->
    Linear(2d, d), where d is the embedding dimension.

    Args:
        embed_dim: embedding dimension d. When omitted, the notebook-level
            ``opt.embed_dim`` is used, which is what the original
            zero-argument constructor always did.
    """

    def __init__(self, embed_dim=None):
        super(One2OneTransformation, self).__init__()
        if embed_dim is None:
            # Backward-compatible default: read the notebook's global options.
            embed_dim = opt.embed_dim
        hidden_dim = embed_dim * 2
        # Attribute names (m, norm) are kept as-is because they determine the
        # state_dict keys of this module.
        self.m = torch.nn.Sequential(
            torch.nn.Linear(embed_dim, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.BatchNorm1d(hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, embed_dim)
        )
        self.norm = torch_functions.NormalizationLayer(learn_scale=False)

    def forward(self, x):
        """Normalize ``x`` and return the MLP output for it."""
        return self.m(self.norm(x))
    
class Three2OneTransformation(torch.nn.Module):
    """MLP that fuses three embeddings into a single embedding.

    Each input is normalized separately with
    ``torch_functions.NormalizationLayer`` (built with ``learn_scale=False``),
    the results are concatenated along dim 1 and passed through
    Linear(3d, 5d) -> ReLU -> Linear(5d, 5d) -> BatchNorm1d(5d) -> ReLU ->
    Linear(5d, d), where d is the embedding dimension.

    Args:
        embed_dim: embedding dimension d. When omitted, the notebook-level
            ``opt.embed_dim`` is used, which is what the original
            zero-argument constructor always did.
    """

    def __init__(self, embed_dim=None):
        super(Three2OneTransformation, self).__init__()
        if embed_dim is None:
            # Backward-compatible default: read the notebook's global options.
            embed_dim = opt.embed_dim
        hidden_dim = embed_dim * 5
        # Attribute names (m, norm) are kept as-is because they determine the
        # state_dict keys of this module.
        self.m = torch.nn.Sequential(
            torch.nn.Linear(embed_dim * 3, hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, hidden_dim),
            torch.nn.BatchNorm1d(hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim, embed_dim)
        )
        self.norm = torch_functions.NormalizationLayer(learn_scale=False)

    def forward(self, x):
        """Fuse the tensors in ``x`` into one embedding per row.

        ``x`` is an iterable of 2-D tensors; the first Linear layer expects
        their concatenated width to be 3 * embed_dim.
        """
        normalized_parts = [self.norm(part) for part in x]
        return self.m(torch.cat(normalized_parts, dim=1))

# Build the base model, then attach the extra heads this notebook trains:
# one extractor per caption part (subject, verb, location) and one module
# that fuses three part embeddings into a single embedding.
model = create_model(opt, trainset)
model.subject_extractor = One2OneTransformation()
model.verb_extractor = One2OneTransformation()
model.location_extractor = One2OneTransformation()
model.svl_combine = Three2OneTransformation() 
# The heads are attached before .cuda() and create_optimizer(); presumably so
# that they are moved to the GPU and optimized together with the base model
# -- TODO confirm against create_optimizer.
model = model.cuda()
optimizer = create_optimizer(opt, model)

In [6]:
def test_svl(model, testset, opt):
    model = model.eval()

    # all img features
    img_features = []
    for data in testset.get_loader(batch_size = opt.batch_size, shuffle = False, drop_last= False):
        # extract image features
        imgs = np.stack([d['image'] for d in data])
        imgs = torch.from_numpy(imgs).float()
        if len(imgs.shape) == 2:
            imgs = model.img_encoder.fc(imgs.cuda())
        else:
            imgs = model.img_encoder(imgs.cuda())
        imgs = model.snorm(imgs).cpu().detach().numpy()
        img_features += [imgs]

    img_features = np.concatenate(img_features, axis=0)
    img_labels = [img['captions'][0] for img in testset.imgs]
    
    # construct random queries
    queries = []
    np.random.seed(123)
    for _ in range(5):
      for img in testset.imgs:
        if len(img['verbs']) == 0:
            continue
        while True:
            i = np.random.randint(0, len(testset.imgs))
            if img['subjects'][0] == testset.imgs[i]['subjects'][0] and img is not testset.imgs[i]:
                break
        while True:
            j = np.random.randint(0, len(testset.imgs))
            if len(testset.imgs[j]['verbs']) == 0:
                continue
            if img['verbs'][0] == testset.imgs[j]['verbs'][0] and img is not testset.imgs[j]:
                break
        while True:
            k = np.random.randint(0, len(testset.imgs))
            if img['locations'][0] == testset.imgs[k]['locations'][0] and img is not testset.imgs[k]:
                break
            
        
        queries += [{
            'subject_img_id': i,
            'verb_img_id': j,
            'location_img_id': k,
            'subject': img['subjects'][0],
            'verb': testset.imgs[j]['verbs'][0],
            'location': img['locations'][0],
            'label': img['captions'][0]
        }]
        
    #----
    #----
    r = []
    query_setting_combinations = []
    for s in ['t', 'i']:
        for v in ['t', 'i']:
            for l in ['t', 'i']:
                query_setting_combinations += [(s, v, l)]
    for s, v, l in query_setting_combinations:
        # compute query features
        query_features = []
        query_labels = []
        for i in range(0, len(queries), opt.batch_size):
            if s == 'i':
                subjects = model.subject_extractor(torch.from_numpy(
                    img_features[[q['subject_img_id'] for q in queries[i:(i+opt.batch_size)]],:]
                ).cuda())
            else:
                subjects = model.text_encoder([q['subject'] for q in queries[i:(i+opt.batch_size)]])
            if v == 'i':
                verbs = model.verb_extractor(torch.from_numpy(
                    img_features[[q['verb_img_id'] for q in queries[i:(i+opt.batch_size)]],:]
                ).cuda())
            else:
                verbs = model.text_encoder([q['verb'] for q in queries[i:(i+opt.batch_size)]])
            if l == 'i':
                locations = model.location_extractor(torch.from_numpy(
                    img_features[[q['location_img_id'] for q in queries[i:(i+opt.batch_size)]],:]
                ).cuda())
            else:
                locations = model.text_encoder([q['location'] for q in queries[i:(i+opt.batch_size)]])
            svl = model.svl_combine([subjects, verbs, locations])
            svl = svl.cpu().detach().numpy()
            query_features += [svl]
            query_labels += [q['label'] for q in queries[i:(i+opt.batch_size)]]

        query_features = np.concatenate(query_features, axis=0)

        # compute recall
        def measure_retrieval_performance(query_features, name = 'X'):
            sims = query_features.dot(img_features.T)
            sims = sims
            for k in [1, 5, 10]:
                r1 = 0.0
                r1_novel = 0.0
                count_novel = 0.0
                r1_nonnovel = 0.0
                count_nonnovel = 0.0
                for i in range(sims.shape[0]):
                    novel_query = False
                    if queries[i]['label'].split()[0] in ['trex', 'stormtrooper', 'darthvader', 'chewbacca']:
                        novel_query = True
                    if novel_query:
                        count_novel += 1
                    else:
                        count_nonnovel += 1
                        
                    s = -sims[i,:]
                    s = np.argsort(s)
                    if query_labels[i] in [img_labels[s[j]] for j in range(k)]:
                        r1 += 1
                        if novel_query:
                            r1_novel += 1
                        else:
                            r1_nonnovel += 1
                        
                r1 /= sims.shape[0]
                r.append(('svl_' + name + '_recall_top' + str(k), r1))
            return r
        measure_retrieval_performance(query_features, name = s + v + l)
    return r

def test(model, testset, opt):
    """Run every evaluation that applies to ``testset``.

    Text-to-image retrieval is always measured, with 1100 queries for
    datasets of 10000 or more items and one query per item otherwise. The SVL
    composition metrics are appended when the dataset name contains '112'.

    Returns:
        List of ``(metric_name, metric_value)`` tuples.
    """
    num_queries = len(testset) if len(testset) < 10000 else 1100
    results = test_text_to_image_retrieval(model, testset, opt, num_queries)
    if '112' in testset.name():
        results += test_svl(model, testset, opt)
    return results

In [7]:
def compute_losses(model, data, losses_tracking, add_extract_compose_losses = True):
    """Compute the weighted training loss of one batch and record every term.

    The joint image/text embedding loss is always computed. When
    ``add_extract_compose_losses`` is true, three auxiliary terms are added:
    'extract' (part extractors vs. encoded part phrases), 'compose1'
    (``svl_combine`` on encoded parts) and 'compose2' (``svl_combine`` on
    extracted parts that were shuffled independently across the batch).

    Args:
        model: network exposing ``img_encoder``, ``text_encoder``,
            ``pair_loss`` and, for the auxiliary terms, the three extractors
            and ``svl_combine``.
        data: list of sample dicts with 'image' and 'captions'; the auxiliary
            terms also read 'index', used to look up the annotations in the
            notebook-level ``trainset.imgs``.
        losses_tracking: dict mapping loss name to a list of floats; one value
            per computed term is appended in place.
        add_extract_compose_losses: whether to add the auxiliary terms.

    Returns:
        Scalar tensor: the weighted sum of all computed terms.
    """
    losses = []

    # joint embedding loss
    imgs = np.stack([d['image'] for d in data])
    imgs = torch.from_numpy(imgs).float()
    if len(imgs.shape) == 2:
        imgs = model.img_encoder.fc(imgs.cuda())
    else:
        imgs = model.img_encoder(imgs.cuda())
    texts = [random.choice(d['captions']) for d in data]
    texts = model.text_encoder(texts)
    loss_name = 'joint_embedding'
    loss_weight = 1
    loss_value = model.pair_loss(texts, imgs).cuda()
    losses.append((loss_name, loss_weight, loss_value))

    def do_add_extract_compose_losses():
        try:
            subjects = [random.choice(trainset.imgs[d['index']]['subjects']) for d in data]
            verbs = [random.choice(trainset.imgs[d['index']]['verbs']) for d in data]
            locations = [random.choice(trainset.imgs[d['index']]['locations']) for d in data]
        except (IndexError, KeyError):
            # random.choice raises IndexError on an empty list, i.e. some image
            # of the batch lacks a phrase; the auxiliary terms are then skipped
            # for this batch. Only data-lookup errors are caught, so that
            # interrupts and real failures are no longer swallowed.
            return
        encoded_subjects = model.text_encoder(subjects).detach()
        encoded_verbs = model.text_encoder(verbs).detach()
        encoded_locations = model.text_encoder(locations).detach()
        # Each extractor is fed either the caption or the image embedding,
        # picked at random, detached so only the extractor gets gradients.
        extracted_subjects = model.subject_extractor(random.choice([texts, imgs]).detach())
        extracted_verbs = model.verb_extractor(random.choice([texts, imgs]).detach())
        extracted_location = model.location_extractor(random.choice([texts, imgs]).detach())

        # extract
        loss_value = model.pair_loss(
            torch.cat([extracted_subjects, extracted_verbs, extracted_location]),
            torch.cat([encoded_subjects, encoded_verbs, encoded_locations])
        ).cuda()
        loss_name = 'extract'
        loss_weight = 1
        losses.append((loss_name, loss_weight, loss_value))

        # compose with encoded
        loss_value = model.pair_loss(
            model.svl_combine([encoded_subjects, encoded_verbs, encoded_locations]),
            random.choice([imgs, model.text_encoder([s + ' ' + v + ' ' + l for s, v, l in zip(subjects, verbs, locations)])]).detach()
        ).cuda()
        loss_name = 'compose1'
        loss_weight = 0.5
        losses.append((loss_name, loss_weight, loss_value))

        # shuffle each part independently; random.shuffle needs a real list
        # (a bare range object cannot be shuffled on Python 3)
        shuffled_subjects_indices = list(range(len(data)))
        shuffled_verbs_indices = list(range(len(data)))
        shuffled_locations_indices = list(range(len(data)))
        random.shuffle(shuffled_subjects_indices)
        random.shuffle(shuffled_verbs_indices)
        random.shuffle(shuffled_locations_indices)
        encoded_subjects = encoded_subjects[shuffled_subjects_indices,:]
        encoded_verbs = encoded_verbs[shuffled_verbs_indices,:]
        encoded_locations = encoded_locations[shuffled_locations_indices,:]
        extracted_subjects = extracted_subjects[shuffled_subjects_indices,:]
        extracted_verbs = extracted_verbs[shuffled_verbs_indices,:]
        extracted_location = extracted_location[shuffled_locations_indices,:]
        subjects = np.array(subjects)[shuffled_subjects_indices]
        verbs = np.array(verbs)[shuffled_verbs_indices]
        locations = np.array(locations)[shuffled_locations_indices]

        # compose with extracted
        loss_value = model.pair_loss(
            model.svl_combine([extracted_subjects, extracted_verbs, extracted_location]),
            model.text_encoder([s + ' ' + v + ' ' + l for s, v, l in zip(subjects, verbs, locations)]).detach()
        ).cuda()
        loss_name = 'compose2'
        loss_weight = 0.5
        losses.append((loss_name, loss_weight, loss_value))
    if add_extract_compose_losses:
        do_add_extract_compose_losses()

    # total loss
    total_loss = sum([loss_weight * loss_value for loss_name, loss_weight, loss_value in losses])
    assert not torch.isnan(total_loss), 'total training loss is NaN'
    losses.append(('total training loss', None, total_loss))

    # save losses (setdefault replaces the Python-2-only dict.has_key)
    for loss_name, loss_weight, loss_value in losses:
        losses_tracking.setdefault(loss_name, []).append(float(loss_value.item()))
    return total_loss

def train_1_epoch(model, optimizer, trainset, opt, losses_tracking, add_extract_compose_losses = True):
    """Train ``model`` for one pass over ``trainset``.

    Batches are drawn shuffled, with the last incomplete batch dropped. For
    each batch the loss from ``compute_losses`` is back-propagated and one
    optimizer step is taken; the individual loss terms are appended to
    ``losses_tracking`` by ``compute_losses``.
    """
    model.train()
    batches = trainset.get_loader(
        batch_size=opt.batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=opt.loader_num_workers)
    progress = tqdm(batches, desc = 'training 1 epoch')
    for batch in progress:
        batch_loss = compute_losses(model, batch, losses_tracking, add_extract_compose_losses)
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

In [8]:
# Training loop. Each pass reports the running losses, evaluates on both
# datasets and then trains for one epoch, so evaluation also runs once before
# any training and once more after the last epoch, just before the break.
losses_tracking = {}
epoch = 0
tic = time.time()
while True:

    # show stats: time since the previous report, then the training losses
    print 'Epoch', epoch, 'Elapsed time', round(time.time() - tic, 4)
    tic = time.time()
    for loss_name in losses_tracking:
        # mean of the 250 most recently recorded values of this loss
        avg_loss = np.mean(losses_tracking[loss_name][-250:])
        print '   ', loss_name, round(avg_loss, 4)
        logger.add_scalar(loss_name, avg_loss, epoch)

    # test: metric names are prefixed with the dataset name
    tests = []
    for dataset in [trainset, sic112]:
        t = test(model, dataset, opt)
        tests += [(dataset.name() + ' ' + metric_name, metric_value) for metric_name, metric_value in t]
    for metric_name, metric_value in tests:
        print ' ', metric_name, round(metric_value, 4)
        logger.add_scalar(metric_name, metric_value, epoch)

    # train: the extract/compose losses are only enabled from the second
    # epoch on (epoch >= 1); the first epoch uses the joint embedding loss alone
    if epoch >= opt.num_epochs:
        break
    train_1_epoch(model, optimizer, trainset, opt, losses_tracking,
                  add_extract_compose_losses = epoch>=1)
    epoch += 1

    # learning rate scheduling: multiply the lr of every parameter group by
    # 0.1 each time opt.learning_rate_decay_frequency epochs have completed
    if epoch % opt.learning_rate_decay_frequency == 0:
        for g in optimizer.param_groups:
            g['lr'] *= 0.1

Epoch 0 Elapsed time 0.0009


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.0011
  SimpleImageCaptions112 text2image_recall_top1 0.0227
  SimpleImageCaptions112 svl_ttt_recall_top1 0.0107
  SimpleImageCaptions112 svl_tti_recall_top1 0.0054
  SimpleImageCaptions112 svl_tit_recall_top1 0.0072
  SimpleImageCaptions112 svl_tii_recall_top1 0.0063
  SimpleImageCaptions112 svl_itt_recall_top1 0.0065
  SimpleImageCaptions112 svl_iti_recall_top1 0.006
  SimpleImageCaptions112 svl_iit_recall_top1 0.0056
  SimpleImageCaptions112 svl_iii_recall_top1 0.0053


training 1 epoch: 100%|██████████| 2586/2586 [01:16<00:00, 33.92it/s]


Epoch 1 Elapsed time 108.7479
    total training loss 0.7641
    joint_embedding 0.7641


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.1219
  SimpleImageCaptions112 text2image_recall_top1 0.1844
  SimpleImageCaptions112 svl_ttt_recall_top1 0.0014
  SimpleImageCaptions112 svl_tti_recall_top1 0.0011
  SimpleImageCaptions112 svl_tit_recall_top1 0.0
  SimpleImageCaptions112 svl_tii_recall_top1 0.0005
  SimpleImageCaptions112 svl_itt_recall_top1 0.0011
  SimpleImageCaptions112 svl_iti_recall_top1 0.0027
  SimpleImageCaptions112 svl_iit_recall_top1 0.0006
  SimpleImageCaptions112 svl_iii_recall_top1 0.0006


training 1 epoch: 100%|██████████| 2586/2586 [02:53<00:00, 14.93it/s]


Epoch 2 Elapsed time 206.0967
    total training loss 3.2498
    extract 2.775
    joint_embedding 0.6279
    compose2 1.4563
    compose1 0.4688


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.1565
  SimpleImageCaptions112 text2image_recall_top1 0.1305
  SimpleImageCaptions112 svl_ttt_recall_top1 0.1686
  SimpleImageCaptions112 svl_tti_recall_top1 0.1353
  SimpleImageCaptions112 svl_tit_recall_top1 0.1343
  SimpleImageCaptions112 svl_tii_recall_top1 0.1198
  SimpleImageCaptions112 svl_itt_recall_top1 0.1371
  SimpleImageCaptions112 svl_iti_recall_top1 0.1197
  SimpleImageCaptions112 svl_iit_recall_top1 0.1285
  SimpleImageCaptions112 svl_iii_recall_top1 0.1137


training 1 epoch: 100%|██████████| 2586/2586 [02:53<00:00, 14.87it/s]


Epoch 3 Elapsed time 206.2718
    total training loss 3.0355
    extract 2.6744
    joint_embedding 0.5563
    compose2 1.3228
    compose1 0.4397


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.1785
  SimpleImageCaptions112 text2image_recall_top1 0.2452
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2166
  SimpleImageCaptions112 svl_tti_recall_top1 0.147
  SimpleImageCaptions112 svl_tit_recall_top1 0.1961
  SimpleImageCaptions112 svl_tii_recall_top1 0.1368
  SimpleImageCaptions112 svl_itt_recall_top1 0.1952
  SimpleImageCaptions112 svl_iti_recall_top1 0.1319
  SimpleImageCaptions112 svl_iit_recall_top1 0.1516
  SimpleImageCaptions112 svl_iii_recall_top1 0.1145


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.88it/s]


Epoch 4 Elapsed time 207.8454
    total training loss 3.0256
    extract 2.5992
    joint_embedding 0.5099
    compose2 1.2393
    compose1 0.4318


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.1922
  SimpleImageCaptions112 text2image_recall_top1 0.1691
  SimpleImageCaptions112 svl_ttt_recall_top1 0.1508
  SimpleImageCaptions112 svl_tti_recall_top1 0.1526
  SimpleImageCaptions112 svl_tit_recall_top1 0.184
  SimpleImageCaptions112 svl_tii_recall_top1 0.1562
  SimpleImageCaptions112 svl_itt_recall_top1 0.1957
  SimpleImageCaptions112 svl_iti_recall_top1 0.1471
  SimpleImageCaptions112 svl_iit_recall_top1 0.1538
  SimpleImageCaptions112 svl_iii_recall_top1 0.1012


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 13.23it/s]


Epoch 5 Elapsed time 207.3423
    total training loss 2.9012
    extract 2.5637
    joint_embedding 0.4707
    compose2 1.2035
    compose1 0.3899


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.214
  SimpleImageCaptions112 text2image_recall_top1 0.2361
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2242
  SimpleImageCaptions112 svl_tti_recall_top1 0.2345
  SimpleImageCaptions112 svl_tit_recall_top1 0.1967
  SimpleImageCaptions112 svl_tii_recall_top1 0.1789
  SimpleImageCaptions112 svl_itt_recall_top1 0.1622
  SimpleImageCaptions112 svl_iti_recall_top1 0.1665
  SimpleImageCaptions112 svl_iit_recall_top1 0.167
  SimpleImageCaptions112 svl_iii_recall_top1 0.1229


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 16.68it/s]


Epoch 6 Elapsed time 208.184
    total training loss 2.6914
    extract 2.5051
    joint_embedding 0.4568
    compose2 1.1329
    compose1 0.3531


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2185
  SimpleImageCaptions112 text2image_recall_top1 0.2316
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2654
  SimpleImageCaptions112 svl_tti_recall_top1 0.2125
  SimpleImageCaptions112 svl_tit_recall_top1 0.1994
  SimpleImageCaptions112 svl_tii_recall_top1 0.1694
  SimpleImageCaptions112 svl_itt_recall_top1 0.2156
  SimpleImageCaptions112 svl_iti_recall_top1 0.1801
  SimpleImageCaptions112 svl_iit_recall_top1 0.1549
  SimpleImageCaptions112 svl_iii_recall_top1 0.1191


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.81it/s]


Epoch 7 Elapsed time 207.734
    total training loss 2.7476
    extract 2.5028
    joint_embedding 0.4452
    compose2 1.1216
    compose1 0.3252


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2265
  SimpleImageCaptions112 text2image_recall_top1 0.2599
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2103
  SimpleImageCaptions112 svl_tti_recall_top1 0.2048
  SimpleImageCaptions112 svl_tit_recall_top1 0.1871
  SimpleImageCaptions112 svl_tii_recall_top1 0.149
  SimpleImageCaptions112 svl_itt_recall_top1 0.2009
  SimpleImageCaptions112 svl_iti_recall_top1 0.1686
  SimpleImageCaptions112 svl_iit_recall_top1 0.1558
  SimpleImageCaptions112 svl_iii_recall_top1 0.1225


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 15.37it/s]


Epoch 8 Elapsed time 207.6855
    total training loss 2.8254
    extract 2.4708
    joint_embedding 0.426
    compose2 1.0893
    compose1 0.3241


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2399
  SimpleImageCaptions112 text2image_recall_top1 0.2594
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2048
  SimpleImageCaptions112 svl_tti_recall_top1 0.2005
  SimpleImageCaptions112 svl_tit_recall_top1 0.205
  SimpleImageCaptions112 svl_tii_recall_top1 0.1714
  SimpleImageCaptions112 svl_itt_recall_top1 0.2009
  SimpleImageCaptions112 svl_iti_recall_top1 0.1747
  SimpleImageCaptions112 svl_iit_recall_top1 0.174
  SimpleImageCaptions112 svl_iii_recall_top1 0.1435


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.33it/s]


Epoch 9 Elapsed time 207.3661
    total training loss 2.4369
    extract 2.4388
    joint_embedding 0.4047
    compose2 1.0423
    compose1 0.3112


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2442
  SimpleImageCaptions112 text2image_recall_top1 0.2594
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2882
  SimpleImageCaptions112 svl_tti_recall_top1 0.2281
  SimpleImageCaptions112 svl_tit_recall_top1 0.2066
  SimpleImageCaptions112 svl_tii_recall_top1 0.1678
  SimpleImageCaptions112 svl_itt_recall_top1 0.2253
  SimpleImageCaptions112 svl_iti_recall_top1 0.1808
  SimpleImageCaptions112 svl_iit_recall_top1 0.161
  SimpleImageCaptions112 svl_iii_recall_top1 0.1285


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.83it/s]


Epoch 10 Elapsed time 207.504
    total training loss 2.6795
    extract 2.4444
    joint_embedding 0.422
    compose2 1.0637
    compose1 0.35


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2429
  SimpleImageCaptions112 text2image_recall_top1 0.269
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2889
  SimpleImageCaptions112 svl_tti_recall_top1 0.2211
  SimpleImageCaptions112 svl_tit_recall_top1 0.216
  SimpleImageCaptions112 svl_tii_recall_top1 0.1713
  SimpleImageCaptions112 svl_itt_recall_top1 0.2343
  SimpleImageCaptions112 svl_iti_recall_top1 0.1877
  SimpleImageCaptions112 svl_iit_recall_top1 0.1746
  SimpleImageCaptions112 svl_iii_recall_top1 0.1327


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.78it/s]


Epoch 11 Elapsed time 207.9085
    total training loss 2.7965
    extract 2.439
    joint_embedding 0.4056
    compose2 1.0666
    compose1 0.3587


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2659
  SimpleImageCaptions112 text2image_recall_top1 0.2645
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2467
  SimpleImageCaptions112 svl_tti_recall_top1 0.2171
  SimpleImageCaptions112 svl_tit_recall_top1 0.2065
  SimpleImageCaptions112 svl_tii_recall_top1 0.1764
  SimpleImageCaptions112 svl_itt_recall_top1 0.2237
  SimpleImageCaptions112 svl_iti_recall_top1 0.1813
  SimpleImageCaptions112 svl_iit_recall_top1 0.1694
  SimpleImageCaptions112 svl_iii_recall_top1 0.1352


training 1 epoch: 100%|██████████| 2586/2586 [02:55<00:00, 14.77it/s]


Epoch 12 Elapsed time 208.2331
    total training loss 2.5605
    extract 2.3914
    joint_embedding 0.4017
    compose2 0.9793
    compose1 0.2956


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2583
  SimpleImageCaptions112 text2image_recall_top1 0.2418
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2805
  SimpleImageCaptions112 svl_tti_recall_top1 0.2346
  SimpleImageCaptions112 svl_tit_recall_top1 0.2074
  SimpleImageCaptions112 svl_tii_recall_top1 0.1745
  SimpleImageCaptions112 svl_itt_recall_top1 0.2581
  SimpleImageCaptions112 svl_iti_recall_top1 0.1946
  SimpleImageCaptions112 svl_iit_recall_top1 0.1666
  SimpleImageCaptions112 svl_iii_recall_top1 0.1384


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.57it/s]


Epoch 13 Elapsed time 207.6126
    total training loss 2.4485
    extract 2.4047
    joint_embedding 0.3831
    compose2 1.0061
    compose1 0.3269


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2588
  SimpleImageCaptions112 text2image_recall_top1 0.2656
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2792
  SimpleImageCaptions112 svl_tti_recall_top1 0.2392
  SimpleImageCaptions112 svl_tit_recall_top1 0.2171
  SimpleImageCaptions112 svl_tii_recall_top1 0.1683
  SimpleImageCaptions112 svl_itt_recall_top1 0.2323
  SimpleImageCaptions112 svl_iti_recall_top1 0.1999
  SimpleImageCaptions112 svl_iit_recall_top1 0.1805
  SimpleImageCaptions112 svl_iii_recall_top1 0.1364


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 15.27it/s]


Epoch 14 Elapsed time 208.462
    total training loss 2.4979
    extract 2.3823
    joint_embedding 0.3733
    compose2 0.9791
    compose1 0.3136


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2609
  SimpleImageCaptions112 text2image_recall_top1 0.3275
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3001
  SimpleImageCaptions112 svl_tti_recall_top1 0.2591
  SimpleImageCaptions112 svl_tit_recall_top1 0.244
  SimpleImageCaptions112 svl_tii_recall_top1 0.1833
  SimpleImageCaptions112 svl_itt_recall_top1 0.2398
  SimpleImageCaptions112 svl_iti_recall_top1 0.2002
  SimpleImageCaptions112 svl_iit_recall_top1 0.1796
  SimpleImageCaptions112 svl_iii_recall_top1 0.1418


training 1 epoch: 100%|██████████| 2586/2586 [02:55<00:00, 14.44it/s]


Epoch 15 Elapsed time 207.9579
    total training loss 2.4561
    extract 2.3449
    joint_embedding 0.3821
    compose2 0.9635
    compose1 0.2948


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2738
  SimpleImageCaptions112 text2image_recall_top1 0.3297
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2465
  SimpleImageCaptions112 svl_tti_recall_top1 0.2215
  SimpleImageCaptions112 svl_tit_recall_top1 0.2153
  SimpleImageCaptions112 svl_tii_recall_top1 0.1696
  SimpleImageCaptions112 svl_itt_recall_top1 0.2234
  SimpleImageCaptions112 svl_iti_recall_top1 0.1867
  SimpleImageCaptions112 svl_iit_recall_top1 0.1637
  SimpleImageCaptions112 svl_iii_recall_top1 0.1255


training 1 epoch: 100%|██████████| 2586/2586 [02:55<00:00, 14.19it/s]


Epoch 16 Elapsed time 208.2679
    total training loss 2.3487
    extract 2.3293
    joint_embedding 0.3731
    compose2 0.9599
    compose1 0.2476


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2625
  SimpleImageCaptions112 text2image_recall_top1 0.3246
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3328
  SimpleImageCaptions112 svl_tti_recall_top1 0.2301
  SimpleImageCaptions112 svl_tit_recall_top1 0.2471
  SimpleImageCaptions112 svl_tii_recall_top1 0.1875
  SimpleImageCaptions112 svl_itt_recall_top1 0.2277
  SimpleImageCaptions112 svl_iti_recall_top1 0.1837
  SimpleImageCaptions112 svl_iit_recall_top1 0.1736
  SimpleImageCaptions112 svl_iii_recall_top1 0.14


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 16.91it/s]


Epoch 18 Elapsed time 207.7347
    total training loss 2.4302
    extract 2.3416
    joint_embedding 0.3659
    compose2 0.9705
    compose1 0.2793


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2757
  SimpleImageCaptions112 text2image_recall_top1 0.3201
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3263
  SimpleImageCaptions112 svl_tti_recall_top1 0.2478
  SimpleImageCaptions112 svl_tit_recall_top1 0.2291
  SimpleImageCaptions112 svl_tii_recall_top1 0.1871
  SimpleImageCaptions112 svl_itt_recall_top1 0.225
  SimpleImageCaptions112 svl_iti_recall_top1 0.1905
  SimpleImageCaptions112 svl_iit_recall_top1 0.1618
  SimpleImageCaptions112 svl_iii_recall_top1 0.1355


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.81it/s]


Epoch 19 Elapsed time 207.5964
    total training loss 2.5365
    extract 2.3359
    joint_embedding 0.3673
    compose2 0.935
    compose1 0.2717


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.2724
  SimpleImageCaptions112 text2image_recall_top1 0.3365
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2985
  SimpleImageCaptions112 svl_tti_recall_top1 0.2473
  SimpleImageCaptions112 svl_tit_recall_top1 0.2322
  SimpleImageCaptions112 svl_tii_recall_top1 0.1806
  SimpleImageCaptions112 svl_itt_recall_top1 0.2285
  SimpleImageCaptions112 svl_iti_recall_top1 0.1979
  SimpleImageCaptions112 svl_iit_recall_top1 0.1619
  SimpleImageCaptions112 svl_iii_recall_top1 0.1406


training 1 epoch:  74%|███████▎  | 1902/2586 [02:09<00:48, 13.98it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

training 1 epoch: 100%|██████████| 2586/2586 [02:55<00:00, 13.52it/s]


Epoch 22 Elapsed time 208.4869
    total training loss 2.1445
    extract 2.244
    joint_embedding 0.2822
    compose2 0.802
    compose1 0.2452


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.322
  SimpleImageCaptions112 text2image_recall_top1 0.328
  SimpleImageCaptions112 svl_ttt_recall_top1 0.2977
  SimpleImageCaptions112 svl_tti_recall_top1 0.2635
  SimpleImageCaptions112 svl_tit_recall_top1 0.223
  SimpleImageCaptions112 svl_tii_recall_top1 0.1962
  SimpleImageCaptions112 svl_itt_recall_top1 0.2447
  SimpleImageCaptions112 svl_iti_recall_top1 0.2184
  SimpleImageCaptions112 svl_iit_recall_top1 0.2021
  SimpleImageCaptions112 svl_iii_recall_top1 0.1554


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.05it/s]


Epoch 23 Elapsed time 207.5677
    total training loss 2.1127
    extract 2.217
    joint_embedding 0.2899
    compose2 0.7953
    compose1 0.2239


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3207
  SimpleImageCaptions112 text2image_recall_top1 0.3309
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3389
  SimpleImageCaptions112 svl_tti_recall_top1 0.2832
  SimpleImageCaptions112 svl_tit_recall_top1 0.2362
  SimpleImageCaptions112 svl_tii_recall_top1 0.1973
  SimpleImageCaptions112 svl_itt_recall_top1 0.2408
  SimpleImageCaptions112 svl_iti_recall_top1 0.2306
  SimpleImageCaptions112 svl_iit_recall_top1 0.1966
  SimpleImageCaptions112 svl_iii_recall_top1 0.1557


training 1 epoch: 100%|██████████| 2586/2586 [02:55<00:00, 14.74it/s]


Epoch 24 Elapsed time 208.3516
    total training loss 2.0324
    extract 2.1968
    joint_embedding 0.2735
    compose2 0.7422
    compose1 0.2121


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3313
  SimpleImageCaptions112 text2image_recall_top1 0.3292
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3201
  SimpleImageCaptions112 svl_tti_recall_top1 0.2776
  SimpleImageCaptions112 svl_tit_recall_top1 0.2479
  SimpleImageCaptions112 svl_tii_recall_top1 0.1989
  SimpleImageCaptions112 svl_itt_recall_top1 0.251
  SimpleImageCaptions112 svl_iti_recall_top1 0.2183
  SimpleImageCaptions112 svl_iit_recall_top1 0.1972
  SimpleImageCaptions112 svl_iii_recall_top1 0.1524


training 1 epoch:  39%|███▉      | 1020/2586 [01:08<01:46, 14.69it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

training 1 epoch: 100%|██████████| 2586/2586 [02:55<00:00, 14.73it/s]


Epoch 27 Elapsed time 208.341
    total training loss 2.0006
    extract 2.1581
    joint_embedding 0.2547
    compose2 0.7237
    compose1 0.204


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3404
  SimpleImageCaptions112 text2image_recall_top1 0.3547
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3961
  SimpleImageCaptions112 svl_tti_recall_top1 0.3047
  SimpleImageCaptions112 svl_tit_recall_top1 0.2483
  SimpleImageCaptions112 svl_tii_recall_top1 0.208
  SimpleImageCaptions112 svl_itt_recall_top1 0.2681
  SimpleImageCaptions112 svl_iti_recall_top1 0.2296
  SimpleImageCaptions112 svl_iit_recall_top1 0.2045
  SimpleImageCaptions112 svl_iii_recall_top1 0.1591


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 17.09it/s]


Epoch 28 Elapsed time 207.5353
    total training loss 2.2235
    extract 2.175
    joint_embedding 0.2602
    compose2 0.7563
    compose1 0.2032


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.343
  SimpleImageCaptions112 text2image_recall_top1 0.3343
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3188
  SimpleImageCaptions112 svl_tti_recall_top1 0.281
  SimpleImageCaptions112 svl_tit_recall_top1 0.2459
  SimpleImageCaptions112 svl_tii_recall_top1 0.1996
  SimpleImageCaptions112 svl_itt_recall_top1 0.2583
  SimpleImageCaptions112 svl_iti_recall_top1 0.2183
  SimpleImageCaptions112 svl_iit_recall_top1 0.2011
  SimpleImageCaptions112 svl_iii_recall_top1 0.1558


training 1 epoch: 100%|██████████| 2586/2586 [02:55<00:00, 14.76it/s]


Epoch 29 Elapsed time 208.1862
    total training loss 2.0368
    extract 2.1871
    joint_embedding 0.2437
    compose2 0.7671
    compose1 0.2


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3437
  SimpleImageCaptions112 text2image_recall_top1 0.3933
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3558
  SimpleImageCaptions112 svl_tti_recall_top1 0.2901
  SimpleImageCaptions112 svl_tit_recall_top1 0.2473
  SimpleImageCaptions112 svl_tii_recall_top1 0.2021
  SimpleImageCaptions112 svl_itt_recall_top1 0.2519
  SimpleImageCaptions112 svl_iti_recall_top1 0.2254
  SimpleImageCaptions112 svl_iit_recall_top1 0.2039
  SimpleImageCaptions112 svl_iii_recall_top1 0.1568


training 1 epoch:   6%|▋         | 166/2586 [00:12<02:36, 15.46it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

training 1 epoch: 100%|██████████| 2586/2586 [02:55<00:00, 14.70it/s]


Epoch 32 Elapsed time 208.5339
    total training loss 2.0579
    extract 2.155
    joint_embedding 0.2651
    compose2 0.7322
    compose1 0.2135


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3536
  SimpleImageCaptions112 text2image_recall_top1 0.3201
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3441
  SimpleImageCaptions112 svl_tti_recall_top1 0.3206
  SimpleImageCaptions112 svl_tit_recall_top1 0.2571
  SimpleImageCaptions112 svl_tii_recall_top1 0.2156
  SimpleImageCaptions112 svl_itt_recall_top1 0.2624
  SimpleImageCaptions112 svl_iti_recall_top1 0.2219
  SimpleImageCaptions112 svl_iit_recall_top1 0.2037
  SimpleImageCaptions112 svl_iii_recall_top1 0.1612


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.78it/s]


Epoch 33 Elapsed time 207.8271
    total training loss 1.9647
    extract 2.1623
    joint_embedding 0.2319
    compose2 0.7491
    compose1 0.1847


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.357
  SimpleImageCaptions112 text2image_recall_top1 0.315
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3056
  SimpleImageCaptions112 svl_tti_recall_top1 0.2761
  SimpleImageCaptions112 svl_tit_recall_top1 0.2568
  SimpleImageCaptions112 svl_tii_recall_top1 0.218
  SimpleImageCaptions112 svl_itt_recall_top1 0.2761
  SimpleImageCaptions112 svl_iti_recall_top1 0.2193
  SimpleImageCaptions112 svl_iit_recall_top1 0.2118
  SimpleImageCaptions112 svl_iii_recall_top1 0.1664


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 15.77it/s]


Epoch 34 Elapsed time 207.6584
    total training loss 2.1855
    extract 2.1583
    joint_embedding 0.2502
    compose2 0.7202
    compose1 0.2115


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3589
  SimpleImageCaptions112 text2image_recall_top1 0.3712
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3068
  SimpleImageCaptions112 svl_tti_recall_top1 0.2796
  SimpleImageCaptions112 svl_tit_recall_top1 0.2444
  SimpleImageCaptions112 svl_tii_recall_top1 0.2033
  SimpleImageCaptions112 svl_itt_recall_top1 0.2708
  SimpleImageCaptions112 svl_iti_recall_top1 0.2104
  SimpleImageCaptions112 svl_iit_recall_top1 0.1988
  SimpleImageCaptions112 svl_iii_recall_top1 0.1572


training 1 epoch:  17%|█▋        | 432/2586 [00:30<02:32, 14.13it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

training 1 epoch: 100%|██████████| 2586/2586 [02:55<00:00, 14.74it/s]


Epoch 37 Elapsed time 208.9472
    total training loss 2.0658
    extract 2.1197
    joint_embedding 0.2355
    compose2 0.7165
    compose1 0.1923


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3602
  SimpleImageCaptions112 text2image_recall_top1 0.3638
  SimpleImageCaptions112 svl_ttt_recall_top1 0.315
  SimpleImageCaptions112 svl_tti_recall_top1 0.2668
  SimpleImageCaptions112 svl_tit_recall_top1 0.249
  SimpleImageCaptions112 svl_tii_recall_top1 0.2112
  SimpleImageCaptions112 svl_itt_recall_top1 0.2775
  SimpleImageCaptions112 svl_iti_recall_top1 0.2108
  SimpleImageCaptions112 svl_iit_recall_top1 0.2076
  SimpleImageCaptions112 svl_iii_recall_top1 0.1639


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.80it/s]


Epoch 38 Elapsed time 208.0926
    total training loss 2.0059
    extract 2.1406
    joint_embedding 0.2461
    compose2 0.7268
    compose1 0.2199


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3644
  SimpleImageCaptions112 text2image_recall_top1 0.3944
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3611
  SimpleImageCaptions112 svl_tti_recall_top1 0.2976
  SimpleImageCaptions112 svl_tit_recall_top1 0.263
  SimpleImageCaptions112 svl_tii_recall_top1 0.2201
  SimpleImageCaptions112 svl_itt_recall_top1 0.2665
  SimpleImageCaptions112 svl_iti_recall_top1 0.2172
  SimpleImageCaptions112 svl_iit_recall_top1 0.2018
  SimpleImageCaptions112 svl_iii_recall_top1 0.1567


training 1 epoch:  78%|███████▊  | 2012/2586 [02:16<00:38, 15.10it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

training 1 epoch: 100%|██████████| 2586/2586 [02:55<00:00, 14.73it/s]


Epoch 42 Elapsed time 208.4446
    total training loss 2.0243
    extract 2.1262
    joint_embedding 0.2299
    compose2 0.7044
    compose1 0.2036


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3682
  SimpleImageCaptions112 text2image_recall_top1 0.3734
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3301
  SimpleImageCaptions112 svl_tti_recall_top1 0.268
  SimpleImageCaptions112 svl_tit_recall_top1 0.2591
  SimpleImageCaptions112 svl_tii_recall_top1 0.2163
  SimpleImageCaptions112 svl_itt_recall_top1 0.2831
  SimpleImageCaptions112 svl_iti_recall_top1 0.2185
  SimpleImageCaptions112 svl_iit_recall_top1 0.2133
  SimpleImageCaptions112 svl_iii_recall_top1 0.1626


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 15.45it/s]


Epoch 43 Elapsed time 207.986
    total training loss 1.9928
    extract 2.1305
    joint_embedding 0.234
    compose2 0.7034
    compose1 0.1852


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3703
  SimpleImageCaptions112 text2image_recall_top1 0.3564
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3451
  SimpleImageCaptions112 svl_tti_recall_top1 0.275
  SimpleImageCaptions112 svl_tit_recall_top1 0.2533
  SimpleImageCaptions112 svl_tii_recall_top1 0.2138
  SimpleImageCaptions112 svl_itt_recall_top1 0.2793
  SimpleImageCaptions112 svl_iti_recall_top1 0.2179
  SimpleImageCaptions112 svl_iit_recall_top1 0.2036
  SimpleImageCaptions112 svl_iii_recall_top1 0.1622


training 1 epoch:  75%|███████▍  | 1927/2586 [02:11<00:40, 16.19it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 14.59it/s]


Epoch 47 Elapsed time 207.4148
    total training loss 2.0347
    extract 2.1444
    joint_embedding 0.2369
    compose2 0.7141
    compose1 0.1946


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3741
  SimpleImageCaptions112 text2image_recall_top1 0.3451
  SimpleImageCaptions112 svl_ttt_recall_top1 0.4033
  SimpleImageCaptions112 svl_tti_recall_top1 0.2939
  SimpleImageCaptions112 svl_tit_recall_top1 0.2549
  SimpleImageCaptions112 svl_tii_recall_top1 0.2126
  SimpleImageCaptions112 svl_itt_recall_top1 0.2873
  SimpleImageCaptions112 svl_iti_recall_top1 0.2201
  SimpleImageCaptions112 svl_iit_recall_top1 0.207
  SimpleImageCaptions112 svl_iii_recall_top1 0.1628


training 1 epoch: 100%|██████████| 2586/2586 [02:54<00:00, 15.79it/s]


Epoch 48 Elapsed time 207.9135
    total training loss 2.026
    extract 2.1293
    joint_embedding 0.2316
    compose2 0.7109
    compose1 0.1788


training 1 epoch:   0%|          | 0/2586 [00:00<?, ?it/s]

  CocoCapTrain text2image_recall_top1 0.3723
  SimpleImageCaptions112 text2image_recall_top1 0.3434
  SimpleImageCaptions112 svl_ttt_recall_top1 0.3013
  SimpleImageCaptions112 svl_tti_recall_top1 0.2793
  SimpleImageCaptions112 svl_tit_recall_top1 0.2481
  SimpleImageCaptions112 svl_tii_recall_top1 0.2152
  SimpleImageCaptions112 svl_itt_recall_top1 0.2847
  SimpleImageCaptions112 svl_iti_recall_top1 0.2199
  SimpleImageCaptions112 svl_iit_recall_top1 0.2114
  SimpleImageCaptions112 svl_iii_recall_top1 0.1614


training 1 epoch:  56%|█████▌    | 1453/2586 [01:38<01:13, 15.49it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

