In [1]:
import argparse
import os
import time
import sys

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import numpy as np
from torch.utils.data import DataLoader
import torch.optim as optim
import fastText

os.environ['CUDA_VISIBLE_DEVICES']="4,5,6,7"

In [2]:
from models import models
from dataset import openimages
from utils.loss import HardNegativeContrastiveLoss

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
class AverageMeter(object):

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [5]:
def train(train_loader, model, criterion, optimizer, epoch, print_freq=1000):
    #amp_handle = amp.init()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    model = model.train()
    print("Start training")
    end = time.time()
    for i, (imgs, caps) in enumerate(train_loader):
        if i%2 == 1:
                print("%2.2f"% (i/len(train_loader)*100), '\%', end='\r')
        input_imgs, target = imgs.cuda(), caps.cuda()
        

        data_time.update(time.time() - end)

        optimizer.zero_grad()
        
        output_imgs = model(input_imgs)
        
        
        loss = criterion(output_imgs, target)
        
        #with amp_handle.scale_loss(loss, optimizer) as scaled_loss:
        #    scaled_loss.backward()
        loss.backward()
        optimizer.step()
        
        losses.update(loss.item(), imgs.size(0))

        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0 or i == (len(train_loader) - 1):
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses))

    return losses.avg, batch_time.avg, data_time.avg

In [6]:
def validate(val_loader, model, criterion, print_freq=1000):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    model = model.eval()

    imgs_enc = list()
    caps_enc = list()
    end = time.time()
    for i, (imgs, caps, lengths) in enumerate(val_loader):

        input_imgs, input_caps = imgs.cuda(), caps.cuda()

        # measure data loading time
        data_time.update(time.time() - end)

        with torch.no_grad():
            output_imgs = model(input_imgs)
            loss = criterion(output_imgs, input_caps)

        imgs_enc.append(output_imgs.cpu().data.numpy())
        caps_enc.append(output_caps.cpu().data.numpy())
        losses.update(loss.item(), imgs.size(0))

        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0 or i == (len(val_loader) - 1):
            print('Data: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      i, len(val_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses))

    recall  = eval_recall(imgs_enc, caps_enc)
    print(recall)
    return losses.avg, batch_time.avg, data_time.avg, recall

In [7]:
normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

prepro = transforms.Compose([
    transforms.RandomResizedCrop(224),

    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    normalize,
])

prepro_val = transforms.Compose([
    transforms.Resize((350, 350)),
    transforms.ToTensor(),
    normalize,
])

In [8]:
m = nn.DataParallel(models.ImageProjection().train().cuda())

In [9]:
for params in m.parameters():
    params.requires_grad=False

In [10]:
for params in m.module.projection.parameters():
    params.requires_grad=True

In [11]:
def collate_embeds(data):
    images, targets = zip(*data)
    images = torch.stack(images, 0)
    targets = torch.Tensor(np.stack(targets, 0))

    return images, targets

In [12]:
embed = fastText.load_model("/data/m.portaz/wiki.en.bin")
train_dataset = openimages.OpenImagesText(image_dir="/data/datasets/openimages/images/train/", 
                          dataset_file="/data/datasets/openimages/train-words.csv",
                          embeddings=embed, 
                          transform=prepro, random=0.5)

Reading dataset file
Done reading  4593616  lines.


In [20]:
train_loader = DataLoader(train_dataset, batch_size=3072, shuffle=True, drop_last=True,
                            num_workers=20, collate_fn=collate_embeds, pin_memory=True)

In [14]:
opti = optim.Adam(filter(lambda p: p.requires_grad, m.parameters()), lr=0.001)

In [16]:
criterion = HardNegativeContrastiveLoss().cuda()

In [17]:
train(train_loader, m, criterion, opti, 0, print_freq=50)

Start training
Epoch: [0][0/2191]	Time 58.837 (58.837)	Data 32.222 (32.222)	Loss 5065.3916 (5065.3916)	
Epoch: [0][50/2191]	Time 1.888 (2.689)	Data 0.115 (0.753)	Loss 8174.1582 (9939.4797)	
Epoch: [0][100/2191]	Time 1.354 (2.066)	Data 0.116 (0.443)	Loss 7127.2090 (8879.8595)	
Epoch: [0][150/2191]	Time 1.324 (1.865)	Data 0.117 (0.339)	Loss 9998.9590 (9019.7205)	
Epoch: [0][200/2191]	Time 1.328 (1.758)	Data 0.115 (0.287)	Loss 8123.5645 (8971.5343)	
Epoch: [0][250/2191]	Time 1.327 (1.692)	Data 0.116 (0.255)	Loss 6510.5483 (8671.2556)	
Epoch: [0][300/2191]	Time 1.333 (1.649)	Data 0.116 (0.233)	Loss 6361.2222 (8354.4636)	
Epoch: [0][350/2191]	Time 1.327 (1.615)	Data 0.116 (0.217)	Loss 6979.4229 (8109.9479)	
Epoch: [0][400/2191]	Time 1.858 (1.595)	Data 0.116 (0.205)	Loss 8735.3262 (8153.7529)	
Epoch: [0][450/2191]	Time 1.328 (1.577)	Data 0.116 (0.196)	Loss 8741.9980 (8163.0154)	
Epoch: [0][500/2191]	Time 1.459 (1.560)	Data 0.247 (0.189)	Loss 7970.5630 (8180.6867)	
Epoch: [0][550/2191]	Time 1

(7745.976734546155, 1.4532104864995374, 0.1383018260783444)

In [18]:
opti = optim.Adam(filter(lambda p: p.requires_grad, m.parameters()), lr=0.0005)

In [19]:
train(train_loader, m, criterion, opti, 0, print_freq=50)

Start training
Epoch: [0][0/2191]	Time 21.584 (21.584)	Data 20.291 (20.291)	Loss 6896.0942 (6896.0942)	
Epoch: [0][50/2191]	Time 1.672 (2.120)	Data 0.117 (0.745)	Loss 4490.7168 (5408.1094)	
Epoch: [0][100/2191]	Time 1.332 (1.780)	Data 0.119 (0.440)	Loss 4052.2847 (4848.7612)	
Epoch: [0][150/2191]	Time 1.326 (1.660)	Data 0.116 (0.336)	Loss 3058.9360 (4495.5497)	
Epoch: [0][200/2191]	Time 1.331 (1.602)	Data 0.116 (0.283)	Loss 2993.4958 (4163.4350)	
Epoch: [0][250/2191]	Time 1.330 (1.565)	Data 0.117 (0.251)	Loss 2312.2651 (3845.6544)	
Epoch: [0][300/2191]	Time 1.348 (1.540)	Data 0.116 (0.229)	Loss 2224.4983 (3577.2567)	
Epoch: [0][350/2191]	Time 1.423 (1.522)	Data 0.115 (0.213)	Loss 2083.6863 (3367.7276)	
Epoch: [0][400/2191]	Time 1.754 (1.510)	Data 0.116 (0.203)	Loss 2368.5754 (3219.6241)	
Epoch: [0][450/2191]	Time 1.339 (1.500)	Data 0.120 (0.193)	Loss 2379.7559 (3117.2281)	
Epoch: [0][500/2191]	Time 1.328 (1.492)	Data 0.116 (0.186)	Loss 3292.6516 (3059.0133)	
Epoch: [0][550/2191]	Time 1

(3886.87468471205, 1.4323339348022168, 0.13734619381328741)

In [21]:
for param in m.parameters():
    param.requires_grad = True

# Keep the first layer of resnet frozen
for i in range(0, 6):
    for param in m.module.base_layer[i].parameters():
        param.requires_grad = False

In [28]:
train_loader = DataLoader(train_dataset, batch_size=450, shuffle=True, drop_last=True,
                            num_workers=20, collate_fn=collate_embeds, pin_memory=True)
opti = optim.Adam(filter(lambda p: p.requires_grad, m.module.parameters()), lr=0.00005)

In [None]:
for _ in range(5):
    train(train_loader, m, criterion, opti, i, print_freq=100)
    i += 1

Start training
Epoch: [8][0/10208]	Time 15.876 (15.876)	Data 14.522 (14.522)	Loss 249.2530 (249.2530)	
Epoch: [8][100/10208]	Time 0.724 (0.927)	Data 0.026 (0.171)	Loss 226.3943 (239.8907)	
Epoch: [8][200/10208]	Time 0.724 (0.868)	Data 0.026 (0.099)	Loss 228.6726 (235.6185)	
Epoch: [8][300/10208]	Time 0.725 (0.838)	Data 0.026 (0.075)	Loss 230.6595 (233.0092)	
Epoch: [8][400/10208]	Time 0.725 (0.822)	Data 0.026 (0.063)	Loss 225.4995 (231.3337)	
Epoch: [8][500/10208]	Time 0.945 (0.813)	Data 0.026 (0.056)	Loss 217.3611 (229.5482)	
Epoch: [8][600/10208]	Time 0.730 (0.809)	Data 0.026 (0.051)	Loss 215.5688 (228.3176)	
Epoch: [8][700/10208]	Time 0.731 (0.803)	Data 0.026 (0.047)	Loss 227.7986 (227.4418)	
Epoch: [8][800/10208]	Time 0.763 (0.801)	Data 0.025 (0.045)	Loss 213.1459 (226.4875)	
Epoch: [8][900/10208]	Time 0.735 (0.798)	Data 0.026 (0.043)	Loss 219.0038 (225.6315)	
Epoch: [8][1000/10208]	Time 0.828 (0.797)	Data 0.027 (0.041)	Loss 214.3507 (224.8640)	
Epoch: [8][1100/10208]	Time 0.764 (0

In [None]:
embed = fastText.load_model("/data/m.portaz/wiki.en.bin")
train_dataset = openimages.OpenImagesText(image_dir="/data/datasets/openimages/images/train/", 
                          dataset_file="/data/datasets/openimages/train-words.csv",
                          embeddings=embed, 
                          transform=prepro, random=0.1)