
Updated code to reflect release version of data format (#6)

Updated code to reflect release version of data format. Fixed bug in disjoint initialization.
BryanPlummer authored and mvasil committed Dec 24, 2018
1 parent 13fdaf1 commit 299b426e38b92b4441534e025bf84caa0ea3155b
Showing with 202 additions and 159 deletions.
  1. 0 LICENSE
  2. +16 −5 README.md
  3. 0 Resnet_18.py
  4. +80 −79 main.py
  5. +99 −70 polyvore_outfits.py
  6. +6 −4 tripletnet.py
  7. +1 −1 type_specific_network.py
0 LICENSE 100755 → 100644
No changes.
@@ -13,21 +13,32 @@ This code was tested on an Ubuntu 16.04 system using Pytorch version 0.1.12. It


## Usage
You can download the Polyvore Outfits dataset including the splits and questions for the compatibility and fill-in-the-blank tasks from [here](). After unpacking the dataset make any necessary updates to the data root directory in polyvore_outfits.py.

Afterwards, you can train the model using `python main.py`. You can see a listing and description of many tuneable parameters with:
You can download the Polyvore Outfits dataset, including the splits and questions for the compatibility and fill-in-the-blank tasks, from [here (6G)](https://drive.google.com/file/d/13-J4fAPZahauaGycw3j_YvbAHO7tOTW5/view?usp=sharing). The code assumes you unpacked it into a directory called `data`; if you choose a different directory, simply set the `--datadir` argument. You can see a listing and description of the model options with:

```sh
python main.py --help
```
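
For example, if you unpacked the dataset somewhere other than `./data`, point the code at it (the path below is just a placeholder):

```sh
python main.py --datadir /path/to/your/data
```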

For example, to learn masks used for the projections for each type specific embeddings, rather than the default which creates fixed masks, you would use:
## Using a pre-trained model

We have provided a pre-trained model for the nondisjoint data split which you can download [here (11M)](https://drive.google.com/file/d/1JrRgM_EaLQqLw1CNjM65XnTm9rZyLRgj/view?usp=sharing). This model learns diagonal projections from the general embedding to a type-specific compatibility space, which is L2-normalized after applying the projection. You can test this model using:

```sh
python main.py --test --l2_embed --resume runs/nondisjoint_l2norm/model_best.pth.tar
```

This code includes some minor modifications that result in better performance than the version used for our camera-ready paper. For example, our pre-trained model should provide a compatibility AUC of 0.88 and a fill-in-the-blank accuracy of 57.6, which is a little better than the 0.86 AUC / 55.3 accuracy of our best model reported in the paper.

## Training a new model

To train the pre-trained model above we used the following command.

```sh
python main.py --name {your experiment name} --learned
python main.py --name {your experiment name} --learned --l2_embed
```

By default the code outputs the results on the test set after training. However, if you wanted to re-run the test for many settings you have to use the same flags during testing as you had during training. For example, if you trained with the `--use_fc` to train fully connected type-specific embeddings rather than a mask, at test time you would use:
By default the code outputs the results on the test set after training. However, if you want to re-run the test for many settings, you have to use the same flags during testing as you did during training. For example, if you trained with the `--use_fc` flag to learn fully connected type-specific embeddings rather than a (diagonal) mask, at test time you would use:

```sh
python main.py --test --use_fc --resume runs/{your experiment name}/model_best.pth.tar
0 Resnet_18.py 100755 → 100644
No changes.
159 main.py 100755 → 100644
@@ -3,17 +3,24 @@
import os
import sys
import shutil
import json

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
import torch.backends.cudnn as cudnn

import Resnet_18
from polyvore_outfits import TripletImageLoader
from tripletnet import Tripletnet
import numpy as np
import Resnet_18
from type_specific_network import TypeSpecificNet


# Training settings
parser = argparse.ArgumentParser(description='Fashion Compatibility Example')
parser.add_argument('--batch-size', type=int, default=256, metavar='N',
@@ -56,17 +63,17 @@
help='L2 normalize the output of the type specific embeddings')
parser.add_argument('--learned_metric', dest='learned_metric', action='store_true', default=False,
help='Learn a distance metric rather than euclidean distance')
parser.add_argument('--margin', type=float, default=0.2, metavar='M',
parser.add_argument('--margin', type=float, default=0.3, metavar='M',
help='margin for triplet loss (default: 0.3)')
parser.add_argument('--embed_loss', type=float, default=5e-4, metavar='M',
help='parameter for loss for embedding norm')
parser.add_argument('--mask_loss', type=float, default=5e-4, metavar='M',
help='parameter for loss for mask norm')
parser.add_argument('--vse_loss', type=float, default=5e-4, metavar='M',
parser.add_argument('--vse_loss', type=float, default=5e-3, metavar='M',
help='parameter for loss for the visual-semantic embedding')
parser.add_argument('--sim_t_loss', type=float, default=5e-1, metavar='M',
parser.add_argument('--sim_t_loss', type=float, default=5e-5, metavar='M',
help='parameter for loss for text-text similarity')
parser.add_argument('--sim_i_loss', type=float, default=5e-1, metavar='M',
parser.add_argument('--sim_i_loss', type=float, default=5e-5, metavar='M',
help='parameter for loss for image-image similarity')

def main():
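
For orientation, here is a minimal, self-contained sketch (not code from this commit) of how a margin like this is consumed by `torch.nn.MarginRankingLoss`, which this script instantiates with `margin=args.margin`:

```python
import torch
import torch.nn as nn

criterion = nn.MarginRankingLoss(margin=0.3)

dist_pos = torch.rand(8)  # anchor-to-positive distances
dist_neg = torch.rand(8)  # anchor-to-negative distances
target = torch.ones(8)    # +1 means the first argument should rank larger

# mean(max(0, margin - (dist_neg - dist_pos))): zero once each negative is
# at least `margin` farther from the anchor than the positive
loss = criterion(dist_neg, dist_pos, target)
```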
@@ -76,23 +83,35 @@ def main():
torch.manual_seed(args.seed)
if args.cuda:
torch.cuda.manual_seed(args.seed)

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])

fn = os.path.join(args.datadir, 'polyvore_outfits', 'polyvore_item_metadata.json')
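# per-item metadata (e.g. title, description, semantic category) from the released dataset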
meta_data = json.load(open(fn, 'r'))
text_feature_dim = 6000
kwargs = {'num_workers': 8, 'pin_memory': True} if args.cuda else {}
test_loader = torch.utils.data.DataLoader(
TripletImageLoader(args, 'test',
transform=transforms.Compose([
transforms.Scale(112),
transforms.CenterCrop(112),
transforms.ToTensor(),
normalize,
])),
TripletImageLoader(args, 'test', meta_data,
transform=transforms.Compose([
transforms.Scale(112),
transforms.CenterCrop(112),
transforms.ToTensor(),
normalize,
])),
batch_size=args.batch_size, shuffle=False, **kwargs)

model = Resnet_18.resnet18(pretrained=True, embedding_size=args.dim_embed)
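# wrap the general embedding with one type-specific projection per pair of item types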
csn_model = TypeSpecificNet(args, model, len(test_loader.dataset.typespaces))

criterion = torch.nn.MarginRankingLoss(margin = args.margin)
tnet = Tripletnet(args, csn_model, text_feature_dim, criterion)
if args.cuda:
tnet.cuda()

train_loader = torch.utils.data.DataLoader(
TripletImageLoader(args, 'train',
TripletImageLoader(args, 'train', meta_data,
text_dim=text_feature_dim,
transform=transforms.Compose([
transforms.Scale(112),
transforms.CenterCrop(112),
@@ -102,23 +121,14 @@ def main():
])),
batch_size=args.batch_size, shuffle=True, **kwargs)
val_loader = torch.utils.data.DataLoader(
TripletImageLoader(args, 'valid',
TripletImageLoader(args, 'valid', meta_data,
transform=transforms.Compose([
transforms.Scale(112),
transforms.CenterCrop(112),
transforms.ToTensor(),
normalize,
])),
batch_size=args.batch_size, shuffle=False, **kwargs)

text_dim = train_loader.dataset.text_feat_dim
n_conditions = len(train_loader.dataset.typespaces)
model = Resnet_18.resnet18(pretrained=True, embedding_size=args.dim_embed)
tsn_model = TypeSpecificNet(args, model, n_conditions)
criterion = torch.nn.MarginRankingLoss(margin = args.margin)
tnet = Tripletnet(args, tsn_model, text_dim, criterion)
if args.cuda:
tnet.cuda()

best_acc = 0
# optionally resume from a checkpoint
@@ -134,7 +144,7 @@ def main():
else:
print("=> no checkpoint found at '{}'".format(args.resume))

cudnn.benchmark = True
if args.test:
test_acc = test(test_loader, tnet)
sys.exit()
@@ -148,7 +158,7 @@ def main():
# update learning rate
adjust_learning_rate(optimizer, epoch)
# train for one epoch
train(train_loader, tnet, optimizer, epoch)
train(train_loader, tnet, criterion, optimizer, epoch)
# evaluate on validation set
acc = test(val_loader, tnet)

@@ -161,48 +171,55 @@ def main():
'best_prec1': best_acc,
}, is_best)

checkpoint = torch.load('runs/%s/'%(args.name) + 'model_best.pth.tar')
tnet.load_state_dict(checkpoint['state_dict'])
test_acc = test(test_loader, tnet)

def train(train_loader, tnet, optimizer, epoch):
def train(train_loader, tnet, criterion, optimizer, epoch):
losses = AverageMeter()
accs = AverageMeter()
emb_norms = AverageMeter()
mask_norms = AverageMeter()

# switch to train mode
tnet.train()

# on the train split the data loader returns text and image data
for batch_idx, (img1, desc1, has_text1, img2, desc2, has_text2, img3, desc3, has_text3, condition) in enumerate(train_loader):
anchor = TrainData(img1, desc1, has_text1, condition)
close = TrainData(img2, desc2, has_text2)
far = TrainData(img3, desc3, has_text3)
# compute output
acc, loss_triplet, loss_embed, loss_mask, loss_vse, loss_sim_t, loss_sim_i = tnet(anchor, far, close)

# encourages similar text inputs (sim_t) and image inputs (sim_i) to
# compute output
acc, loss_triplet, loss_mask, loss_embed, loss_vse, loss_sim_t, loss_sim_i = tnet(anchor, far, close)

# encourages similar text inputs (sim_t) and image inputs (sim_i) to
# embed close to each other, images operate on the general embedding
loss_sim = args.sim_t_loss * loss_sim_t + args.sim_i_loss * loss_sim_i

# cross-modal similarity regularizer on the general embedding
loss_vse_w = args.vse_loss * loss_vse

# sparsity and l2 regularizer
loss_reg = args.embed_loss * loss_embed + args.mask_loss * loss_mask

loss = loss_triplet + loss_reg + loss_vse_w + loss_sim

loss = loss_triplet + loss_reg
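# add the optional loss terms only when their weights are nonzero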
if args.vse_loss > 0:
loss += loss_vse_w
if args.sim_t_loss > 0 or args.sim_i_loss > 0:
loss += loss_sim

num_items = len(anchor)
# measure accuracy and record loss
losses.update(loss_triplet.data[0], num_items)
accs.update(acc.data[0], num_items)
emb_norms.update(loss_embed.data[0])
mask_norms.update(loss_mask.data[0])

# compute gradient and do optimizer step
optimizer.zero_grad()
loss.backward()
optimizer.step()

if loss == loss:
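# NaN != NaN, so a NaN loss fails this check and the update is skipped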
loss.backward()
optimizer.step()

if batch_idx % args.log_interval == 0:
print('Train Epoch: {} [{}/{}]\t'
@@ -213,6 +230,7 @@ def train(train_loader, tnet, optimizer, epoch):
losses.val, losses.avg,
100. * accs.val, 100. * accs.avg, emb_norms.val, emb_norms.avg))


def test(test_loader, tnet):
# switch to evaluation mode
tnet.eval()
@@ -224,16 +242,16 @@ def test(test_loader, tnet):
images = images.cuda()
images = Variable(images)
embeddings.append(tnet.embeddingnet(images).data)

embeddings = torch.cat(embeddings)
metric = tnet.metric_branch
auc = test_loader.dataset.test_compatibility(embeddings, metric)
acc = test_loader.dataset.test_fitb(embeddings, metric)
total = auc + acc
print('\n{} set: Compat AUC: {:.2f} FITB: {:.1f}\n'.format(
test_loader.dataset.split,
round(auc, 2), round(acc * 100, 1)))

return total
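
`test_compatibility` and `test_fitb` are implemented in `polyvore_outfits.py` rather than in this file. As a rough, hypothetical sketch of the compatibility task's scoring idea, assuming an outfit is scored by its average pairwise item distance and AUC is computed over compatible/incompatible outfit labels:

```python
import itertools

import numpy as np
from sklearn.metrics import roc_auc_score

def outfit_score(item_embeds):
    """Negative mean pairwise distance: higher means more compatible."""
    pairs = itertools.combinations(item_embeds, 2)
    return -np.mean([np.linalg.norm(a - b) for a, b in pairs])

# toy data: each outfit is an (n_items, dim) array; label 1 = compatible
outfits = [np.random.rand(4, 64), np.random.rand(3, 64)]
labels = [1, 0]
auc = roc_auc_score(labels, [outfit_score(o) for o in outfits])
```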

def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
@@ -246,47 +264,13 @@ def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
if is_best:
shutil.copyfile(filename, 'runs/%s/'%(args.name) + 'model_best.pth.tar')

class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()

def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0

def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count

def convert_train_data(images, text, has_text, conditions = None):
has_text = has_text.float()
if args.cuda:
images, text, has_text = images.cuda(), text.cuda(), has_text.cuda()
images, text, has_text = Variable(images), Variable(text), Variable(has_text)

data = {'images' : images, 'text' : text, 'has_text' : has_text}

if conditions is not None:
if args.cuda:
conditions = conditions.cuda()
conditions = Variable(conditions)

data['conditions'] = conditions

return images, text, has_text, conditions

class TrainData():
def __init__(self, images, text, has_text, conditions = None):
has_text = has_text.float()
if args.cuda:
images, text, has_text = images.cuda(), text.cuda(), has_text.cuda()
images, text, has_text = Variable(images), Variable(text), Variable(has_text)

if conditions is not None and not args.use_fc:
if args.cuda:
conditions = conditions.cuda()
@@ -297,10 +281,27 @@ def __init__(self, images, text, has_text, conditions = None):
self.text = text
self.has_text = has_text
self.conditions = conditions

def __len__(self):
return self.images.size(0)

class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()

def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0

def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count

def adjust_learning_rate(optimizer, epoch):
"""Sets the learning rate to the initial LR decayed by 10 every 30 epochs"""
lr = args.lr * ((1 - 0.015) ** epoch)
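
The diff cuts this function off here; a standard completion (a sketch, not necessarily the exact body in this commit) applies the decayed rate to every parameter group:

```python
def adjust_learning_rate(optimizer, epoch):
    """Exponentially decays the learning rate by 1.5% per epoch."""
    # `args` is the module-level argparse namespace used throughout main.py
    lr = args.lr * ((1 - 0.015) ** epoch)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
```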
