In [1]:
import csv
import json
import os
import pickle as pkl
import pprint

import lmdb
import numpy as np
import PIL
import torchutils
from tqdm.auto import tqdm


## Outfit dataset
Outfits are saved in three splits:

```json
train: "train_no_dup.json",
valid: "valid_no_dup.json",
test: "test_no_dup.json",
```

Each outfit in the `json` file has the following keys:

```json
['name', 'views', 'items', 'image', 'likes', 'date', 'set_url', 'set_id', 'desc']
```

Each item in an outfit has the following keys:

```json
['index', 'name', 'price', 'likes', 'image', 'categoryid']
```


In [2]:
def get_item_type(item):
    """Return the zero-based type slot of an item.

    The item's ``index`` field is shifted down by one so the result can be
    used directly as a list position.
    """
    slot = item["index"]
    return slot - 1


def get_item_id(item):
    """Return the text that follows the last ``tid=`` in the item's image URL.

    When the URL contains no ``tid=`` marker, the whole URL is returned.
    """
    _, _, item_id = item["image"].rpartition("tid=")
    return item_id


def load_json(fn):
    """Load and return the JSON document stored in the file ``fn``.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's default text encoding, so non-ASCII text in the document is
    read the same way on every machine.

    Args:
        fn: Path of the JSON file to read.

    Returns:
        The decoded JSON value.
    """
    with open(fn, "r", encoding="utf-8") as f:
        return json.load(f)


In [3]:
# Root directory of the raw dataset (label files are read from "<inputDir>/label").
inputDir = "release"
# Root directory that receives every file generated by this notebook.
outputDir = "processed"

# When True, run check_reuse_images() on the items' image files.
CHECK_IMAGES = False
# When True, copy one image per item into an LMDB database.
MAKE_LMDB = False


## Load outfits

In [4]:
# Read the three splits from the label directory.
trainOutfits = load_json(f"{inputDir}/label/train_no_dup.json")
validOutfits = load_json(f"{inputDir}/label/valid_no_dup.json")
testOutfits = load_json(f"{inputDir}/label/test_no_dup.json")

# All outfits, in train / valid / test order.
allOutfits = [*trainOutfits, *validOutfits, *testOutfits]
splitSizes = [len(split) for split in (trainOutfits, validOutfits, testOutfits)]
print("Number of outfits: {} = {} + {} + {}".format(len(allOutfits), *splitSizes))


Number of outfits: 21889 = 17316 + 1497 + 3076


In [5]:
# Show the first outfit with nested containers collapsed, then its first item in full.
firstOutfit = allOutfits[0]
print("Example outfit:")
pprint.pprint(firstOutfit, depth=1)
print("Example item:")
pprint.pprint(firstOutfit["items"][0])



Example outfit:
{'date': 'One month',
 'desc': 'A fashion look from January 2017 by beebeely-look featuring Fuji, '
         'Citizens of Humanity, casual, casualoutfit, Packandgo, winterjacket '
         'and gearbest',
 'image': 'http://ak1.polyvoreimg.com/cgi/img-set/cid/214181831/id/El8a99fQ5hG4HrPFO4xqOQ/size/y.jpg',
 'items': [...],
 'likes': 394,
 'name': 'Casual',
 'set_id': '214181831',
 'set_url': 'http://www.polyvore.com/casual/set?id=214181831',
 'views': 8743}
Example item:
{'categoryid': 4495,
 'image': 'http://img2.polyvoreimg.com/cgi/img-thing?.out=jpg&size=m&tid=194508109',
 'index': 1,
 'likes': 10,
 'name': 'mock neck embroidery suede sweatshirt',
 'price': 24.0}


In [6]:
# Number of type slots; an item's type is its 1-based "index" minus one.
MAX_SIZE = 8

itemSet = set()  # unique item ids
itemImages = dict()  # item id -> image paths of every occurrence of the item
categorySet = set()  # distinct "categoryid" values
outfitSet = set()  # outfit set ids
# {set_id}_{index}: {item_id}
name2Id = dict()

# item index in item list
itemReIndex = []
itemList = [set() for _ in range(MAX_SIZE)]

num_items = 0
for outfit in allOutfits:
    set_id = outfit["set_id"]
    outfitSet.add(set_id)
    num_items += len(outfit["items"])
    for item in outfit["items"]:
        categorySet.add(item["categoryid"])
        index = item["index"]
        # the unique id of item
        item_id = get_item_id(item)
        name2Id["{}_{}".format(set_id, index)] = item_id
        itemSet.add(item_id)
        # use index as item category
        itemList[get_item_type(item)].add(item_id)
        image_path = os.path.join(f"{inputDir}/images", "{}/{}.jpg".format(set_id, index))
        itemImages.setdefault(item_id, []).append(image_path)

num_unique = len(itemSet)
print("Number of unique items: {:,}".format(num_unique))
print("Reuse ratio: {:.3f} = {:,} / {:,}".format(num_items / num_unique, num_items, num_unique))
print("Number of categories: {}".format(len(categorySet)))
print("Average number of items in an outfit: {:.2f}".format(num_items / len(allOutfits)))


Number of unique items: 111,589
Reuse ratio: 1.277 = 142,480 / 111,589
Number of categories: 380
Average number of items in an outfit: 6.51


## Create item list

In [7]:
# convert to item list
itemReIndex = []
itemList = [list(items) for items in itemList]
for i in range(MAX_SIZE):
    items = itemList[i]
    item_index = {item_id: i for i, item_id in enumerate(items)}
    itemReIndex.append(item_index)

torchutils.io.save_json(f"{outputDir}/original/items.json", itemList)


## Check items

An item image may be used by several outfits, so check that every copy has the same content.

In [8]:
# Sanity checks: every item name used by the evaluation files must belong to an
# outfit loaded above.  Name format: {set_id}_{item_index}

# Compatibility file: each line is "<label> <name> <name> ...".
with open(f"{inputDir}/label/fashion_compatibility_prediction.txt") as f:
    lines = f.readlines()
for line in lines:
    label, *names = line.split()
    for name in names:
        set_id = name.split("_")[0]
        assert set_id in outfitSet, f"unknown outfit {set_id!r} in compatibility file"

# Fill-in-the-blank file:
# {"question": [names, ...], "answers": [names, ], "blank_position": n}
# Both the question items and the candidate answers are looked up later, so
# both are validated here.
with open(f"{inputDir}/label/fill_in_blank_test.json") as f:
    data = json.load(f)
for d in data:
    for name in [*d["question"], *d["answers"]]:
        set_id = name.split("_")[0]
        assert set_id in outfitSet, f"unknown outfit {set_id!r} in fill-in-the-blank file"


In [9]:
def check_reuse_images(item_images=None):
    """Report how much the image files of a reused item differ from each other.

    For every item that has more than one image file, all of its files are
    decoded to RGB arrays and the mean absolute deviation of the copies from
    their pixel-wise average is computed.  The average of that value over all
    compared items is printed; 0 means every copy is pixel-identical.

    Args:
        item_images: Mapping from item id to the list of image paths of that
            item.  Defaults to the notebook-level ``itemImages``.

    Returns:
        None.  Results are printed.
    """
    # A bare `import PIL` does not by itself load the `PIL.Image` submodule.
    from PIL import Image

    if item_images is None:
        item_images = itemImages

    errors = []
    mismatched = 0
    for paths in tqdm(item_images.values()):
        # Items with a single image have nothing to compare against.
        if len(paths) < 2:
            continue
        arrays = []
        for fn in paths:
            with open(fn, "rb") as f:
                arrays.append(np.array(Image.open(f).convert("RGB")))
        # Copies of different size cannot be stacked; count them separately.
        if len({a.shape for a in arrays}) > 1:
            mismatched += 1
            continue
        stack = np.stack(arrays).astype(np.float64)
        # Absolute deviation: signed deviations from the mean always average to ~0.
        errors.append(np.abs(stack - stack.mean(axis=0)).mean())

    if mismatched:
        print("Items whose copies differ in size: {:,}".format(mismatched))
    if errors:
        print("Mean error: {:.3f}".format(np.mean(errors)))
    else:
        print("No comparable reused images found")


if CHECK_IMAGES:
    check_reuse_images()


## Convert all images to LMDB format

In [10]:
if MAKE_LMDB:
    dst = f"{outputDir}/features/images"
    env = lmdb.open(dst, map_size=2 ** 40)
    try:
        # One write transaction for all items: key is the item id, value is the
        # raw bytes of the first image file recorded for that item.
        with env.begin(write=True) as txn:
            for item_id, item_path in tqdm(itemImages.items()):
                with open(item_path[0], "rb") as f:
                    img_data = f.read()
                txn.put(item_id.encode("ascii"), img_data)
    finally:
        # Close the environment even when reading an image fails.
        env.close()


## Extract word information

In [11]:
# Vocabulary file: every line holds a word followed by an integer.
words = dict()
with open(f"{inputDir}/final_word_dict.txt") as f:
    for entry in f:
        word, value = entry.strip().split()
        words[word] = int(value)
# Number the words densely, following their order in the vocabulary file.
wordDict = {word: position for position, word in enumerate(words)}
print("Number of words: {}, plus 1 unkown".format(len(wordDict)))


Number of words: 2756, plus 1 unkown


In [12]:
# Bag-of-words vector per item: one counter per vocabulary word plus a final
# slot that counts every word missing from the vocabulary.
itemWords = dict()
unknown_slot = len(wordDict)
for outfit in allOutfits:
    for item in outfit["items"]:
        counts = [0] * (unknown_slot + 1)
        for token in item["name"].split():
            counts[wordDict.get(token, unknown_slot)] += 1
        itemWords[get_item_id(item)] = np.array(counts)
with open(f"{outputDir}/word_embedding.pkl", "wb") as f:
    pkl.dump(itemWords, f)


## Convert outfits to tuples

In [13]:
def convert_to_tuples(outfits):
    """Encode outfits as fixed-width integer rows.

    Each row is ``[0, size, item_0..item_{MAX_SIZE-1}, type_0..type_{MAX_SIZE-1}]``
    where ``size`` is the number of items in the outfit, ``type_k`` is the
    item's ``index`` minus one, and ``item_k`` is the item's position taken
    from ``itemReIndex[type_k]``.  Unused slots hold -1.

    Args:
        outfits: Iterable of outfit dicts, each with an ``"items"`` list.

    Returns:
        A numpy array with one row per outfit.
    """
    rows = []
    for outfit in outfits:
        members = outfit["items"]
        item_slots = [-1] * MAX_SIZE
        type_slots = [-1] * MAX_SIZE
        for slot, member in enumerate(members):
            unique_id = member["image"].split("tid=")[-1]
            category = member["index"] - 1
            position = itemReIndex[category][unique_id]
            item_slots[slot] = position
            type_slots[slot] = category
        rows.append([0, len(members), *item_slots, *type_slots])
    return np.array(rows)


In [14]:
# Encode each split with the shared per-type item indices.
trainTuples, validTuples, testTuples = (
    convert_to_tuples(split) for split in (trainOutfits, validOutfits, testOutfits)
)


In [15]:
# Write the positive tuples of every split as CSV.  The files are opened with
# newline="" as the csv module requires, so the writer's own line terminator
# is not translated a second time by the text layer.
for split_name, split_tuples in (
    ("train", trainTuples),
    ("valid", validTuples),
    ("test", testTuples),
):
    with open(f"{outputDir}/original/{split_name}_pos", "w", newline="") as f:
        csv.writer(f).writerows(split_tuples)

## Convert FITB and Negative tuples

In [16]:
def convert_compatibility(data):
    """Convert compatibility lines into positive and negative tuple arrays.

    Every line is ``"<label> <name> <name> ..."`` with names of the form
    ``{set_id}_{item_index}``.  A line becomes the row
    ``[0, size, items..., padding, types..., padding]`` where the padding
    fills items and types up to ``MAX_SIZE`` entries with -1.

    Args:
        data: Iterable of text lines.

    Returns:
        ``(eval_pos, eval_neg)``: numpy arrays holding the rows whose label
        is 1 and the rows with any other label, respectively.
    """
    buckets = {True: [], False: []}
    for record in data:
        label, *names = record.split()
        count = len(names)
        padding = [-1] * (MAX_SIZE - count)
        categories = [int(name.split("_")[1]) - 1 for name in names]
        indices = [itemReIndex[cat][name2Id[name]] for name, cat in zip(names, categories)]
        row = [0, count] + indices + padding + categories + padding
        buckets[int(label) == 1].append(row)
    eval_pos = np.array(buckets[True])
    eval_neg = np.array(buckets[False])
    return eval_pos, eval_neg

In [17]:
with open(f"{inputDir}/label/fashion_compatibility_prediction.txt") as f:
    lines = list(f)
eval_pos, eval_neg = convert_compatibility(lines)
# the positive tuples should match those in compatibility
assert (testTuples == eval_pos).all()
num_pos, num_neg = len(eval_pos), len(eval_neg)
print("Number of positive outfits: {:,}".format(num_pos))
print("Number of negative outfits: {:,}".format(num_neg))


Number of positive outfits: 3,076
Number of negative outfits: 4,000


In [18]:
# newline="" is required by the csv module so that the writer's own line
# terminator is not translated again by the text layer.
with open(f"{outputDir}/original/test_neg", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(eval_neg)

In [19]:
def convert_fitb(data):
    """Convert fill-in-the-blank questions into one tuple per candidate answer.

    Each entry of ``data`` has a ``"question"`` list of item names, an
    ``"answers"`` list of candidate names and a ``"blank_position"``.  For
    every candidate, the candidate is placed at list position
    ``blank_position - 1`` among the question items, and the row
    ``[0, size, items..., padding, types..., padding]`` is emitted, padded
    with -1 up to ``MAX_SIZE`` entries for items and for types.

    Args:
        data: Iterable of question dicts as described above.

    Returns:
        A numpy array with one row per (question, candidate) pair, in input order.
    """
    rows = []
    for entry in data:
        blank = entry["blank_position"]
        names = entry["question"]
        known_types = [int(name.split("_")[1]) - 1 for name in names]
        known_items = [itemReIndex[t][name2Id[name]] for name, t in zip(names, known_types)]
        total = len(names) + 1
        padding = [-1] * (MAX_SIZE - total)
        cut = blank - 1
        for candidate in entry["answers"]:
            cand_type = int(candidate.split("_")[-1]) - 1
            cand_item = itemReIndex[cand_type][name2Id[candidate]]
            # Splice the candidate into the question at the blank.
            items = known_items[:cut] + [cand_item] + known_items[cut:]
            types = known_types[:cut] + [cand_type] + known_types[cut:]
            rows.append([0, total] + items + padding + types + padding)
    return np.array(rows)


In [20]:
# Convert the official fill-in-the-blank questions and store them as CSV.
with open(f"{inputDir}/label/fill_in_blank_test.json") as f:
    data = json.load(f)
tuples = convert_fitb(data)

# newline="" is required by the csv module so that the writer's own line
# terminator is not translated again by the text layer.
with open(f"{outputDir}/original/test_fitb", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(tuples)


## Tuples from type-aware embedding

In [21]:
# Output directory for the tuples built from the hard-negative label files.
dst_dir = f"{outputDir}/hardneg"
# Directory the hard-negative label files are read from.
src_dir = f"{inputDir}/maryland_polyvore_hardneg/"

# Save the same item list next to these tuples; their item indices refer to it.
torchutils.io.save_json(f"{dst_dir}/items.json", itemList)


In [22]:
splits = ["train", "valid", "test"]
outfits = {"train": trainTuples, "valid": validTuples, "test": testTuples}
for phase in splits:
    fn = os.path.join(src_dir, "compatibility_{}.txt".format(phase))
    with open(fn) as f:
        data = list(f)
    pos_tuples, neg_tuples = convert_compatibility(data)
    # The positives of each file must reproduce the tuples of the same split.
    expected = outfits[phase]
    assert (pos_tuples == expected).all()
    neg_path = os.path.join(dst_dir, "{}_neg".format(phase))
    pos_path = os.path.join(dst_dir, "{}_pos".format(phase))
    torchutils.io.save_csv(neg_path, neg_tuples)
    torchutils.io.save_csv(pos_path, pos_tuples)
    print("Number of positive outfits ({}): {:,}".format(phase, len(pos_tuples)))
    print("Number of negative outfits ({}): {:,}".format(phase, len(neg_tuples)))


Number of positive outfits (train): 17,316
Number of negative outfits (train): 17,192
Number of positive outfits (valid): 1,497
Number of negative outfits (valid): 1,467
Number of positive outfits (test): 3,076
Number of negative outfits (test): 3,005


In [23]:
splits = ["train", "valid", "test"]
for phase in splits:
    question_file = os.path.join(src_dir, "fill_in_blank_{}.json".format(phase))
    data = torchutils.io.load_json(question_file)
    tuples = convert_fitb(data)
    tuple_file = os.path.join(dst_dir, "{}_fitb".format(phase))
    torchutils.io.save_csv(tuple_file, tuples)
    # The question count is the row count divided by 4.
    num_questions = len(tuples) // 4
    print("Number of questions ({}): {:,}".format(phase, num_questions))


Number of questions (train): 17,316
Number of questions (valid): 1,497
Number of questions (test): 3,076
