In [None]:
import os

import numpy as np
import pandas as pd
import torchutils


In [None]:
# Locations and switches shared by every cell below.
inputDir = "release"  # root directory the raw dataset files are read from
dataSet = "tuples_630"  # sub-directory of inputDir to process; also used in output paths
outputDir = "processed"  # root directory for converted files (and the image feature store)
saveFile = False  # when False, later cells still compute results but skip writing them

# The output directory is created up front, regardless of saveFile.
os.makedirs(f"{outputDir}/original/{dataSet}", exist_ok=True)

In [None]:
def load_image_list(fn):
    """Return the whitespace-separated tokens of the text file ``fn`` as a list.

    Leading/trailing whitespace is ignored and any run of whitespace (spaces,
    tabs, newlines) separates two entries, so an empty file yields ``[]``.
    """
    with open(fn) as handle:
        content = handle.read()
    return content.strip().split()


# One image list per category file, kept in the order top, bottom, shoe so that
# image_list[0] is the "top" list, image_list[1] the "bottom" list, and so on.
image_list_paths = (
    f"{inputDir}/{dataSet}/image_list_top",
    f"{inputDir}/{dataSet}/image_list_bottom",
    f"{inputDir}/{dataSet}/image_list_shoe",
)
image_list = [load_image_list(list_path) for list_path in image_list_paths]

# Optionally persist the three lists together as a single JSON document.
if saveFile:
    torchutils.io.save_json(f"{outputDir}/original/{dataSet}/items.json", image_list)

In [None]:
# Reader over the image store at `{outputDir}/features/images`; the next cell calls it
# with an entry of `image_list` to fetch one image.
# NOTE(review): presumably an LMDB database produced by a separate step; nothing in
# this notebook creates it, so confirm it exists before running.
imageReader = torchutils.data.ImageLMDBReader(f"{outputDir}/features/images")

In [None]:
# Sanity check: fetch the first entry of the first image list and report its array shape.
first_image_id = image_list[0][0]
image = np.array(imageReader(first_image_id))
print("Image shape: {}".format(image.shape))

In [None]:
def rearrange(items, types):
    """Pack the valid (item, type) pairs to the front and pad the rest with ``-1``.

    A pair is dropped when its type equals ``-1``; the relative order of the
    remaining pairs is kept. Both halves are padded with ``-1`` up to
    ``len(items)``.

    Returns:
        A single flat list: the padded items followed by the padded types.
    """
    kept = [(item, kind) for item, kind in zip(items, types) if kind != -1]
    padding = [-1] * (len(items) - len(kept))
    kept_items = [item for item, _ in kept]
    kept_types = [kind for _, kind in kept]
    return kept_items + padding + kept_types + padding


def convert_tuples(data: np.ndarray) -> np.ndarray:
    """Convert raw tuples into a fixed-width ``[uidx, size, items..., types...]`` table.

    Args:
        data: 2-D integer array. Column 0 is copied through as the first output
            column; the remaining 3 or 4 columns are item slots in which ``-1``
            marks an empty slot.

    Returns:
        Array of shape ``(n, 2 + 2 * m)`` for ``m`` item columns. Each row holds,
        in order: column 0 of the input, the number of non-empty slots, the ``m``
        item values with the non-empty ones moved to the front (original order
        kept) and ``-1`` behind them, then the ``m`` matching type indices laid
        out the same way. Type indices are ``0, 1, 2`` for three item columns
        and ``0, 0, 1, 2`` for four. An input with zero rows yields an empty
        array with the same number of columns.

    Raises:
        ValueError: if ``data`` does not have exactly 3 or 4 item columns.
    """
    uidx, tuples = data[:, 0], data[:, 1:]
    n, m = tuples.shape

    # Type index of each item column, keyed by the number of item columns.
    slot_types = {3: [0, 1, 2], 4: [0, 0, 1, 2]}
    if m not in slot_types:
        # Fail loudly: other widths would either broadcast silently into a wrong
        # result (m == 1) or die inside np.where with an unrelated message.
        raise ValueError(
            "convert_tuples expects 3 or 4 item columns after the first column, "
            "got {}".format(m)
        )

    empty = tuples == -1
    # Empty slots carry -1 as their type as well (taken from `tuples` itself).
    types = np.where(empty, tuples, np.array(slot_types[m]))
    size = np.sum(~empty, axis=1)

    # A stable sort on the "is empty" flag moves filled slots to the front without
    # reordering them; the trailing entries are all -1 in both items and types.
    order = np.argsort(empty, axis=1, kind="stable")
    packed_items = np.take_along_axis(tuples, order, axis=1)
    packed_types = np.take_along_axis(types, order, axis=1)

    return np.concatenate(
        [uidx[:, None], size[:, None], packed_items, packed_types], axis=1
    )



In [None]:
# Training split: the positive file is processed first, then the negative one, using
# the same read -> convert -> (optional) save steps. After the loop `data` and
# `converted` hold the negative split.
for source_path, target_path in (
    (
        f"{inputDir}/{dataSet}/tuples_train_posi",
        f"{outputDir}/original/{dataSet}/train_pos",
    ),
    (
        f"{inputDir}/{dataSet}/tuples_train_nega",
        f"{outputDir}/original/{dataSet}/train_neg",
    ),
):
    data = np.array(pd.read_csv(source_path, dtype=np.int64))
    converted = convert_tuples(data)
    if saveFile:
        torchutils.io.save_csv(target_path, converted)


In [None]:
# Validation split: the positive file is processed first, then the negative one, using
# the same read -> convert -> (optional) save steps. After the loop `data` and
# `converted` hold the negative split.
for source_path, target_path in (
    (
        f"{inputDir}/{dataSet}/tuples_val_posi",
        f"{outputDir}/original/{dataSet}/valid_pos",
    ),
    (
        f"{inputDir}/{dataSet}/tuples_val_nega",
        f"{outputDir}/original/{dataSet}/valid_neg",
    ),
):
    data = np.array(pd.read_csv(source_path, dtype=np.int64))
    converted = convert_tuples(data)
    if saveFile:
        torchutils.io.save_csv(target_path, converted)


In [None]:
# Test split: the positive file is processed first, then the negative one, using
# the same read -> convert -> (optional) save steps. After the loop `data` and
# `converted` hold the negative split.
for source_path, target_path in (
    (
        f"{inputDir}/{dataSet}/tuples_test_posi",
        f"{outputDir}/original/{dataSet}/test_pos",
    ),
    (
        f"{inputDir}/{dataSet}/tuples_test_nega",
        f"{outputDir}/original/{dataSet}/test_neg",
    ),
):
    data = np.array(pd.read_csv(source_path, dtype=np.int64))
    converted = convert_tuples(data)
    if saveFile:
        torchutils.io.save_csv(target_path, converted)


In [None]:
# Fill-in-the-blank test set: each row packs `num_answers` candidate tuples side by
# side. The first `num_columns` columns are handled as the positive tuple and the
# remaining columns as the negative ones.
num_answers = 4
data = np.array(pd.read_csv(f"{inputDir}/{dataSet}/fill_in_blank_test", dtype=np.int64))
num_questions, total_columns = data.shape
num_columns = total_columns // num_answers

# Slice out the positive tuple and unfold the negatives to one tuple per row
# (the negatives of one question stay on consecutive rows), then convert both.
negatives_per_question = num_answers - 1
pos_raw = data[:, :num_columns]
neg_raw = data[:, num_columns:].reshape(
    (num_questions * negatives_per_question, num_columns)
)
pos, neg = convert_tuples(pos_raw), convert_tuples(neg_raw)

# All candidates in a single table, one tuple per row. `converted` is built here but
# not written below; only the pos/neg split is saved.
data = data.reshape((num_questions * num_answers, -1))
converted = convert_tuples(data)

if saveFile:
    fitb_outputs = {
        f"{outputDir}/original/{dataSet}/test_pos_fitb": pos,
        f"{outputDir}/original/{dataSet}/test_neg_fitb": neg,
    }
    for target_path, table in fitb_outputs.items():
        torchutils.io.save_csv(target_path, table)
